memplex 3.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- memnex/__init__.py +31 -0
- memnex/__main__.py +6 -0
- memnex/_plugin/.claude-plugin/plugin.json +24 -0
- memnex/_plugin/.mcp.json +9 -0
- memnex/_plugin/__init__.py +0 -0
- memnex/_plugin/hooks/hooks.json +43 -0
- memnex/_plugin/scripts/hook-runner.py +166 -0
- memnex/_plugin/skills/mem-explore/SKILL.md +83 -0
- memnex/_plugin/skills/mem-manage/SKILL.md +92 -0
- memnex/_plugin/skills/mem-search/SKILL.md +85 -0
- memnex/_plugin/skills/mem-write/SKILL.md +78 -0
- memnex/adapters/__init__.py +14 -0
- memnex/adapters/claude_skill.py +169 -0
- memnex/adapters/cli.py +525 -0
- memnex/adapters/http_api.py +314 -0
- memnex/adapters/mcp_server.py +448 -0
- memnex/compaction.py +563 -0
- memnex/config.py +366 -0
- memnex/core/__init__.py +13 -0
- memnex/core/associator/__init__.py +8 -0
- memnex/core/associator/domain_classifier.py +75 -0
- memnex/core/associator/entity_aligner.py +127 -0
- memnex/core/associator/ref_linker.py +197 -0
- memnex/core/associator/term_mapper.py +77 -0
- memnex/core/dictionaries/__init__.py +50 -0
- memnex/core/engine.py +667 -0
- memnex/core/extractors/__init__.py +15 -0
- memnex/core/extractors/docx.py +97 -0
- memnex/core/extractors/image.py +233 -0
- memnex/core/extractors/markdown.py +139 -0
- memnex/core/extractors/pdf.py +133 -0
- memnex/core/extractors/vision_mapper.py +131 -0
- memnex/core/handlers/__init__.py +7 -0
- memnex/core/handlers/clipboard.py +40 -0
- memnex/core/handlers/file_handler.py +62 -0
- memnex/core/handlers/url_handler.py +132 -0
- memnex/llm/__init__.py +25 -0
- memnex/llm/enhancer.py +226 -0
- memnex/llm/fallback_chain.py +87 -0
- memnex/llm/injection_guard.py +178 -0
- memnex/llm/provider.py +130 -0
- memnex/llm/providers/__init__.py +22 -0
- memnex/llm/providers/anthropic.py +135 -0
- memnex/llm/providers/local.py +135 -0
- memnex/llm/providers/rule_based.py +68 -0
- memnex/llm/sanitizer.py +67 -0
- memnex/models/__init__.py +68 -0
- memnex/models/feedback.py +42 -0
- memnex/models/graph.py +33 -0
- memnex/models/memory.py +102 -0
- memnex/models/misc.py +185 -0
- memnex/models/paragraph.py +45 -0
- memnex/models/search.py +51 -0
- memnex/models/source.py +23 -0
- memnex/models/task.py +62 -0
- memnex/processing/__init__.py +1 -0
- memnex/processing/graph_builder.py +278 -0
- memnex/processing/merger/__init__.py +6 -0
- memnex/processing/merger/confidence_calculator.py +127 -0
- memnex/processing/merger/conflict_resolver.py +116 -0
- memnex/retrieval/__init__.py +1 -0
- memnex/retrieval/dedup.py +386 -0
- memnex/retrieval/embedding.py +289 -0
- memnex/retrieval/reranker.py +299 -0
- memnex/service.py +902 -0
- memnex/storage/__init__.py +65 -0
- memnex/storage/base.py +132 -0
- memnex/storage/changelog.py +106 -0
- memnex/storage/feedback.py +486 -0
- memnex/storage/lite/__init__.py +5 -0
- memnex/storage/lite/store.py +606 -0
- memnex/storage/vector.py +265 -0
- memnex/wiki/__init__.py +11 -0
- memnex/wiki/community.py +221 -0
- memnex/wiki/compiler.py +545 -0
- memnex/wiki/generator.py +270 -0
- memnex/wiki/search.py +282 -0
- memnex/worker.py +412 -0
- memplex-3.2.0.dist-info/METADATA +37 -0
- memplex-3.2.0.dist-info/RECORD +83 -0
- memplex-3.2.0.dist-info/WHEEL +5 -0
- memplex-3.2.0.dist-info/entry_points.txt +2 -0
- memplex-3.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,386 @@
|
|
|
1
|
+
"""MemoryDeduplicator -- exact + semantic deduplication for Compaction.
|
|
2
|
+
|
|
3
|
+
Dedup strategies (by scale, automatic fallback)::
|
|
4
|
+
|
|
5
|
+
1. FAISS IndexFlatIP -- exact inner-product search, O(n^2 * d), cross-domain, requires faiss-cpu
|
|
6
|
+
2. NumPy matrix -- O(n^2 * d) similarity matrix, grouped by domain above chunk_threshold, requires numpy
|
|
7
|
+
3. Pure Python -- O(n^2), zero dependencies, Lite fallback
|
|
8
|
+
|
|
9
|
+
Usage::
|
|
10
|
+
|
|
11
|
+
dedup = MemoryDeduplicator(embedding_service)
|
|
12
|
+
result = dedup.deduplicate(memories)
|
|
13
|
+
print(result.exact_removed, result.semantic_removed)
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import copy
|
|
19
|
+
import hashlib
|
|
20
|
+
import logging
|
|
21
|
+
import math
|
|
22
|
+
from enum import Enum
|
|
23
|
+
from typing import Dict, List, Optional
|
|
24
|
+
|
|
25
|
+
from memnex.retrieval.embedding import EmbeddingService
|
|
26
|
+
from memnex.models import DedupResult, Memory
|
|
27
|
+
|
|
28
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# ── Dedup strategy ────────────────────────────────────────────────────
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class DedupStrategy(Enum):
    """Selects which deduplication passes are run."""

    # Drop memories whose flattened text hashes to the same digest.
    EXACT = "exact"

    # Merge memories whose embeddings exceed the cosine-similarity threshold.
    SEMANTIC = "semantic"

    # Run the exact pass first, then the semantic pass on the survivors.
    BOTH = "both"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# ── MemoryDeduplicator ────────────────────────────────────────────────
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class MemoryDeduplicator:
    """Remove exact and near-duplicate memories.

    Used in the Compaction Pipeline's Dedup stage.  ``deduplicate`` runs an
    exact pass (SHA-256 of the flattened text) and/or a semantic pass
    (cosine similarity of embeddings), depending on *strategy*.

    Parameters
    ----------
    embedding_service:
        Object whose ``embed_batch(texts)`` returns one vector per text;
        used by the semantic pass.
    strategy:
        Which dedup passes to apply.
    threshold:
        Two memories are merged when their cosine similarity is strictly
        greater than this value (default 0.95).
    chunk_threshold:
        When FAISS is unavailable and more than this many memories are
        passed in, they are grouped by ``domain`` and each group is
        deduplicated separately.

    Notes
    -----
    The per-call counters are stored on the instance, so one instance
    should not execute ``deduplicate`` from several threads at once.
    """

    # Role attributes whose field-value lists form a memory's content.
    _ROLES = ("trigger", "condition", "action", "benefit")

    # Result columns requested from FAISS per query: the query itself plus
    # up to five neighbours.
    _FAISS_TOP_K = 6

    def __init__(
        self,
        embedding_service: EmbeddingService,
        strategy: DedupStrategy = DedupStrategy.BOTH,
        threshold: float = 0.95,
        chunk_threshold: int = 20000,
    ) -> None:
        self.embedder = embedding_service
        self.strategy = strategy
        self.threshold = threshold
        self.chunk_threshold = chunk_threshold

        # Counters reset at the start of each ``deduplicate`` call
        self._original_count: int = 0
        self._exact_removed: int = 0
        self._semantic_removed: int = 0

    # ── Public API ──────────────────────────────────────────────────

    def deduplicate(self, memories: List[Memory]) -> DedupResult:
        """Run the configured passes and return the survivors with statistics.

        The input list itself is not modified; the passes build new lists.
        """
        self._original_count = len(memories)
        self._exact_removed = 0
        self._semantic_removed = 0

        if self.strategy in (DedupStrategy.EXACT, DedupStrategy.BOTH):
            memories = self._exact_dedup(memories)

        if self.strategy in (DedupStrategy.SEMANTIC, DedupStrategy.BOTH):
            memories = self._semantic_dedup(memories)

        return DedupResult(
            original_count=self._original_count,
            final_count=len(memories),
            exact_removed=self._exact_removed,
            semantic_removed=self._semantic_removed,
            deduplicated=memories,
        )

    # ── Exact dedup ─────────────────────────────────────────────────

    def _exact_dedup(self, memories: List[Memory]) -> List[Memory]:
        """Remove exact duplicates by content hash.

        When two memories share the same hash, the *better* one is kept
        (see :meth:`_choose_better`).  Survivors keep the position of the
        first memory seen with their hash.
        """
        seen: Dict[str, Memory] = {}
        for m in memories:
            key = self._content_hash(m)
            if key not in seen:
                seen[key] = m
            else:
                seen[key] = self._choose_better(seen[key], m)
                self._exact_removed += 1
        return list(seen.values())

    @staticmethod
    def _choose_better(a: Memory, b: Memory) -> Memory:
        """Pick the higher-quality memory to keep.

        Priority: newer *updated_at* > more populated fields > higher
        confidence.  *a* wins when every criterion ties.
        """
        # updated_at may be str or datetime, so both are compared as strings.
        # NOTE(review): assumes both stringify to the same sortable format
        # (e.g. ISO-8601) -- confirm against the Memory model.
        a_updated = str(getattr(a, "updated_at", "") or "")
        b_updated = str(getattr(b, "updated_at", "") or "")
        if a_updated != b_updated:
            return a if a_updated > b_updated else b

        # Field completeness: number of roles with at least one value.
        roles = MemoryDeduplicator._ROLES
        a_fields = sum(1 for role in roles if getattr(a, role, None))
        b_fields = sum(1 for role in roles if getattr(b, role, None))
        if a_fields != b_fields:
            return a if a_fields > b_fields else b

        # Final tiebreak promised by the docstring.
        if MemoryDeduplicator._confidence_of(b) > MemoryDeduplicator._confidence_of(a):
            return b
        return a

    @staticmethod
    def _confidence_of(memory: Memory) -> float:
        """Return ``memory.confidence`` as a float.

        A missing, empty or non-numeric value counts as 0.0.
        NOTE(review): assumes the Memory model exposes a numeric
        ``confidence`` attribute -- confirm.
        """
        try:
            return float(getattr(memory, "confidence", 0.0) or 0.0)
        except (TypeError, ValueError):
            return 0.0

    # ── Semantic dedup (dispatcher) ─────────────────────────────────

    def _semantic_dedup(self, memories: List[Memory]) -> List[Memory]:
        """Semantic dedup with automatic backend selection.

        Routing:
          1. FAISS ``IndexFlatIP`` (if *faiss* is importable) -- global,
             cross-domain.
          2. NumPy similarity matrix (if *numpy* is importable) -- run per
             domain group when n > chunk_threshold.
          3. Pure-Python pairwise -- O(n^2), zero dependencies.
        """
        # Nothing can be merged; skip the embedding call entirely.
        if len(memories) < 2:
            return list(memories)

        # 1. FAISS (best).  Only the import is guarded, so an ImportError
        # raised while deduplicating is not mistaken for "faiss missing".
        try:
            import faiss  # noqa: F401 -- check availability
        except ImportError:
            pass
        else:
            return self._semantic_dedup_faiss(memories)

        # 2. NumPy (good for medium scale)
        if len(memories) > self.chunk_threshold:
            # Split by domain to keep the similarity matrix bounded
            grouped: Dict[str, List[Memory]] = {}
            for m in memories:
                domain = getattr(m, "domain", "unknown") or "unknown"
                grouped.setdefault(domain, []).append(m)
            result: List[Memory] = []
            for group in grouped.values():
                result.extend(self._semantic_dedup_chunk(group))
            return result

        return self._semantic_dedup_chunk(memories)

    # ── FAISS dedup ─────────────────────────────────────────────────

    def _semantic_dedup_faiss(self, memories: List[Memory]) -> List[Memory]:
        """FAISS IndexFlatIP with Union-Find clustering.

        Steps:
          1. Batch embed + L2-normalise (inner product == cosine).
          2. Build IndexFlatIP, search the query plus up to 5 neighbours.
          3. Union-Find groups memories above *threshold*.
          4. Merge each cluster into one representative memory.
        """
        import faiss  # type: ignore
        import numpy as np  # type: ignore

        if not memories:
            return []

        texts = [self._memory_to_text(m) for m in memories]
        raw_vecs = np.array(
            self.embedder.embed_batch(texts), dtype="float32"
        )

        # Normalise to unit vectors so IP == cosine; the epsilon avoids a
        # division by zero for all-zero embeddings.
        norms = np.linalg.norm(raw_vecs, axis=1, keepdims=True)
        # FAISS expects C-contiguous float32 input.
        vecs = np.ascontiguousarray(raw_vecs / (norms + 1e-8), dtype="float32")

        index = faiss.IndexFlatIP(vecs.shape[1])
        index.add(vecs)

        k = min(self._FAISS_TOP_K, len(memories))
        distances, indices = index.search(vecs, k)

        # Union-Find with path halving
        parent = list(range(len(memories)))

        def find(x: int) -> int:
            while parent[x] != x:
                parent[x] = parent[parent[x]]
                x = parent[x]
            return x

        def union(x: int, y: int) -> None:
            parent[find(x)] = find(y)

        for i in range(len(memories)):
            # Every column is inspected and the query is skipped by index:
            # with identical embeddings the query is not guaranteed to be
            # returned in column 0.
            for col in range(k):
                j = int(indices[i][col])
                if j < 0 or j == i:
                    continue
                if float(distances[i][col]) > self.threshold:
                    union(i, j)

        # Collect clusters, ordered by their first member
        clusters: Dict[int, List[int]] = {}
        for i in range(len(memories)):
            clusters.setdefault(find(i), []).append(i)

        result: List[Memory] = []
        for idxs in clusters.values():
            result.append(self._merge_memories([memories[i] for i in idxs]))
            self._semantic_removed += len(idxs) - 1

        return result

    # ── NumPy matrix dedup ──────────────────────────────────────────

    def _semantic_dedup_chunk(self, memories: List[Memory]) -> List[Memory]:
        """Dedup one chunk using a NumPy cosine similarity matrix.

        Falls back to :meth:`_semantic_dedup_fallback` when NumPy cannot be
        imported.
        """
        if not memories:
            return []

        texts = [self._memory_to_text(m) for m in memories]
        embeddings = self.embedder.embed_batch(texts)

        # Try numpy; fall back to pure Python
        try:
            import numpy as np  # type: ignore
        except ImportError:
            return self._semantic_dedup_fallback(memories, embeddings)

        emb_matrix = np.array(embeddings, dtype="float64")
        norms = np.linalg.norm(emb_matrix, axis=1, keepdims=True)
        emb_normalized = emb_matrix / (norms + 1e-8)
        sim_matrix = emb_normalized @ emb_normalized.T

        return self._greedy_cluster(memories, lambda i, j: sim_matrix[i][j])

    # ── Pure-Python fallback dedup ──────────────────────────────────

    def _semantic_dedup_fallback(
        self, memories: List[Memory], embeddings: List[list]
    ) -> List[Memory]:
        """O(n^2) pairwise cosine similarity, zero dependencies."""
        return self._greedy_cluster(
            memories,
            lambda i, j: self._cosine_sim(embeddings[i], embeddings[j]),
        )

    def _greedy_cluster(self, memories: List[Memory], similarity) -> List[Memory]:
        """Greedily group memories and merge each group.

        Each not-yet-consumed memory seeds a group and absorbs every later
        memory whose ``similarity(seed_index, other_index)`` is strictly
        above *threshold*.  *similarity* is a callable taking two positions
        in *memories* and returning their cosine similarity.

        Consumption is tracked by position rather than by ``id``, so
        memories that share an id (or have none) are never dropped silently.
        """
        result: List[Memory] = []
        consumed = [False] * len(memories)

        for i, seed in enumerate(memories):
            if consumed[i]:
                continue
            consumed[i] = True
            group = [seed]

            for j in range(i + 1, len(memories)):
                if consumed[j]:
                    continue
                if similarity(i, j) > self.threshold:
                    group.append(memories[j])
                    consumed[j] = True
                    self._semantic_removed += 1

            result.append(self._merge_memories(group))

        return result

    # ── Merge helpers ───────────────────────────────────────────────

    @staticmethod
    def _merge_memories(memories: List[Memory]) -> Memory:
        """Merge a list of similar memories into one.

        Strategy:
        - Keep the newest *updated_at* as the base (deep-copy to avoid
          mutating live objects in MemoryStore).
        - Merge *source_paragraphs* and role field-values, deduplicating
          role values by ``desc`` string.

        A single-element list is returned as-is (no copy).

        Raises
        ------
        ValueError
            If *memories* is empty.
        """
        if not memories:
            raise ValueError("cannot merge an empty list of memories")
        if len(memories) == 1:
            return memories[0]

        newest = max(
            memories, key=lambda m: str(getattr(m, "updated_at", "") or "")
        )
        base = copy.deepcopy(newest)

        def target_list(attr: str) -> list:
            # Return base's list for *attr*, creating it when absent or None.
            current = getattr(base, attr, None)
            if current is None:
                current = []
                setattr(base, attr, current)
            return current

        for m in memories:
            # Skip by identity: distinct memories may share an id and must
            # still contribute their content.
            if m is newest:
                continue

            # Merge source_paragraphs
            incoming = getattr(m, "source_paragraphs", None) or []
            if incoming:
                paragraphs = target_list("source_paragraphs")
                for sp in incoming:
                    if sp not in paragraphs:
                        paragraphs.append(sp)

            # Merge role field-values
            for role in MemoryDeduplicator._ROLES:
                values = getattr(m, role, None) or []
                if not values:
                    continue
                merged_values = target_list(role)
                existing_descs = {fv.desc for fv in merged_values}
                for fv in values:
                    if fv.desc not in existing_descs:
                        merged_values.append(fv)
                        existing_descs.add(fv.desc)

        return base

    # ── Utility ─────────────────────────────────────────────────────

    @staticmethod
    def _cosine_sim(a: list, b: list) -> float:
        """Pure-Python cosine similarity (no numpy).

        The epsilon in the denominator makes zero vectors yield 0.0 rather
        than raising.  ``zip`` stops at the shorter vector.
        """
        dot = sum(x * y for x, y in zip(a, b))
        norm_a = sum(x * x for x in a) ** 0.5
        norm_b = sum(x * x for x in b) ** 0.5
        return dot / (norm_a * norm_b + 1e-8)

    def _content_hash(self, memory: Memory) -> str:
        """SHA-256 hex digest of the memory's text representation."""
        content = self._memory_to_text(memory)
        return hashlib.sha256(content.encode("utf-8")).hexdigest()

    @staticmethod
    def _memory_to_text(memory: Memory) -> str:
        """Flatten a memory into a single text for comparison.

        The text is the name followed by every role value's ``desc``,
        joined by single spaces.  A role that is missing or ``None`` is
        treated as empty.
        """
        parts = [memory.name]
        for role in MemoryDeduplicator._ROLES:
            for fv in getattr(memory, role, None) or []:
                parts.append(fv.desc)
        return " ".join(parts)
|
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
"""EmbeddingService -- vector embedding generation, storage and refresh.
|
|
2
|
+
|
|
3
|
+
Supports multiple embedding models, configurable dimension, batch size,
|
|
4
|
+
and Contextual Retrieval (Anthropic's document-context prefix injection).
|
|
5
|
+
|
|
6
|
+
Embedding strategies::
|
|
7
|
+
|
|
8
|
+
NAME_ONLY -- function name only
|
|
9
|
+
NAME_DOMAIN -- name + domain
|
|
10
|
+
FULL -- name + domain + trigger + action + benefit
|
|
11
|
+
SEMANTIC -- concise semantic summary for search
|
|
12
|
+
|
|
13
|
+
Usage::
|
|
14
|
+
|
|
15
|
+
svc = EmbeddingService(model="default", storage=store, vector_store=vs)
|
|
16
|
+
vector = svc.embed("some text")
|
|
17
|
+
svc.refresh_all()
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import logging
|
|
23
|
+
from enum import Enum
|
|
24
|
+
from typing import List, Optional, TYPE_CHECKING
|
|
25
|
+
|
|
26
|
+
from memnex.models import Function, RefreshResult
|
|
27
|
+
|
|
28
|
+
if TYPE_CHECKING:
|
|
29
|
+
from memnex.storage.base import MemoryStore
|
|
30
|
+
from memnex.storage.vector import VectorStore as VectorStoreProtocol
|
|
31
|
+
|
|
32
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
# Type alias for embedding vectors: a flat list of floats.
Vector = List[float]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# ── Embedding strategies ────────────────────────────────────────────
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class EmbeddingStrategy(Enum):
    """Chooses which parts of a Function go into its embeddable text."""

    # Function name only.
    NAME_ONLY = "name"

    # Name followed by the domain.
    NAME_DOMAIN = "name_domain"

    # Name, domain and every trigger / action / benefit description.
    FULL = "full"

    # Name, domain and at most two trigger and two action descriptions.
    SEMANTIC = "semantic"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# ── Embedder backends ───────────────────────────────────────────────
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class _SentenceTransformerEmbedder:
    """Embedder backed by ``sentence_transformers.SentenceTransformer``."""

    def __init__(self, model_name: str, dimension: int) -> None:
        # Imported here so the module loads without the optional
        # dependency; a missing package surfaces as ImportError from this
        # constructor.
        from sentence_transformers import SentenceTransformer  # type: ignore

        # *dimension* is only recorded; it is not checked against the model.
        self.dimension = dimension
        self._model = SentenceTransformer(model_name)

    def encode(self, text: str) -> Vector:
        """Embed one text and return it as a plain list."""
        rows = self._model.encode([text])
        return rows[0].tolist()

    def encode_batch(self, texts: List[str], batch_size: int = 32) -> List[Vector]:
        """Embed *texts* in model batches of *batch_size*, one list per text."""
        rows = self._model.encode(texts, batch_size=batch_size)
        return [row.tolist() for row in rows]
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class _SimpleTFIDFEmbedder:
    """Fallback embedder used when sentence-transformers is unavailable.

    Produces a TF-IDF-inspired bag-of-words vector of fixed length
    *dimension*.  Each new word receives the next vocabulary index and
    contributes to bucket ``index % dimension``, so words share buckets
    once the vocabulary outgrows *dimension*.

    ``encode`` updates the vocabulary and document-frequency statistics on
    every call, so the vector produced for a text depends on the texts
    encoded before it.
    """

    def __init__(self, dimension: int = 384) -> None:
        self.dimension = dimension
        self._vocab: dict = {}  # word -> vocabulary index
        self._idf: dict = {}  # word -> number of documents containing it
        self._doc_count: int = 0  # non-empty documents encoded so far

    def encode(self, text: str) -> Vector:
        """Return the L2-normalised TF-IDF vector for *text*.

        Text without any whitespace-separated word yields an all-zero
        vector and leaves the statistics untouched.
        """
        # Function-scope import: the module does not import math at the top.
        import math

        words = text.lower().split()
        if not words:
            return [0.0] * self.dimension

        # Term counts; the dict keeps words in first-occurrence order.
        tf: dict = {}
        for w in words:
            tf[w] = tf.get(w, 0) + 1

        # Update vocabulary.  Iterating the dict rather than a set keeps
        # index assignment independent of string hash randomisation, so the
        # same inputs give the same vectors in every process.
        self._doc_count += 1
        for w in tf:
            if w not in self._vocab:
                self._vocab[w] = len(self._vocab)
            self._idf[w] = self._idf.get(w, 0) + 1

        # TF-IDF vector
        vec = [0.0] * self.dimension
        total = len(words)
        for w, count in tf.items():
            idx = self._vocab[w] % self.dimension
            idf = self._doc_count / (self._idf[w] + 1)
            # Accumulate: several words can land in the same bucket and
            # must not overwrite each other.
            vec[idx] += (count / total) * math.log(idf + 1)

        # L2 normalize
        norm = sum(x * x for x in vec) ** 0.5
        if norm > 0:
            vec = [x / norm for x in vec]
        return vec

    def encode_batch(self, texts: List[str], batch_size: int = 32) -> List[Vector]:
        """Encode *texts* one by one, in order.

        *batch_size* is accepted for interface compatibility and unused.
        """
        return [self.encode(t) for t in texts]
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# ── EmbeddingService ─────────────────────────────────────────────────
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
class EmbeddingService:
    """Vector embedding service.

    Parameters
    ----------
    model:
        Embedding model name.  ``"default"`` maps to
        ``all-MiniLM-L6-v2``; names not in ``_MODEL_MAP`` are passed
        through unchanged.  Falls back to the TF-IDF embedder when
        sentence-transformers is not installed or the model fails to load.
    dimension:
        Embedding vector dimension.
    storage:
        Optional :class:`MemoryStore` for ``refresh`` / ``refresh_all``.
    vector_store:
        Optional :class:`VectorStore` for upsert operations.
    """

    # Short aliases accepted for *model*.
    _MODEL_MAP = {
        "default": "all-MiniLM-L6-v2",
        "bge-m3": "BAAI/bge-m3",
        "bge-small": "BAAI/bge-small-en-v1.5",
    }

    def __init__(
        self,
        model: str = "default",
        dimension: int = 384,
        storage: Optional["MemoryStore"] = None,
        vector_store: Optional["VectorStoreProtocol"] = None,
    ) -> None:
        self.model = model
        self.dimension = dimension
        self.storage = storage
        self.vector_store = vector_store
        self._embedder = self._create_embedder(model, dimension)

    # ── Public API ──────────────────────────────────────────────────

    def embed(self, text: str) -> Vector:
        """Generate an embedding vector for a single text."""
        return self._embedder.encode(text)

    def embed_batch(
        self, texts: List[str], batch_size: int = 32
    ) -> List[Vector]:
        """Batch generate embedding vectors, one per input text."""
        return self._embedder.encode_batch(texts, batch_size=batch_size)

    def embed_function(
        self,
        func: Function,
        source: Optional[object] = None,
        use_contextual: bool = True,
    ) -> Vector:
        """Generate an embedding for a Function.

        When *source* is available and *use_contextual* is True, a document
        context prefix is prepended (Contextual Retrieval).  The prefix
        names the source's ``url``, else its ``source_path``, else
        ``"unknown"``, together with the function's domain.
        """
        content = self.function_to_text(func)
        if use_contextual and source is not None:
            # The path is stringified only when present: str(None) would
            # otherwise produce the literal origin "None".
            source_path = getattr(source, "source_path", None)
            origin = (
                getattr(source, "url", None)
                or (str(source_path) if source_path is not None else "")
                or "unknown"
            )
            content = (
                f"[文档: {origin} | 领域: {func.domain or '未分类'}] {content}"
            )
        return self.embed(content)

    def function_to_text(
        self,
        func: Function,
        strategy: EmbeddingStrategy = EmbeddingStrategy.FULL,
    ) -> str:
        """Convert a Function to embeddable text per *strategy*.

        - NAME_ONLY: the name.
        - NAME_DOMAIN: name and domain.
        - SEMANTIC: name, domain and at most two trigger and two action
          descriptions, each group behind a label.
        - FULL (default): name, domain and every trigger, action and
          benefit description.
        """
        if strategy == EmbeddingStrategy.NAME_ONLY:
            return func.name

        if strategy == EmbeddingStrategy.NAME_DOMAIN:
            return f"{func.name} {func.domain or ''}"

        parts = [func.name, func.domain or ""]

        if strategy == EmbeddingStrategy.SEMANTIC:
            if func.trigger:
                parts.append(
                    f"触发: {'; '.join(fv.desc for fv in func.trigger[:2])}"
                )
            if func.action:
                parts.append(
                    f"动作: {'; '.join(fv.desc for fv in func.action[:2])}"
                )
            return " ".join(parts)

        # FULL
        for role_values in (func.trigger, func.action, func.benefit):
            parts.extend(fv.desc for fv in role_values)
        return " ".join(parts)

    def refresh(self, func_id: str) -> None:
        """Re-embed a single Function and upsert into the vector store.

        Logs a warning and does nothing when storage or the vector store
        is not configured, or when the Function cannot be found.
        """
        if self.storage is None or self.vector_store is None:
            logger.warning("Cannot refresh: storage or vector_store not configured")
            return
        func = self.storage.get(func_id)
        if func is None:
            logger.warning("Function %s not found for refresh", func_id)
            return
        vector = self.embed_function(func)
        self.vector_store.upsert(func_id, vector)

    def refresh_all(self, batch_size: int = 100) -> RefreshResult:
        """Re-embed all Functions in batches and upsert into the vector store.

        Used after model or strategy changes.  Functions are read page by
        page with ``storage.list_functions(offset=..., limit=batch_size)``
        until an empty page is returned.

        Raises
        ------
        ValueError
            If *batch_size* is smaller than 1.
        """
        if self.storage is None or self.vector_store is None:
            logger.warning("Cannot refresh_all: storage or vector_store not configured")
            return RefreshResult(total=0, refreshed=0)

        # A non-positive page size could never advance the offset.
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        refreshed = 0
        offset = 0

        while True:
            batch = self.storage.list_functions(
                offset=offset, limit=batch_size
            )
            if not batch:
                break

            texts = [self.function_to_text(f) for f in batch]
            vectors = self.embed_batch(texts)
            self.vector_store.upsert_batch(
                {f.id: v for f, v in zip(batch, vectors)}
            )
            refreshed += len(batch)
            # Advance by the rows actually returned so that no Function is
            # processed twice if the store returns a page of another size.
            offset += len(batch)

        return RefreshResult(total=refreshed, refreshed=refreshed)

    # ── Private ─────────────────────────────────────────────────────

    def _create_embedder(self, model: str, dimension: int):
        """Create the embedder backend for *model*.

        Tries the sentence-transformers backend first and falls back to
        the TF-IDF embedder when the package is missing (logged at INFO)
        or the model cannot be loaded (logged at WARNING).
        """
        model_name = self._MODEL_MAP.get(model, model)

        try:
            return _SentenceTransformerEmbedder(model_name, dimension)
        except ImportError:
            logger.info(
                "sentence-transformers not available, "
                "falling back to TF-IDF embedder"
            )
            return _SimpleTFIDFEmbedder(dimension=dimension)
        except Exception as exc:
            # Deliberate best-effort: any load failure degrades to TF-IDF.
            logger.warning(
                "Failed to load sentence-transformers model %s: %s. "
                "Falling back to TF-IDF embedder",
                model_name,
                exc,
            )
            return _SimpleTFIDFEmbedder(dimension=dimension)
|