memplex 3.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. memnex/__init__.py +31 -0
  2. memnex/__main__.py +6 -0
  3. memnex/_plugin/.claude-plugin/plugin.json +24 -0
  4. memnex/_plugin/.mcp.json +9 -0
  5. memnex/_plugin/__init__.py +0 -0
  6. memnex/_plugin/hooks/hooks.json +43 -0
  7. memnex/_plugin/scripts/hook-runner.py +166 -0
  8. memnex/_plugin/skills/mem-explore/SKILL.md +83 -0
  9. memnex/_plugin/skills/mem-manage/SKILL.md +92 -0
  10. memnex/_plugin/skills/mem-search/SKILL.md +85 -0
  11. memnex/_plugin/skills/mem-write/SKILL.md +78 -0
  12. memnex/adapters/__init__.py +14 -0
  13. memnex/adapters/claude_skill.py +169 -0
  14. memnex/adapters/cli.py +525 -0
  15. memnex/adapters/http_api.py +314 -0
  16. memnex/adapters/mcp_server.py +448 -0
  17. memnex/compaction.py +563 -0
  18. memnex/config.py +366 -0
  19. memnex/core/__init__.py +13 -0
  20. memnex/core/associator/__init__.py +8 -0
  21. memnex/core/associator/domain_classifier.py +75 -0
  22. memnex/core/associator/entity_aligner.py +127 -0
  23. memnex/core/associator/ref_linker.py +197 -0
  24. memnex/core/associator/term_mapper.py +77 -0
  25. memnex/core/dictionaries/__init__.py +50 -0
  26. memnex/core/engine.py +667 -0
  27. memnex/core/extractors/__init__.py +15 -0
  28. memnex/core/extractors/docx.py +97 -0
  29. memnex/core/extractors/image.py +233 -0
  30. memnex/core/extractors/markdown.py +139 -0
  31. memnex/core/extractors/pdf.py +133 -0
  32. memnex/core/extractors/vision_mapper.py +131 -0
  33. memnex/core/handlers/__init__.py +7 -0
  34. memnex/core/handlers/clipboard.py +40 -0
  35. memnex/core/handlers/file_handler.py +62 -0
  36. memnex/core/handlers/url_handler.py +132 -0
  37. memnex/llm/__init__.py +25 -0
  38. memnex/llm/enhancer.py +226 -0
  39. memnex/llm/fallback_chain.py +87 -0
  40. memnex/llm/injection_guard.py +178 -0
  41. memnex/llm/provider.py +130 -0
  42. memnex/llm/providers/__init__.py +22 -0
  43. memnex/llm/providers/anthropic.py +135 -0
  44. memnex/llm/providers/local.py +135 -0
  45. memnex/llm/providers/rule_based.py +68 -0
  46. memnex/llm/sanitizer.py +67 -0
  47. memnex/models/__init__.py +68 -0
  48. memnex/models/feedback.py +42 -0
  49. memnex/models/graph.py +33 -0
  50. memnex/models/memory.py +102 -0
  51. memnex/models/misc.py +185 -0
  52. memnex/models/paragraph.py +45 -0
  53. memnex/models/search.py +51 -0
  54. memnex/models/source.py +23 -0
  55. memnex/models/task.py +62 -0
  56. memnex/processing/__init__.py +1 -0
  57. memnex/processing/graph_builder.py +278 -0
  58. memnex/processing/merger/__init__.py +6 -0
  59. memnex/processing/merger/confidence_calculator.py +127 -0
  60. memnex/processing/merger/conflict_resolver.py +116 -0
  61. memnex/retrieval/__init__.py +1 -0
  62. memnex/retrieval/dedup.py +386 -0
  63. memnex/retrieval/embedding.py +289 -0
  64. memnex/retrieval/reranker.py +299 -0
  65. memnex/service.py +902 -0
  66. memnex/storage/__init__.py +65 -0
  67. memnex/storage/base.py +132 -0
  68. memnex/storage/changelog.py +106 -0
  69. memnex/storage/feedback.py +486 -0
  70. memnex/storage/lite/__init__.py +5 -0
  71. memnex/storage/lite/store.py +606 -0
  72. memnex/storage/vector.py +265 -0
  73. memnex/wiki/__init__.py +11 -0
  74. memnex/wiki/community.py +221 -0
  75. memnex/wiki/compiler.py +545 -0
  76. memnex/wiki/generator.py +270 -0
  77. memnex/wiki/search.py +282 -0
  78. memnex/worker.py +412 -0
  79. memplex-3.2.0.dist-info/METADATA +37 -0
  80. memplex-3.2.0.dist-info/RECORD +83 -0
  81. memplex-3.2.0.dist-info/WHEEL +5 -0
  82. memplex-3.2.0.dist-info/entry_points.txt +2 -0
  83. memplex-3.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,386 @@
1
+ """MemoryDeduplicator -- exact + semantic deduplication for Compaction.
2
+
3
+ Dedup strategies (by scale, automatic fallback)::
4
+
5
+ 1. FAISS flat index -- exact top-k inner-product search (IndexFlatIP), cross-domain, requires faiss-cpu
6
+ 2. NumPy matrix -- O(n^2 * d) pairwise cosine, split by domain above chunk_threshold, requires numpy
7
+ 3. Pure Python -- O(n^2), zero dependencies, Lite fallback
8
+
9
+ Usage::
10
+
11
+ dedup = MemoryDeduplicator(embedding_service)
12
+ result = dedup.deduplicate(memories)
13
+ print(result.exact_removed, result.semantic_removed)
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import copy
19
+ import hashlib
20
+ import logging
21
+ import math
22
+ from enum import Enum
23
+ from typing import Dict, List, Optional
24
+
25
+ from memnex.retrieval.embedding import EmbeddingService
26
+ from memnex.models import DedupResult, Memory
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
+ # ── Dedup strategy ────────────────────────────────────────────────────
32
+
33
+
34
class DedupStrategy(Enum):
    """Selects which deduplication passes ``MemoryDeduplicator`` runs."""

    # Collapse memories whose flattened text hashes to the same digest.
    EXACT = "exact"
    # Cluster memories whose embeddings exceed the cosine threshold.
    SEMANTIC = "semantic"
    # Run the exact pass first, then the semantic pass on the survivors.
    BOTH = "both"
40
+
41
+
42
+ # ── MemoryDeduplicator ────────────────────────────────────────────────
43
+
44
+
45
class MemoryDeduplicator:
    """Exact + semantic memory deduplicator for the Compaction pipeline.

    Used in the Compaction Pipeline's Dedup stage.

    Parameters
    ----------
    embedding_service:
        Provides ``embed_batch`` for the semantic pass.
    strategy:
        Which dedup passes to apply (exact, semantic, or both).
    threshold:
        Cosine similarity a pair must *exceed* to be treated as semantic
        duplicates (default 0.95).
    chunk_threshold:
        When FAISS is unavailable and more than this many memories are
        supplied, the semantic pass runs separately per domain.
    """

    # Role attributes whose field-values contribute to a memory's text.
    _ROLES = ("trigger", "condition", "action", "benefit")

    def __init__(
        self,
        embedding_service: EmbeddingService,
        strategy: DedupStrategy = DedupStrategy.BOTH,
        threshold: float = 0.95,
        chunk_threshold: int = 20000,
    ) -> None:
        self.embedder = embedding_service
        self.strategy = strategy
        self.threshold = threshold
        self.chunk_threshold = chunk_threshold

        # Counters are reset at the start of each ``deduplicate`` call.
        self._original_count: int = 0
        self._exact_removed: int = 0
        self._semantic_removed: int = 0

    # ── Public API ──────────────────────────────────────────────────

    def deduplicate(self, memories: List[Memory]) -> DedupResult:
        """Run the configured passes and return survivors plus statistics."""
        self._original_count = len(memories)
        self._exact_removed = 0
        self._semantic_removed = 0

        if self.strategy in (DedupStrategy.EXACT, DedupStrategy.BOTH):
            memories = self._exact_dedup(memories)

        if self.strategy in (DedupStrategy.SEMANTIC, DedupStrategy.BOTH):
            memories = self._semantic_dedup(memories)

        return DedupResult(
            original_count=self._original_count,
            final_count=len(memories),
            exact_removed=self._exact_removed,
            semantic_removed=self._semantic_removed,
            deduplicated=memories,
        )

    # ── Exact dedup ─────────────────────────────────────────────────

    def _exact_dedup(self, memories: List[Memory]) -> List[Memory]:
        """Remove exact duplicates by content hash.

        When two memories share the same hash, the better one is kept
        (see :meth:`_choose_better`) and the other is discarded, not
        merged. Survivors keep first-seen order.
        """
        seen: Dict[str, Memory] = {}
        for m in memories:
            key = self._content_hash(m)
            if key in seen:
                seen[key] = self._choose_better(seen[key], m)
                self._exact_removed += 1
            else:
                seen[key] = m
        return list(seen.values())

    @staticmethod
    def _choose_better(a: Memory, b: Memory) -> Memory:
        """Pick the higher-quality memory to keep.

        Priority: newer *updated_at* > more populated role fields >
        higher confidence. A complete tie keeps ``a``.
        """
        # updated_at may be str or datetime; compare the string forms.
        a_updated = str(getattr(a, "updated_at", "") or "")
        b_updated = str(getattr(b, "updated_at", "") or "")
        if a_updated != b_updated:
            return a if a_updated > b_updated else b

        # Field completeness: number of non-empty role lists.
        roles = MemoryDeduplicator._ROLES
        a_fields = sum(1 for role in roles if getattr(a, role, None))
        b_fields = sum(1 for role in roles if getattr(b, role, None))
        if a_fields != b_fields:
            return a if a_fields > b_fields else b

        # Confidence tie-break promised by the documented priority order.
        # NOTE(review): assumes a numeric ``confidence`` attribute; the
        # comparison is skipped when it is absent or non-numeric.
        a_conf = getattr(a, "confidence", None)
        b_conf = getattr(b, "confidence", None)
        if (
            isinstance(a_conf, (int, float))
            and isinstance(b_conf, (int, float))
            and b_conf > a_conf
        ):
            return b
        return a

    # ── Semantic dedup (dispatcher) ─────────────────────────────────

    def _semantic_dedup(self, memories: List[Memory]) -> List[Memory]:
        """Semantic dedup with automatic backend selection.

        Routing:
          1. FAISS (if *faiss* and *numpy* import) -- global, cross-domain.
          2. NumPy cosine pass -- per domain when n > chunk_threshold.
          3. Pure-Python pairwise -- chosen inside the chunk pass when
             *numpy* is not importable.
        """
        # Only the availability probe sits in the ``try`` so that errors
        # raised by the FAISS pass itself are not mistaken for a missing
        # dependency.
        try:
            import faiss  # noqa: F401 -- availability check
            import numpy  # noqa: F401 -- required by the FAISS pass
        except ImportError:
            faiss_available = False
        else:
            faiss_available = True

        if faiss_available:
            return self._semantic_dedup_faiss(memories)

        if len(memories) > self.chunk_threshold:
            # Split by domain to keep each pass bounded.
            grouped: Dict[str, List[Memory]] = {}
            for m in memories:
                domain = getattr(m, "domain", "unknown") or "unknown"
                grouped.setdefault(domain, []).append(m)
            result: List[Memory] = []
            for group in grouped.values():
                result.extend(self._semantic_dedup_chunk(group))
            return result

        return self._semantic_dedup_chunk(memories)

    # ── FAISS dedup ─────────────────────────────────────────────────

    def _semantic_dedup_faiss(self, memories: List[Memory]) -> List[Memory]:
        """FAISS IndexFlatIP neighbours clustered with Union-Find.

        Steps:
          1. Batch embed + L2-normalise (inner product == cosine).
          2. Build IndexFlatIP and fetch up to 6 hits per memory.
          3. Union memories whose similarity exceeds *threshold*.
          4. Merge each cluster into one representative memory.
        """
        import faiss  # type: ignore
        import numpy as np  # type: ignore

        if not memories:
            return []

        texts = [self._memory_to_text(m) for m in memories]
        raw_vecs = np.array(self.embedder.embed_batch(texts), dtype="float32")

        # Normalise to unit vectors so IP == cosine; the epsilon keeps
        # all-zero embeddings from dividing by zero.
        norms = np.linalg.norm(raw_vecs, axis=1, keepdims=True)
        vecs = raw_vecs / (norms + 1e-8)

        index = faiss.IndexFlatIP(vecs.shape[1])
        index.add(vecs)

        k = min(6, len(memories))
        distances, indices = index.search(vecs, k)

        # Union-Find with path halving.
        parent = list(range(len(memories)))

        def find(x: int) -> int:
            while parent[x] != x:
                parent[x] = parent[parent[x]]
                x = parent[x]
            return x

        def union(x: int, y: int) -> None:
            parent[find(x)] = find(y)

        # Inspect every returned hit and skip the query itself by index.
        # The query is not guaranteed to sit in slot 0 when another
        # vector ties with it at the top score, so skipping slot 0
        # blindly would drop a real neighbour.
        for i, (row_scores, row_ids) in enumerate(zip(distances, indices)):
            for score, neighbour in zip(row_scores, row_ids):
                j = int(neighbour)
                if j < 0 or j == i:
                    continue
                if float(score) > self.threshold:
                    union(i, j)

        # Collect clusters keyed by their root.
        clusters: Dict[int, List[int]] = {}
        for i in range(len(memories)):
            clusters.setdefault(find(i), []).append(i)

        result: List[Memory] = []
        for idxs in clusters.values():
            result.append(self._merge_memories([memories[i] for i in idxs]))
            self._semantic_removed += len(idxs) - 1
        return result

    # ── NumPy dedup ─────────────────────────────────────────────────

    def _semantic_dedup_chunk(self, memories: List[Memory]) -> List[Memory]:
        """Dedup one chunk with NumPy cosine similarity.

        Greedy clustering: each not-yet-absorbed memory absorbs every
        later memory whose similarity to it exceeds *threshold*. Falls
        back to :meth:`_semantic_dedup_fallback` without numpy.
        """
        if not memories:
            return []

        texts = [self._memory_to_text(m) for m in memories]
        embeddings = self.embedder.embed_batch(texts)

        try:
            import numpy as np  # type: ignore
        except ImportError:
            return self._semantic_dedup_fallback(memories, embeddings)

        emb_matrix = np.array(embeddings, dtype="float64")
        norms = np.linalg.norm(emb_matrix, axis=1, keepdims=True)
        unit = emb_matrix / (norms + 1e-8)

        total = len(memories)
        # Track absorption by position, not by ``id``: ids may collide or
        # be missing, which would silently drop memories.
        absorbed = [False] * total
        result: List[Memory] = []

        for i in range(total):
            if absorbed[i]:
                continue
            absorbed[i] = True
            similar = [memories[i]]

            if i + 1 < total:
                # One row of similarities at a time keeps peak memory at
                # O(n) instead of materialising the full n x n matrix.
                sims = unit[i + 1:] @ unit[i]
                for offset in np.nonzero(sims > self.threshold)[0]:
                    j = i + 1 + int(offset)
                    if absorbed[j]:
                        continue
                    absorbed[j] = True
                    similar.append(memories[j])
                    self._semantic_removed += 1

            result.append(self._merge_memories(similar))

        return result

    # ── Pure-Python fallback dedup ──────────────────────────────────

    def _semantic_dedup_fallback(
        self, memories: List[Memory], embeddings: List[list]
    ) -> List[Memory]:
        """O(n^2) pairwise cosine similarity, zero dependencies.

        Same greedy clustering as :meth:`_semantic_dedup_chunk`.
        """
        total = len(memories)
        # Position-based tracking, for the same reason as the NumPy pass.
        absorbed = [False] * total
        result: List[Memory] = []

        for i in range(total):
            if absorbed[i]:
                continue
            absorbed[i] = True
            similar = [memories[i]]

            for j in range(i + 1, total):
                if absorbed[j]:
                    continue
                if self._cosine_sim(embeddings[i], embeddings[j]) > self.threshold:
                    absorbed[j] = True
                    similar.append(memories[j])
                    self._semantic_removed += 1

            result.append(self._merge_memories(similar))
        return result

    # ── Merge helpers ───────────────────────────────────────────────

    @staticmethod
    def _merge_memories(memories: List[Memory]) -> Memory:
        """Merge a list of similar memories into one.

        Strategy:
          - A single memory is returned as-is (no copy).
          - Otherwise the memory with the greatest *updated_at* string is
            deep-copied as the base, so the input object is not mutated.
          - *source_paragraphs* and role field-values from the others are
            appended to the base, deduplicating field-values by ``desc``.

        Raises ``ValueError`` (from ``max``) when *memories* is empty.
        """
        if len(memories) == 1:
            return memories[0]

        newest = max(
            memories, key=lambda m: str(getattr(m, "updated_at", "") or "")
        )
        base = copy.deepcopy(newest)

        for m in memories:
            # Skip by identity: comparing ids would also skip distinct
            # memories that happen to share the base's id.
            if m is newest:
                continue
            for sp in getattr(m, "source_paragraphs", None) or []:
                if sp not in base.source_paragraphs:
                    base.source_paragraphs.append(sp)
            for role in MemoryDeduplicator._ROLES:
                incoming = getattr(m, role, None) or []
                if not incoming:
                    continue
                existing_descs = {fv.desc for fv in getattr(base, role, [])}
                for fv in incoming:
                    if fv.desc not in existing_descs:
                        getattr(base, role).append(fv)
                        existing_descs.add(fv.desc)

        return base

    # ── Utility ─────────────────────────────────────────────────────

    @staticmethod
    def _cosine_sim(a: list, b: list) -> float:
        """Pure-Python cosine similarity (no numpy).

        The epsilon in the denominator makes zero vectors yield 0.0.
        """
        dot = sum(x * y for x, y in zip(a, b))
        norm_a = sum(x * x for x in a) ** 0.5
        norm_b = sum(x * x for x in b) ** 0.5
        return dot / (norm_a * norm_b + 1e-8)

    def _content_hash(self, memory: Memory) -> str:
        """SHA-256 over an unambiguous serialisation of the memory.

        Fields are tagged by role and joined with control-character
        separators, so text moving between the name and a role (or
        between two roles) changes the digest. A plain space-joined
        string would let distinct memories collide, and the exact pass
        discards the loser outright.
        """
        parts = [str(memory.name or "")]
        for role in self._ROLES:
            parts.append("\x1e" + role)
            parts.extend(
                str(fv.desc or "") for fv in getattr(memory, role, None) or []
            )
        return hashlib.sha256("\x1f".join(parts).encode("utf-8")).hexdigest()

    @staticmethod
    def _memory_to_text(memory: Memory) -> str:
        """Flatten a memory into one space-joined text for embedding.

        ``None`` names or descriptions are treated as empty strings.
        """
        parts = [str(memory.name or "")]
        for role in MemoryDeduplicator._ROLES:
            for fv in getattr(memory, role, None) or []:
                parts.append(str(fv.desc or ""))
        return " ".join(parts)
@@ -0,0 +1,289 @@
1
+ """EmbeddingService -- vector embedding generation, storage and refresh.
2
+
3
+ Supports multiple embedding models, configurable dimension, batch size,
4
+ and Contextual Retrieval (Anthropic's document-context prefix injection).
5
+
6
+ Embedding strategies::
7
+
8
+ NAME_ONLY -- function name only
9
+ NAME_DOMAIN -- name + domain
10
+ FULL -- name + trigger + action + benefit
11
+ SEMANTIC -- concise semantic summary for search
12
+
13
+ Usage::
14
+
15
+ svc = EmbeddingService(model="default", storage=store, vector_store=vs)
16
+ vector = svc.embed("some text")
17
+ svc.refresh_all()
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import logging
23
+ from enum import Enum
24
+ from typing import List, Optional, TYPE_CHECKING
25
+
26
+ from memnex.models import Function, RefreshResult
27
+
28
+ if TYPE_CHECKING:
29
+ from memnex.storage.base import MemoryStore
30
+ from memnex.storage.vector import VectorStore as VectorStoreProtocol
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+ # Type alias for embedding vectors
35
+ Vector = List[float]
36
+
37
+
38
+ # ── Embedding strategies ────────────────────────────────────────────
39
+
40
+
41
class EmbeddingStrategy(Enum):
    """Selects which parts of a Function are flattened into embeddable text."""

    # Function name only.
    NAME_ONLY = "name"
    # Function name followed by its domain.
    NAME_DOMAIN = "name_domain"
    # Name, domain and every trigger / action / benefit description.
    FULL = "full"
    # Short summary: name, domain and the first two triggers and actions.
    SEMANTIC = "semantic"
48
+
49
+
50
+ # ── Embedder backends ───────────────────────────────────────────────
51
+
52
+
53
class _SentenceTransformerEmbedder:
    """Embedder backed by ``sentence_transformers.SentenceTransformer``.

    ``dimension`` reflects the loaded model's real output size when the
    model reports one; the caller-supplied value is only a fallback.
    """

    def __init__(self, model_name: str, dimension: int) -> None:
        # Imported lazily so a missing package surfaces as ImportError
        # only when this backend is actually requested.
        from sentence_transformers import SentenceTransformer  # type: ignore

        self._model = SentenceTransformer(model_name)

        # The requested dimension is not enforced by the model, so trust
        # what the model reports and warn when the two disagree.
        probe = getattr(self._model, "get_sentence_embedding_dimension", None)
        actual = probe() if callable(probe) else None
        if actual is not None and actual != dimension:
            logging.getLogger(__name__).warning(
                "Model %s produces %d-dimensional vectors, not the "
                "configured %d; using the model's dimension",
                model_name,
                actual,
                dimension,
            )
            dimension = actual
        self.dimension = dimension

    def encode(self, text: str) -> Vector:
        """Embed one text and return it as a plain list of floats."""
        return self._model.encode([text])[0].tolist()

    def encode_batch(self, texts: List[str], batch_size: int = 32) -> List[Vector]:
        """Embed *texts* in model batches of *batch_size*.

        An empty input returns ``[]`` without calling the model.
        """
        if not texts:
            return []
        embeddings = self._model.encode(texts, batch_size=batch_size)
        return [e.tolist() for e in embeddings]
68
+
69
+
70
class _SimpleTFIDFEmbedder:
    """Fallback embedder when sentence-transformers is unavailable.

    Produces a TF-IDF-inspired bag-of-words vector of fixed length
    *dimension*. Each whitespace-separated, lower-cased word gets a
    vocabulary index on first sight and lands in bucket
    ``index % dimension``; words sharing a bucket add their weights.

    The embedder is stateful: vocabulary and document frequencies grow
    with every encoded text, so vectors depend on what has been encoded
    before.
    """

    def __init__(self, dimension: int = 384) -> None:
        self.dimension = dimension
        self._vocab: dict = {}  # word -> vocabulary index
        self._idf: dict = {}  # word -> number of documents containing it
        self._doc_count: int = 0

    def encode(self, text: str) -> Vector:
        """Record *text* in the corpus statistics and return its vector.

        Text with no words yields an all-zero vector and is not counted
        as a document.
        """
        words = text.lower().split()
        if not words:
            return [0.0] * self.dimension
        self._observe(words)
        return self._vectorize(words)

    def encode_batch(self, texts: List[str], batch_size: int = 32) -> List[Vector]:
        """Embed *texts* against one shared snapshot of the statistics.

        All texts are recorded first and vectorised afterwards, so
        identical texts in one batch get identical vectors. *batch_size*
        is accepted for interface parity and is unused.
        """
        tokenised = [t.lower().split() for t in texts]
        for words in tokenised:
            if words:
                self._observe(words)
        return [
            self._vectorize(words) if words else [0.0] * self.dimension
            for words in tokenised
        ]

    def _observe(self, words: List[str]) -> None:
        """Update vocabulary and document frequencies for one document."""
        self._doc_count += 1
        # First-occurrence order keeps index assignment reproducible;
        # iterating a set would depend on the string hash seed.
        for w in dict.fromkeys(words):
            if w not in self._vocab:
                self._vocab[w] = len(self._vocab)
            self._idf[w] = self._idf.get(w, 0) + 1

    def _vectorize(self, words: List[str]) -> Vector:
        """Build the L2-normalised vector for an already-observed document."""
        import math

        tf: dict = {}
        for w in words:
            tf[w] = tf.get(w, 0) + 1

        vec = [0.0] * self.dimension
        total = len(words)
        for w, count in tf.items():
            idx = self._vocab.get(w, -1) % self.dimension
            idf = self._doc_count / (self._idf.get(w, 1) + 1)
            # Accumulate: different words may share a bucket, and
            # overwriting would discard all but the last one.
            vec[idx] += (count / total) * math.log(idf + 1)

        norm = sum(x * x for x in vec) ** 0.5
        if norm > 0:
            vec = [x / norm for x in vec]
        return vec
117
+
118
+
119
+ # ── EmbeddingService ─────────────────────────────────────────────────
120
+
121
+
122
class EmbeddingService:
    """Vector embedding service.

    Parameters
    ----------
    model:
        Embedding model name. ``"default"`` maps to
        ``all-MiniLM-L6-v2``. Falls back to the TF-IDF embedder when the
        sentence-transformers backend cannot be created.
    dimension:
        Embedding vector dimension requested from the backend.
    storage:
        Optional :class:`MemoryStore` for ``refresh`` / ``refresh_all``.
    vector_store:
        Optional :class:`VectorStore` for upsert operations.
    """

    # Short aliases accepted for ``model``; unknown names pass through.
    _MODEL_MAP = {
        "default": "all-MiniLM-L6-v2",
        "bge-m3": "BAAI/bge-m3",
        "bge-small": "BAAI/bge-small-en-v1.5",
    }

    def __init__(
        self,
        model: str = "default",
        dimension: int = 384,
        storage: Optional["MemoryStore"] = None,
        vector_store: Optional["VectorStoreProtocol"] = None,
    ) -> None:
        self.model = model
        self.dimension = dimension
        self.storage = storage
        self.vector_store = vector_store
        self._embedder = self._create_embedder(model, dimension)

    # ── Public API ──────────────────────────────────────────────────

    def embed(self, text: str) -> Vector:
        """Generate an embedding vector for a single text."""
        return self._embedder.encode(text)

    def embed_batch(
        self, texts: List[str], batch_size: int = 32
    ) -> List[Vector]:
        """Generate embedding vectors for *texts*, one per input."""
        return self._embedder.encode_batch(texts, batch_size=batch_size)

    def embed_function(
        self,
        func: Function,
        source: Optional[object] = None,
        use_contextual: bool = True,
    ) -> Vector:
        """Generate an embedding for a Function.

        When *source* is given and *use_contextual* is True, a context
        prefix naming the source origin and the function's domain is
        prepended (Contextual Retrieval). The origin is ``source.url``,
        else ``source.source_path``, else ``"unknown"``.
        """
        content = self.function_to_text(func)
        if use_contextual and source is not None:
            # Stringify the path only when it is set; str(None) would
            # otherwise produce the literal origin "None".
            source_path = getattr(source, "source_path", None)
            origin = (
                getattr(source, "url", None)
                or (str(source_path) if source_path else "")
                or "unknown"
            )
            content = (
                f"[文档: {origin} | 领域: {func.domain or '未分类'}] {content}"
            )
        return self.embed(content)

    def function_to_text(
        self,
        func: Function,
        strategy: EmbeddingStrategy = EmbeddingStrategy.FULL,
    ) -> str:
        """Convert a Function to embeddable text per *strategy*.

        Any strategy other than NAME_ONLY, NAME_DOMAIN or SEMANTIC is
        handled as FULL: name, domain, then every trigger, action and
        benefit description.
        """
        if strategy == EmbeddingStrategy.NAME_ONLY:
            return func.name

        if strategy == EmbeddingStrategy.NAME_DOMAIN:
            return f"{func.name} {func.domain or ''}"

        parts = [func.name, func.domain or ""]

        if strategy == EmbeddingStrategy.SEMANTIC:
            # Only the first two triggers / actions keep the text short.
            if func.trigger:
                parts.append(
                    f"触发: {'; '.join(fv.desc for fv in func.trigger[:2])}"
                )
            if func.action:
                parts.append(
                    f"动作: {'; '.join(fv.desc for fv in func.action[:2])}"
                )
            return " ".join(parts)

        # FULL -- ``or ()`` tolerates role lists that are None.
        for role_values in (func.trigger, func.action, func.benefit):
            parts.extend(fv.desc for fv in role_values or ())
        return " ".join(parts)

    def refresh(self, func_id: str) -> None:
        """Re-embed a single Function and upsert into the vector store.

        Logs a warning and returns when storage or the vector store is
        not configured, or when *func_id* is not found.
        """
        if self.storage is None or self.vector_store is None:
            logger.warning("Cannot refresh: storage or vector_store not configured")
            return
        func = self.storage.get(func_id)
        if func is None:
            logger.warning("Function %s not found for refresh", func_id)
            return
        vector = self.embed_function(func)
        self.vector_store.upsert(func_id, vector)

    def refresh_all(self, batch_size: int = 100) -> RefreshResult:
        """Re-embed all Functions page by page and upsert them.

        Used after model or strategy changes. *batch_size* is the page
        size requested from storage. Returns a zero result when storage
        or the vector store is not configured.
        """
        if self.storage is None or self.vector_store is None:
            logger.warning("Cannot refresh_all: storage or vector_store not configured")
            return RefreshResult(total=0, refreshed=0)

        refreshed = 0
        offset = 0

        while True:
            batch = self.storage.list_functions(
                offset=offset, limit=batch_size
            )
            if not batch:
                break

            texts = [self.function_to_text(f) for f in batch]
            vectors = self.embed_batch(texts)
            self.vector_store.upsert_batch(
                {f.id: v for f, v in zip(batch, vectors)}
            )
            refreshed += len(batch)
            # Advance by the rows actually returned, so nothing is
            # skipped if the store returns a shorter page than requested.
            offset += len(batch)

        return RefreshResult(total=refreshed, refreshed=refreshed)

    # ── Private ─────────────────────────────────────────────────────

    def _create_embedder(self, model: str, dimension: int):
        """Create the embedder backend, falling back to TF-IDF.

        A missing sentence-transformers package is logged at info level;
        any other failure to create the backend is logged as a warning.
        Either way the service stays usable with the TF-IDF embedder.
        """
        model_name = self._MODEL_MAP.get(model, model)

        try:
            return _SentenceTransformerEmbedder(model_name, dimension)
        except ImportError:
            logger.info(
                "sentence-transformers not available, "
                "falling back to TF-IDF embedder"
            )
        except Exception as exc:
            # Deliberately broad: failing to create the backend must not
            # prevent the service from starting.
            logger.warning(
                "Failed to load sentence-transformers model %s: %s. "
                "Falling back to TF-IDF embedder",
                model_name,
                exc,
            )
        return _SimpleTFIDFEmbedder(dimension=dimension)