memplex 3.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- memnex/__init__.py +31 -0
- memnex/__main__.py +6 -0
- memnex/_plugin/.claude-plugin/plugin.json +24 -0
- memnex/_plugin/.mcp.json +9 -0
- memnex/_plugin/__init__.py +0 -0
- memnex/_plugin/hooks/hooks.json +43 -0
- memnex/_plugin/scripts/hook-runner.py +166 -0
- memnex/_plugin/skills/mem-explore/SKILL.md +83 -0
- memnex/_plugin/skills/mem-manage/SKILL.md +92 -0
- memnex/_plugin/skills/mem-search/SKILL.md +85 -0
- memnex/_plugin/skills/mem-write/SKILL.md +78 -0
- memnex/adapters/__init__.py +14 -0
- memnex/adapters/claude_skill.py +169 -0
- memnex/adapters/cli.py +525 -0
- memnex/adapters/http_api.py +314 -0
- memnex/adapters/mcp_server.py +448 -0
- memnex/compaction.py +563 -0
- memnex/config.py +366 -0
- memnex/core/__init__.py +13 -0
- memnex/core/associator/__init__.py +8 -0
- memnex/core/associator/domain_classifier.py +75 -0
- memnex/core/associator/entity_aligner.py +127 -0
- memnex/core/associator/ref_linker.py +197 -0
- memnex/core/associator/term_mapper.py +77 -0
- memnex/core/dictionaries/__init__.py +50 -0
- memnex/core/engine.py +667 -0
- memnex/core/extractors/__init__.py +15 -0
- memnex/core/extractors/docx.py +97 -0
- memnex/core/extractors/image.py +233 -0
- memnex/core/extractors/markdown.py +139 -0
- memnex/core/extractors/pdf.py +133 -0
- memnex/core/extractors/vision_mapper.py +131 -0
- memnex/core/handlers/__init__.py +7 -0
- memnex/core/handlers/clipboard.py +40 -0
- memnex/core/handlers/file_handler.py +62 -0
- memnex/core/handlers/url_handler.py +132 -0
- memnex/llm/__init__.py +25 -0
- memnex/llm/enhancer.py +226 -0
- memnex/llm/fallback_chain.py +87 -0
- memnex/llm/injection_guard.py +178 -0
- memnex/llm/provider.py +130 -0
- memnex/llm/providers/__init__.py +22 -0
- memnex/llm/providers/anthropic.py +135 -0
- memnex/llm/providers/local.py +135 -0
- memnex/llm/providers/rule_based.py +68 -0
- memnex/llm/sanitizer.py +67 -0
- memnex/models/__init__.py +68 -0
- memnex/models/feedback.py +42 -0
- memnex/models/graph.py +33 -0
- memnex/models/memory.py +102 -0
- memnex/models/misc.py +185 -0
- memnex/models/paragraph.py +45 -0
- memnex/models/search.py +51 -0
- memnex/models/source.py +23 -0
- memnex/models/task.py +62 -0
- memnex/processing/__init__.py +1 -0
- memnex/processing/graph_builder.py +278 -0
- memnex/processing/merger/__init__.py +6 -0
- memnex/processing/merger/confidence_calculator.py +127 -0
- memnex/processing/merger/conflict_resolver.py +116 -0
- memnex/retrieval/__init__.py +1 -0
- memnex/retrieval/dedup.py +386 -0
- memnex/retrieval/embedding.py +289 -0
- memnex/retrieval/reranker.py +299 -0
- memnex/service.py +902 -0
- memnex/storage/__init__.py +65 -0
- memnex/storage/base.py +132 -0
- memnex/storage/changelog.py +106 -0
- memnex/storage/feedback.py +486 -0
- memnex/storage/lite/__init__.py +5 -0
- memnex/storage/lite/store.py +606 -0
- memnex/storage/vector.py +265 -0
- memnex/wiki/__init__.py +11 -0
- memnex/wiki/community.py +221 -0
- memnex/wiki/compiler.py +545 -0
- memnex/wiki/generator.py +270 -0
- memnex/wiki/search.py +282 -0
- memnex/worker.py +412 -0
- memplex-3.2.0.dist-info/METADATA +37 -0
- memplex-3.2.0.dist-info/RECORD +83 -0
- memplex-3.2.0.dist-info/WHEEL +5 -0
- memplex-3.2.0.dist-info/entry_points.txt +2 -0
- memplex-3.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
"""Reranker -- multi-dimensional result re-ranking + optional CrossEncoder.
|
|
2
|
+
|
|
3
|
+
Two-stage retrieval architecture::
|
|
4
|
+
|
|
5
|
+
Stage 1 (bi-encoder, fast):
|
|
6
|
+
Reranker scores candidates across 5 dimensions and returns top-K.
|
|
7
|
+
|
|
8
|
+
Stage 2 (cross-encoder, precise, optional):
|
|
9
|
+
CrossEncoderReranker re-scores the top-K with a jointly-encoded model
|
|
10
|
+
for significantly higher accuracy on ambiguous queries.
|
|
11
|
+
|
|
12
|
+
Usage::
|
|
13
|
+
|
|
14
|
+
reranker = Reranker(embedding_service)
|
|
15
|
+
ranked = reranker.rerank("query text", search_results, top_k=10)
|
|
16
|
+
|
|
17
|
+
# Optional stage 2
|
|
18
|
+
cross = CrossEncoderReranker(enabled=True)
|
|
19
|
+
ranked = cross.rerank("query text", ranked)
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import logging
|
|
25
|
+
import math
|
|
26
|
+
from datetime import datetime
|
|
27
|
+
from typing import Dict, List, Optional, TYPE_CHECKING
|
|
28
|
+
|
|
29
|
+
from memnex.retrieval.embedding import EmbeddingService, Vector
|
|
30
|
+
from memnex.models import SearchResult, SourceType
|
|
31
|
+
|
|
32
|
+
if TYPE_CHECKING:
|
|
33
|
+
from memnex.storage.base import MemoryStore
|
|
34
|
+
from memnex.models import Function
|
|
35
|
+
|
|
36
|
+
logger = logging.getLogger(__name__)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# ── Helper ────────────────────────────────────────────────────────────
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def cosine_similarity(a: Vector, b: Vector) -> float:
    """Compute the cosine similarity between two vectors.

    Parameters
    ----------
    a, b:
        Sequences of numbers.  When the lengths differ, ``zip`` stops at the
        shorter vector, so trailing components of the longer one do not
        contribute to the dot product (they still count towards its norm).

    Returns
    -------
    float
        ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has zero
        magnitude (this includes empty vectors).
    """
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = sum(x * x for x in a) ** 0.5
    norm_b = sum(x * x for x in b) ** 0.5
    denominator = norm_a * norm_b
    # Guard the zero case explicitly.  An additive epsilon in the denominator
    # would bias every result and badly distort small-magnitude vectors
    # (two parallel vectors of norm 1e-6 would score ~1e-4 instead of 1).
    if denominator == 0.0:
        return 0.0
    return dot / denominator
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# ── 5-dimensional Reranker ────────────────────────────────────────────
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class Reranker:
    """Multi-path retrieval result re-ranker.

    Scoring dimensions and default weights::

        raw_relevance        0.25  -- original score from each retrieval path
        semantic_similarity  0.30  -- cosine(query_vec, result_vec)
        recency_decay        0.15  -- exp(-days / 60): ~0.61 at 30 days, 0.5 at ~42 days
        source_authority     0.15  -- requirement > meeting > code > wiki
        frequency            0.15  -- log-scaled access count blended with last-access recency

    Parameters
    ----------
    embedding_service:
        Provides ``embed()`` for computing semantic similarity.
    weights:
        Optional custom dimension weights (should sum to ~1.0).  The mapping
        is merged over the defaults, so a partial mapping only overrides the
        dimensions it names.
    storage:
        Optional :class:`MemoryStore` for reading *access_count*.
    """

    _SOURCE_WEIGHTS: Dict[SourceType, float] = {
        SourceType.REQUIREMENT: 1.0,
        SourceType.MEETING: 0.8,
        SourceType.CODE: 0.6,
        SourceType.WIKI: 0.4,
    }

    _DEFAULT_WEIGHTS: Dict[str, float] = {
        "raw_relevance": 0.25,
        "semantic_similarity": 0.30,
        "recency_decay": 0.15,
        "source_authority": 0.15,
        "frequency": 0.15,
    }

    def __init__(
        self,
        embedding_service: EmbeddingService,
        weights: Optional[Dict[str, float]] = None,
        storage: Optional["MemoryStore"] = None,
    ) -> None:
        self.embedder = embedding_service
        self.storage = storage
        # Merge instead of replace: a custom mapping that omits a dimension
        # must not raise KeyError later inside rerank().
        self.weights = {**self._DEFAULT_WEIGHTS, **(weights or {})}

    # ── Public API ──────────────────────────────────────────────────

    def rerank(
        self,
        query: str,
        results: List[SearchResult],
        top_k: int = 10,
        query_vector: Optional[Vector] = None,
    ) -> List[SearchResult]:
        """Re-rank *results* using the 5-dimensional scoring model.

        Parameters
        ----------
        query:
            Original query text.
        results:
            Candidate results from multi-path retrieval.
        top_k:
            Maximum number of results to return.
        query_vector:
            Pre-computed query embedding (avoids re-embedding).

        Returns
        -------
        list
            At most *top_k* results ordered by descending weighted score.
            The input list and its items are not modified.
        """
        if not results:
            return []

        if query_vector is None:
            query_vector = self.embedder.embed(query)

        scored: list[tuple[float, SearchResult]] = [
            (self._score(r, query_vector), r) for r in results
        ]
        # list.sort is stable, so equally scored results keep their input order.
        scored.sort(key=lambda pair: pair[0], reverse=True)
        return [r for _, r in scored[:top_k]]

    # ── Scoring helpers ─────────────────────────────────────────────

    def _score(self, result: SearchResult, query_vector: Vector) -> float:
        """Return the weighted sum of the five dimension scores for *result*."""
        # 1. Raw relevance from the retrieval path
        raw_score = result.relevance_score

        # 2. Semantic similarity (reuse vector_cache when available)
        if result.vector_cache is not None:
            result_vector = result.vector_cache
        else:
            result_vector = self.embedder.embed(result.summary)
        semantic_score = cosine_similarity(query_vector, result_vector)

        # 3. Recency decay
        recency_score = self._recency_decay(result.updated_at)

        # 4. Source authority
        source_weight = self._source_weight(result.source_type)

        # 5. Frequency (access count blended with recency of last access);
        #    neutral 0.5 when no stored record is available.
        func = self._load_function(result)
        frequency_score = self._frequency_score(func) if func else 0.5

        weights = self.weights
        return (
            raw_score * weights["raw_relevance"]
            + semantic_score * weights["semantic_similarity"]
            + recency_score * weights["recency_decay"]
            + source_weight * weights["source_authority"]
            + frequency_score * weights["frequency"]
        )

    def _load_function(self, result: SearchResult) -> Optional["Function"]:
        """Fetch the stored record behind *result*, or ``None`` if unavailable."""
        if self.storage is None:
            return None
        try:
            return self.storage.get(result.func_id)
        except Exception:
            # Frequency is only one of five signals; a storage hiccup must not
            # abort ranking, but it should at least be visible when debugging.
            logger.debug("Reranker: storage lookup failed", exc_info=True)
            return None

    @staticmethod
    def _time_decay(timestamp: Optional[datetime], default: float) -> float:
        """Exponential decay ``min(1.0, exp(-days / 60))`` since *timestamp*.

        Accepts ``datetime`` objects (naive or timezone-aware) and ISO-8601
        strings.  Returns *default* when the timestamp is missing, cannot be
        parsed, or is of an unsupported type.  Timestamps in the future
        count as zero days old.
        """
        if timestamp is None:
            return default
        if isinstance(timestamp, str):
            text = timestamp
            # datetime.fromisoformat() only accepts a trailing "Z" from
            # Python 3.11 on; normalise it so older interpreters agree.
            if text.endswith("Z"):
                text = text[:-1] + "+00:00"
            try:
                timestamp = datetime.fromisoformat(text)
            except (ValueError, TypeError):
                return default
        if not isinstance(timestamp, datetime):
            return default
        # Build "now" with the timestamp's own tzinfo: subtracting a naive
        # datetime from an aware one (or vice versa) raises TypeError.
        now = datetime.now(tz=timestamp.tzinfo)
        days_since = max(0, (now - timestamp).days)
        return min(1.0, math.exp(-days_since / 60))

    # ── Dimension scorers ───────────────────────────────────────────

    @staticmethod
    def _recency_decay(updated_at: Optional[datetime]) -> float:
        """Exponential time decay, range (0, 1].

        Uses the same formula as the design spec:
        ``min(1.0, exp(-days / 60))`` -- about 0.61 at 30 days and 0.5 at
        roughly 42 days.  Returns a neutral 0.5 when *updated_at* is missing
        or unparseable.
        """
        return Reranker._time_decay(updated_at, default=0.5)

    def _source_weight(self, source_type: SourceType) -> float:
        """Authority weight by source type.

        requirement=1.0 > meeting=0.8 > code=0.6 > wiki=0.4; any other
        source type gets 0.5.
        """
        return self._SOURCE_WEIGHTS.get(source_type, 0.5)

    @staticmethod
    def _frequency_score(func: "Function") -> float:
        """Access-frequency score combining count and recency.

        ``freq = log(1+count) / log(1+100)`` clamped to [0, 1], so counts
        above 100 saturate at 1.0.  Combined with a last-access recency
        factor (0.3 when the last access time is unknown):
        60% freq + 40% recency.
        """
        # A missing/None count is treated as 0; negative counts would be a
        # math domain error in log1p, so they are clamped as well.
        access_count = max(0, getattr(func, "access_count", 0) or 0)
        freq = min(1.0, math.log1p(access_count) / math.log1p(100))

        recency = Reranker._time_decay(
            getattr(func, "last_accessed_at", None), default=0.3
        )
        return freq * 0.6 + recency * 0.4
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
# ── CrossEncoderReranker (stage 2) ────────────────────────────────────
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
class CrossEncoderReranker:
    """Cross-encoder precision re-ranker (stage 2 of two-stage retrieval).

    Uses a jointly-encoded model (e.g. BGE-reranker-v2-m3) for significantly
    higher accuracy than bi-encoder cosine similarity.  Only runs on the
    top-K candidates from the bi-encoder stage, so latency impact is minimal.

    The model is **lazily loaded** on first use to avoid blocking startup.

    Parameters
    ----------
    model_name:
        HuggingFace model identifier.
    enabled:
        Master switch.  When *False*, :meth:`rerank` returns input unchanged.
        It is switched off automatically if the model cannot be loaded.
    """

    def __init__(
        self,
        model_name: str = "BAAI/bge-reranker-v2-m3",
        enabled: bool = False,
    ) -> None:
        self.model_name = model_name
        self.enabled = enabled
        self._model = None  # lazy-loaded

    def _load_model(self) -> None:
        """Load the cross-encoder model on first call.

        On any failure the re-ranker disables itself (``enabled = False``)
        and logs a warning instead of raising.
        """
        if self._model is not None:
            return
        try:
            from sentence_transformers import CrossEncoder  # type: ignore

            self._model = CrossEncoder(self.model_name)
            logger.info("CrossEncoder loaded: %s", self.model_name)
        except ImportError:
            logger.warning(
                "CrossEncoder unavailable (pip install sentence-transformers); "
                "skipping precision re-ranking"
            )
            self.enabled = False
        except Exception as exc:
            logger.warning("Failed to load CrossEncoder %s: %s", self.model_name, exc)
            self.enabled = False

    # ── Public API ──────────────────────────────────────────────────

    def rerank(
        self, query: str, results: List[SearchResult]
    ) -> List[SearchResult]:
        """Re-score *results* with the cross-encoder.

        Each result's ``relevance_score`` is overwritten with the model score
        and *results* is sorted in place (descending); the same list object
        is returned.  When the re-ranker is disabled, the model is
        unavailable, or scoring fails, the input is returned unchanged.
        """
        if not self.enabled or not results:
            return results

        self._load_model()
        if self._model is None:
            return results

        # A missing summary is scored as empty text rather than handing
        # ``None`` to the model's tokenizer.
        pairs = [(query, r.summary or "") for r in results]
        try:
            # Convert every score up front so a failure cannot leave the
            # results half re-scored.
            scores = [float(score) for score in self._model.predict(pairs)]
        except Exception as exc:
            # Stage 2 is optional: keep the stage-1 ordering instead of
            # failing the whole search.
            logger.warning("CrossEncoder scoring failed; keeping input order: %s", exc)
            return results
        if len(scores) != len(results):
            logger.warning(
                "CrossEncoder returned %d scores for %d results; keeping input order",
                len(scores),
                len(results),
            )
            return results

        for r, score in zip(results, scores):
            r.relevance_score = score

        results.sort(key=lambda item: item.relevance_score, reverse=True)
        return results
|