memplex 3.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- memnex/__init__.py +31 -0
- memnex/__main__.py +6 -0
- memnex/_plugin/.claude-plugin/plugin.json +24 -0
- memnex/_plugin/.mcp.json +9 -0
- memnex/_plugin/__init__.py +0 -0
- memnex/_plugin/hooks/hooks.json +43 -0
- memnex/_plugin/scripts/hook-runner.py +166 -0
- memnex/_plugin/skills/mem-explore/SKILL.md +83 -0
- memnex/_plugin/skills/mem-manage/SKILL.md +92 -0
- memnex/_plugin/skills/mem-search/SKILL.md +85 -0
- memnex/_plugin/skills/mem-write/SKILL.md +78 -0
- memnex/adapters/__init__.py +14 -0
- memnex/adapters/claude_skill.py +169 -0
- memnex/adapters/cli.py +525 -0
- memnex/adapters/http_api.py +314 -0
- memnex/adapters/mcp_server.py +448 -0
- memnex/compaction.py +563 -0
- memnex/config.py +366 -0
- memnex/core/__init__.py +13 -0
- memnex/core/associator/__init__.py +8 -0
- memnex/core/associator/domain_classifier.py +75 -0
- memnex/core/associator/entity_aligner.py +127 -0
- memnex/core/associator/ref_linker.py +197 -0
- memnex/core/associator/term_mapper.py +77 -0
- memnex/core/dictionaries/__init__.py +50 -0
- memnex/core/engine.py +667 -0
- memnex/core/extractors/__init__.py +15 -0
- memnex/core/extractors/docx.py +97 -0
- memnex/core/extractors/image.py +233 -0
- memnex/core/extractors/markdown.py +139 -0
- memnex/core/extractors/pdf.py +133 -0
- memnex/core/extractors/vision_mapper.py +131 -0
- memnex/core/handlers/__init__.py +7 -0
- memnex/core/handlers/clipboard.py +40 -0
- memnex/core/handlers/file_handler.py +62 -0
- memnex/core/handlers/url_handler.py +132 -0
- memnex/llm/__init__.py +25 -0
- memnex/llm/enhancer.py +226 -0
- memnex/llm/fallback_chain.py +87 -0
- memnex/llm/injection_guard.py +178 -0
- memnex/llm/provider.py +130 -0
- memnex/llm/providers/__init__.py +22 -0
- memnex/llm/providers/anthropic.py +135 -0
- memnex/llm/providers/local.py +135 -0
- memnex/llm/providers/rule_based.py +68 -0
- memnex/llm/sanitizer.py +67 -0
- memnex/models/__init__.py +68 -0
- memnex/models/feedback.py +42 -0
- memnex/models/graph.py +33 -0
- memnex/models/memory.py +102 -0
- memnex/models/misc.py +185 -0
- memnex/models/paragraph.py +45 -0
- memnex/models/search.py +51 -0
- memnex/models/source.py +23 -0
- memnex/models/task.py +62 -0
- memnex/processing/__init__.py +1 -0
- memnex/processing/graph_builder.py +278 -0
- memnex/processing/merger/__init__.py +6 -0
- memnex/processing/merger/confidence_calculator.py +127 -0
- memnex/processing/merger/conflict_resolver.py +116 -0
- memnex/retrieval/__init__.py +1 -0
- memnex/retrieval/dedup.py +386 -0
- memnex/retrieval/embedding.py +289 -0
- memnex/retrieval/reranker.py +299 -0
- memnex/service.py +902 -0
- memnex/storage/__init__.py +65 -0
- memnex/storage/base.py +132 -0
- memnex/storage/changelog.py +106 -0
- memnex/storage/feedback.py +486 -0
- memnex/storage/lite/__init__.py +5 -0
- memnex/storage/lite/store.py +606 -0
- memnex/storage/vector.py +265 -0
- memnex/wiki/__init__.py +11 -0
- memnex/wiki/community.py +221 -0
- memnex/wiki/compiler.py +545 -0
- memnex/wiki/generator.py +270 -0
- memnex/wiki/search.py +282 -0
- memnex/worker.py +412 -0
- memplex-3.2.0.dist-info/METADATA +37 -0
- memplex-3.2.0.dist-info/RECORD +83 -0
- memplex-3.2.0.dist-info/WHEEL +5 -0
- memplex-3.2.0.dist-info/entry_points.txt +2 -0
- memplex-3.2.0.dist-info/top_level.txt +1 -0
memnex/wiki/generator.py
ADDED
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
"""LLMWikiGenerator -- LLM-augmented wiki page generation.
|
|
2
|
+
|
|
3
|
+
Uses the LLM to produce enriched Entity Pages, summaries, concept pages,
|
|
4
|
+
and cross-reference suggestions. All LLM calls go through
|
|
5
|
+
``LLMPromptSanitizer.build_structured_prompt`` for safe input handling.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import logging
|
|
12
|
+
from typing import List, Optional, TYPE_CHECKING
|
|
13
|
+
|
|
14
|
+
from memnex.models import (
|
|
15
|
+
FieldValue,
|
|
16
|
+
Function,
|
|
17
|
+
WikiPage,
|
|
18
|
+
)
|
|
19
|
+
from memnex.llm.sanitizer import LLMPromptSanitizer
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from memnex.llm.enhancer import LLMEnhancer
|
|
23
|
+
|
|
24
|
+
logger = logging.getLogger(__name__)

# Default maximum input length for prompt sanitization
DEFAULT_MAX_INPUT_LENGTH = 10000


class LLMWikiGenerator:
    """LLM-enhanced wiki page generator.

    Builds prompts from ``Function`` / ``WikiPage`` data with
    ``LLMPromptSanitizer.build_structured_prompt`` and sends them to the
    provider exposed as ``llm_enhancer.llm``.

    Parameters
    ----------
    llm_enhancer:
        The :class:`LLMEnhancer` instance used for LLM calls. Its ``llm``
        attribute must provide awaitable ``complete`` and ``complete_json``.
    sep:
        Separator string used to delimit sections in generated pages.
        Stored on the instance; not read by any method of this class.
    max_input_length:
        Maximum character length for sanitized inputs sent to the LLM.
    """

    # Upper bound on functions / candidate pages embedded in a single prompt,
    # so one request cannot overflow the model's context window.
    MAX_PROMPT_ITEMS = 20

    def __init__(
        self,
        llm_enhancer: LLMEnhancer,
        sep: str = "---",
        max_input_length: int = DEFAULT_MAX_INPUT_LENGTH,
    ) -> None:
        self._llm = llm_enhancer
        self.sep = sep
        self.max_input_length = max_input_length

    # ── Public API ────────────────────────────────────────────────────

    async def generate_entity_page(self, func: Function) -> str:
        """Use LLM to generate an enhanced Entity Page for a Function.

        The prompt carries the function's name, domain (``"uncategorized"``
        when empty) and the ``desc`` of every trigger / condition / action /
        benefit field value, serialized as JSON.

        Returns
        -------
        The raw text returned by ``llm.complete``.
        """
        func_data = {
            "name": func.name,
            "domain": func.domain or "uncategorized",
            "trigger": [fv.desc for fv in func.trigger],
            "condition": [fv.desc for fv in func.condition],
            "action": [fv.desc for fv in func.action],
            "benefit": [fv.desc for fv in func.benefit],
        }

        prompt = LLMPromptSanitizer.build_structured_prompt(
            instruction=(
                "为以下函数生成一份清晰的 Wiki 页面,包含:"
                "1.一句话概括 2.触发条件与前置条件 "
                "3.执行流程自然语言描述 4.关联函数(如有)"
            ),
            user_input=json.dumps(func_data, ensure_ascii=False),
            max_length=self.max_input_length,
        )
        return await self._llm.llm.complete(prompt)

    async def generate_summary(self, functions: List[Function]) -> str:
        """Use LLM to generate a domain summary from multiple Functions.

        Every function contributes its name and all action descriptions.
        No item cap is applied here; the only size limit is the
        ``max_length`` handed to the prompt sanitizer.

        Returns
        -------
        The raw text returned by ``llm.complete``.
        """
        funcs_data = [
            {"name": f.name, "action": [fv.desc for fv in f.action]}
            for f in functions
        ]

        prompt = LLMPromptSanitizer.build_structured_prompt(
            instruction=(
                "分析以下函数列表,生成简洁领域摘要,包含:"
                "1.核心职责 2.主要功能组件 3.协作关系 4.关键业务流程"
            ),
            user_input=json.dumps(funcs_data, ensure_ascii=False),
            max_length=self.max_input_length,
        )
        return await self._llm.llm.complete(prompt)

    async def generate_concept_page(
        self,
        domain: str,
        functions: List[Function],
    ) -> str:
        """Generate a concept page that aggregates Functions by domain.

        At most ``MAX_PROMPT_ITEMS`` functions are included, each reduced to
        its name and its first two trigger / action descriptions.

        Parameters
        ----------
        domain:
            The domain / topic label for this concept page.
        functions:
            Functions belonging to this domain.

        Returns
        -------
        The raw text returned by ``llm.complete``.
        """
        funcs_summary = [
            {
                "name": f.name,
                "trigger": [fv.desc for fv in f.trigger[:2]],
                "action": [fv.desc for fv in f.action[:2]],
            }
            for f in functions[: self.MAX_PROMPT_ITEMS]
        ]

        # NOTE(review): ``domain`` is interpolated into the instruction part
        # of the prompt, not the sanitized ``user_input`` -- confirm callers
        # only pass trusted labels here.
        prompt = LLMPromptSanitizer.build_structured_prompt(
            instruction=(
                f"为领域 \"{domain}\" 生成一份概念聚合页面,包含:"
                "1.领域概述 2.核心功能列表 3.功能间协作关系 4.业务流程图描述"
            ),
            user_input=json.dumps(funcs_summary, ensure_ascii=False),
            max_length=self.max_input_length,
        )
        return await self._llm.llm.complete(prompt)

    async def update_cross_references(
        self,
        pages: List[WikiPage],
    ) -> List[WikiPage]:
        """Use LLM to discover and update cross-references across pages.

        For each page the LLM is shown the page's first 500 characters plus
        up to ``MAX_PROMPT_ITEMS`` candidate pages (id + one-line summary)
        and asked for related ids. Accepted links are appended to the page
        as a ``## Cross-References (LLM)`` section of ``[[wikilinks]]``.

        Only ids that were actually offered as candidates are accepted, so
        an invented id can never become a dangling link. A page is returned
        unchanged when there are no candidates, no accepted links, or the
        LLM call fails (the failure is logged with its traceback).

        Parameters
        ----------
        pages:
            All wiki pages to analyse for cross-references.

        Returns
        -------
        A list with one entry per input page, in input order. Enriched
        pages are new ``WikiPage`` objects; untouched pages are the
        original objects. An empty input is returned as-is.
        """
        if not pages:
            return pages

        # Lightweight index shown to the LLM instead of full page bodies.
        page_summaries: list[dict] = [
            {"id": p.page_id, "summary": self._first_content_line(p.content)}
            for p in pages
        ]

        updated: List[WikiPage] = []
        for page in pages:
            try:
                candidates = [
                    s for s in page_summaries if s["id"] != page.page_id
                ][: self.MAX_PROMPT_ITEMS]
                if not candidates:
                    # Nothing to link to -- skip the LLM round trip entirely.
                    updated.append(page)
                    continue

                prompt = LLMPromptSanitizer.build_structured_prompt(
                    instruction=(
                        "分析当前 Wiki 页面,从候选页面中选择相关页面,"
                        "返回相关页面的 ID 列表和关联理由。"
                        "格式:{\"related\": [{\"id\": \"str\", \"reason\": \"str\"}]}"
                    ),
                    user_input=json.dumps(
                        {
                            "current_page": {"id": page.page_id, "summary": page.content[:500]},
                            "candidates": candidates,
                        },
                        ensure_ascii=False,
                    ),
                    output_schema={
                        "related": [{"id": "str", "reason": "str"}],
                    },
                    max_length=self.max_input_length,
                )
                result = await self._llm.llm.complete_json(prompt)

                related = result.get("related", []) if isinstance(result, dict) else []
                cross_ref_block = self._build_cross_reference_block(
                    related, {c["id"] for c in candidates},
                )
                if cross_ref_block:
                    updated.append(
                        WikiPage(
                            page_id=page.page_id,
                            content=page.content.rstrip() + cross_ref_block,
                            metadata=page.metadata,
                        )
                    )
                    continue
            except Exception:
                # Best-effort enrichment: one failing page must not abort the
                # batch, but keep the traceback so the cause is diagnosable.
                logger.warning(
                    "Cross-reference generation failed for %s, keeping original",
                    page.page_id,
                    exc_info=True,
                )

            updated.append(page)

        return updated

    async def generate_community_page(
        self,
        community_funcs: List[Function],
        community_id: int,
    ) -> dict:
        """Generate a Concept Page for a GraphRAG-detected community.

        At most ``MAX_PROMPT_ITEMS`` functions are sent to the LLM to avoid
        token overflow; each is rendered as ``"<name>: <first action>"``.

        Parameters
        ----------
        community_funcs:
            Functions belonging to this community.
        community_id:
            Numeric identifier for the community. Accepted for interface
            compatibility; it is not included in the prompt.

        Returns
        -------
        Parsed JSON dict from the LLM response.
        """
        func_summaries = [
            f"{f.name}: {', '.join(fv.desc for fv in f.action[:1])}"
            for f in community_funcs[: self.MAX_PROMPT_ITEMS]
        ]
        safe_text = LLMPromptSanitizer.sanitize(
            "\n".join(func_summaries), self.max_input_length,
        )

        prompt = LLMPromptSanitizer.build_structured_prompt(
            instruction="分析以下功能节点的聚类,识别共同主题并生成简洁社区摘要",
            user_input=safe_text,
            output_schema={
                "community_theme": "str",
                "core_functions": ["str"],
                "summary": "str",
                "key_relationships": ["str"],
            },
        )
        return await self._llm.llm.complete_json(prompt)

    # ── Private helpers ───────────────────────────────────────────────

    @staticmethod
    def _first_content_line(content: str, max_len: int = 200) -> str:
        """Return the first non-blank line that is neither a heading nor a ``---`` rule.

        The line is stripped and truncated to ``max_len`` characters;
        ``""`` is returned when no such line exists.
        """
        for line in content.splitlines():
            stripped = line.strip()
            if stripped and not stripped.startswith(("#", "---")):
                return stripped[:max_len]
        return ""

    @staticmethod
    def _build_cross_reference_block(related: object, allowed_ids: set[str]) -> str:
        """Render the LLM's ``related`` entries as a Markdown link section.

        Parameters
        ----------
        related:
            The ``"related"`` value from the LLM response. Anything that is
            not a list of dicts is ignored rather than raising.
        allowed_ids:
            Page ids that were offered to the LLM; entries with any other
            id are dropped. Ids are assumed to be strings.

        Returns
        -------
        The section text (leading newline, heading, one bullet per link),
        or ``""`` when no entry survives validation.
        """
        if not isinstance(related, list):
            return ""

        seen: set[str] = set()
        link_lines: list[str] = []
        for entry in related:
            if not isinstance(entry, dict):
                continue
            ref_id = entry.get("id")
            if not isinstance(ref_id, str) or ref_id not in allowed_ids or ref_id in seen:
                continue
            seen.add(ref_id)
            # Collapse whitespace so a multi-line reason cannot break the list.
            reason = " ".join(str(entry.get("reason") or "").split())
            link_lines.append(f"- [[{ref_id}]] -- {reason}")

        if not link_lines:
            return ""
        return "\n## Cross-References (LLM)\n" + "\n".join(link_lines) + "\n"
|
memnex/wiki/search.py
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
"""DualIndexSearch -- hybrid FTS + vector search for wiki pages.
|
|
2
|
+
|
|
3
|
+
Combines full-text keyword search with vector semantic search using
|
|
4
|
+
Reciprocal Rank Fusion (RRF) to merge results.
|
|
5
|
+
|
|
6
|
+
ID mapping convention: Wiki page filename = Function.id + ".md"
|
|
7
|
+
FTS and vector results are unified by Function.id, so RRF merging is
|
|
8
|
+
unambiguous.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import logging
|
|
14
|
+
import re
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Any, Dict, List, Optional, TYPE_CHECKING
|
|
17
|
+
|
|
18
|
+
from memnex.models import (
|
|
19
|
+
SearchResult,
|
|
20
|
+
SourceType,
|
|
21
|
+
WikiPage,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
from memnex.retrieval.embedding import EmbeddingService
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)

# Wikilink pattern: matches ``[[target]]``; group 1 captures the target text
_WIKILINK_RE = re.compile(r"\[\[([^\]]+)\]\]")


class DualIndexSearch:
    """Hybrid FTS + vector search over wiki pages.

    Both indices live in memory and are keyed by page id. Results from the
    keyword and the embedding search are merged with Reciprocal Rank Fusion.

    Parameters
    ----------
    wiki_dir:
        Root directory for wiki files.
    embedding_service:
        EmbeddingService for generating vector embeddings; only its
        ``embed(text)`` method is used here.
    """

    def __init__(
        self,
        wiki_dir: Path,
        embedding_service: EmbeddingService,
    ) -> None:
        self.wiki_dir = wiki_dir
        self._embedding = embedding_service

        # In-memory FTS index: page_id -> content (lowercased)
        self._fts_index: Dict[str, str] = {}
        # In-memory vector index: page_id -> embedding vector
        self._vector_index: Dict[str, List[float]] = {}
        # Original-case content: page_id -> content, used only for summaries
        # so result snippets are not shown lower-cased.
        self._page_content: Dict[str, str] = {}

    # ── Public API ────────────────────────────────────────────────────

    def add_page(self, page: WikiPage) -> None:
        """Add a wiki page to both FTS and vector indices.

        Re-adding an existing ``page_id`` replaces its entries. If embedding
        fails the page stays keyword-searchable, and any vector left over
        from an earlier version of the page is removed so vector search
        never ranks outdated content.

        Parameters
        ----------
        page:
            The WikiPage to index.
        """
        self._page_content[page.page_id] = page.content
        # FTS index: store lowered content for case-insensitive matching
        self._fts_index[page.page_id] = page.content.lower()

        # Vector index: embed the page content
        try:
            self._vector_index[page.page_id] = self._embedding.embed(page.content)
        except Exception:
            self._vector_index.pop(page.page_id, None)
            logger.warning(
                "Failed to embed page %s, skipping vector index",
                page.page_id,
                exc_info=True,
            )

    def search(self, query: str, top_k: int = 10) -> List[SearchResult]:
        """Execute hybrid FTS + vector search with RRF merging.

        Steps:
          1. FTS keyword search over indexed page content
          2. Vector semantic search using embedding similarity
          3. Merge results with Reciprocal Rank Fusion

        Parameters
        ----------
        query:
            Search query string.
        top_k:
            Maximum number of results to return. Values ``<= 0`` yield an
            empty list.

        Returns
        -------
        Up to ``top_k`` results ordered by descending RRF score; each
        result's ``relevance_score`` is the RRF score.
        """
        if top_k <= 0:
            # A negative slice bound would return "all but the last n".
            return []

        merged = self._reciprocal_rank_fusion(
            self._fts_search(query),
            self._vector_search(query),
        )
        return merged[:top_k]

    def rebuild_index(self) -> None:
        """Rebuild both FTS and vector indices from wiki files on disk.

        Reads all ``*.md`` files from ``wiki_dir/entities/`` (in sorted
        order) and re-indexes them, using the file stem as page id. A file
        that cannot be read or decoded as UTF-8 is logged and skipped so it
        does not abort the rebuild.
        """
        self._fts_index.clear()
        self._vector_index.clear()
        self._page_content.clear()

        entities_dir = self.wiki_dir / "entities"
        if not entities_dir.exists():
            logger.info("No entities directory at %s, index empty", entities_dir)
            return

        for md_file in sorted(entities_dir.glob("*.md")):
            try:
                content = md_file.read_text(encoding="utf-8")
            except (OSError, UnicodeDecodeError):
                logger.warning(
                    "Skipping unreadable wiki page %s", md_file, exc_info=True,
                )
                continue
            self.add_page(WikiPage(page_id=md_file.stem, content=content))

        logger.info(
            "Rebuilt index: %d pages (fts=%d, vector=%d)",
            len(self._fts_index),
            len(self._fts_index),
            len(self._vector_index),
        )

    # ── RRF merge ─────────────────────────────────────────────────────

    @staticmethod
    def _reciprocal_rank_fusion(
        fts_results: List[SearchResult],
        vector_results: List[SearchResult],
        k: int = 60,
    ) -> List[SearchResult]:
        """Merge FTS and vector results using Reciprocal Rank Fusion.

        RRF score = sum of 1/(k + rank_i) across both result sets, with
        ranks starting at 1. When a page appears in both lists, the fields
        of the vector result are used for the merged entry.

        Parameters
        ----------
        fts_results:
            Results from full-text search, best first.
        vector_results:
            Results from vector similarity search, best first.
        k:
            RRF constant (default 60). Higher k dampens the effect
            of individual rank positions.

        Returns
        -------
        New ``SearchResult`` objects sorted by descending RRF score.
        """
        item_map: Dict[str, SearchResult] = {}
        scores: Dict[str, float] = {}
        for ranked in (fts_results, vector_results):
            for rank, item in enumerate(ranked, start=1):
                item_map[item.func_id] = item
                scores[item.func_id] = scores.get(item.func_id, 0.0) + 1.0 / (k + rank)

        results: List[SearchResult] = []
        # sorted() is stable, so ties keep first-seen (FTS-first) order.
        for page_id, rrf_score in sorted(
            scores.items(), key=lambda pair: pair[1], reverse=True,
        ):
            item = item_map[page_id]
            # Create a new result with the RRF score
            results.append(SearchResult(
                func_id=item.func_id,
                name=item.name,
                domain=item.domain,
                relevance_score=rrf_score,
                summary=item.summary,
                source_type=item.source_type,
                created_at=item.created_at,
                updated_at=item.updated_at,
            ))
        return results

    # ── Private: FTS search ───────────────────────────────────────────

    def _fts_search(self, query: str) -> List[SearchResult]:
        """Keyword search over indexed page content.

        The query is lower-cased and split on whitespace. Each distinct
        term adds ``min(occurrences / 10, 1.0)`` to a page's score, where
        occurrences are substring counts in the lower-cased content. Pages
        scoring 0 are omitted.
        """
        # dict.fromkeys de-duplicates while keeping query order, so the
        # floating-point summation order is the same on every run.
        query_terms = list(dict.fromkeys(query.lower().split()))
        if not query_terms:
            return []

        results: List[SearchResult] = []
        for page_id, content in self._fts_index.items():
            score = 0.0
            for term in query_terms:
                count = content.count(term)
                if count > 0:
                    score += min(count / 10.0, 1.0)
            if score > 0:
                results.append(SearchResult(
                    func_id=page_id,
                    name=page_id,
                    domain="",
                    relevance_score=score,
                    summary=self._summary_for(page_id),
                    source_type=SourceType.WIKI,
                ))

        results.sort(key=lambda r: r.relevance_score, reverse=True)
        return results

    # ── Private: vector search ────────────────────────────────────────

    def _vector_search(self, query: str) -> List[SearchResult]:
        """Semantic search using embedding cosine similarity.

        Returns pages whose similarity to the query embedding exceeds 0.1,
        best first. Returns ``[]`` when the vector index is empty or the
        query cannot be embedded.
        """
        if not self._vector_index:
            return []

        try:
            query_vector = self._embedding.embed(query)
        except Exception:
            logger.warning(
                "Failed to embed query, skipping vector search", exc_info=True,
            )
            return []

        results: List[SearchResult] = []
        for page_id, doc_vector in self._vector_index.items():
            similarity = self._cosine_similarity(query_vector, doc_vector)
            if similarity > 0.1:  # Minimum relevance threshold
                results.append(SearchResult(
                    func_id=page_id,
                    name=page_id,
                    domain="",
                    relevance_score=float(similarity),
                    summary=self._summary_for(page_id),
                    source_type=SourceType.WIKI,
                ))

        results.sort(key=lambda r: r.relevance_score, reverse=True)
        return results

    # ── Utility helpers ───────────────────────────────────────────────

    def _summary_for(self, page_id: str) -> str:
        """Return the summary line for an indexed page.

        Prefers the original-case content; falls back to the lower-cased
        FTS text for entries that have no original-case copy.
        """
        content = self._page_content.get(page_id)
        if content is None:
            content = self._fts_index.get(page_id, "")
        return self._extract_summary(content)

    @staticmethod
    def _cosine_similarity(a: List[float], b: List[float]) -> float:
        """Compute cosine similarity between two vectors.

        Returns ``0.0`` when either vector has zero norm or when the
        vectors differ in length (comparing different dimensionalities is
        meaningless, and ``zip`` would otherwise truncate silently).
        """
        if len(a) != len(b):
            return 0.0
        dot = sum(x * y for x, y in zip(a, b))
        norm_a = sum(x * x for x in a) ** 0.5
        norm_b = sum(x * x for x in b) ** 0.5
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return dot / (norm_a * norm_b)

    @staticmethod
    def _extract_summary(content: str, max_len: int = 200) -> str:
        """Extract a brief summary from page content.

        Returns the first substantive line, stripped and truncated to
        ``max_len`` characters, or ``""`` if there is none. Skipped lines:

        * a frontmatter block -- recognised only when ``---`` is the first
          non-blank line, and running to the next ``---`` line;
        * headings (``#``) and table rows (``|``);
        * horizontal rules made only of three or more dashes.
        """
        in_frontmatter = False
        seen_non_blank = False
        for line in content.splitlines():
            stripped = line.strip()
            if not stripped:
                continue

            if in_frontmatter:
                if stripped == "---":
                    in_frontmatter = False
                continue

            if stripped == "---" and not seen_non_blank:
                # Opening fence: only valid at the very top of the page.
                in_frontmatter = True
                seen_non_blank = True
                continue
            seen_non_blank = True

            is_rule = len(stripped) >= 3 and not stripped.strip("-")
            if is_rule or stripped.startswith(("#", "|")):
                continue
            return stripped[:max_len]
        return ""
|