union-app-chat-stream 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.gitignore +16 -0
- package/PROJECT_OVERVIEW.md +187 -0
- package/app/.env +63 -0
- package/app/.env.dev +63 -0
- package/app/.env.prod.bj11 +63 -0
- package/app/.env.prod.sh20 +63 -0
- package/app/.env.prod.sz31 +63 -0
- package/app/.env.test.bj12 +63 -0
- package/app/__init__.py +42 -0
- package/app/__pycache__/__init__.cpython-312.pyc +0 -0
- package/app/__pycache__/authenticated_user.cpython-312.pyc +0 -0
- package/app/__pycache__/extensions.cpython-312.pyc +0 -0
- package/app/__pycache__/wsgi.cpython-312.pyc +0 -0
- package/app/authenticated_user.py +77 -0
- package/app/config/__pycache__/config_loader.cpython-312.pyc +0 -0
- package/app/config/__pycache__/env_config.cpython-312.pyc +0 -0
- package/app/config/__pycache__/logger_config.cpython-312.pyc +0 -0
- package/app/config/env_config.py +96 -0
- package/app/config/logger_config.py +46 -0
- package/app/manager/__init__.py +4 -0
- package/app/manager/__pycache__/__init__.cpython-312.pyc +0 -0
- package/app/manager/__pycache__/chatstream_manager.cpython-312.pyc +0 -0
- package/app/manager/__pycache__/prompts.cpython-312.pyc +0 -0
- package/app/manager/__pycache__/runtime_manager.cpython-312.pyc +0 -0
- package/app/manager/__pycache__/toolcall_manager.cpython-312.pyc +0 -0
- package/app/manager/chatstream_manager.py +90 -0
- package/app/manager/prompts.py +62 -0
- package/app/manager/runtime_manager.py +552 -0
- package/app/models/__pycache__/schemas.cpython-312.pyc +0 -0
- package/app/models/schemas.py +30 -0
- package/app/service/__init__.py +4 -0
- package/app/service/__pycache__/__init__.cpython-312.pyc +0 -0
- package/app/service/__pycache__/chat_service.cpython-312.pyc +0 -0
- package/app/service/__pycache__/llm_service.cpython-312.pyc +0 -0
- package/app/service/__pycache__/rag_service.cpython-312.pyc +0 -0
- package/app/service/__pycache__/tool_call_service.cpython-312.pyc +0 -0
- package/app/service/__pycache__/union_service.cpython-312.pyc +0 -0
- package/app/service/chat_service.py +228 -0
- package/app/service/llm_service.py +214 -0
- package/app/service/rag_service.py +866 -0
- package/app/service/union_service.py +201 -0
- package/app/utils/__init__.py +5 -0
- package/app/utils/__pycache__/__init__.cpython-312.pyc +0 -0
- package/app/utils/__pycache__/common_utils.cpython-312.pyc +0 -0
- package/app/utils/__pycache__/debug_context.cpython-312.pyc +0 -0
- package/app/utils/__pycache__/function_utils.cpython-312.pyc +0 -0
- package/app/utils/__pycache__/jwt_utils.cpython-312.pyc +0 -0
- package/app/utils/common_utils.py +169 -0
- package/app/utils/debug_context.py +16 -0
- package/app/utils/function_utils.py +274 -0
- package/app/utils/jwt_utils.py +39 -0
- package/app/views/__init__.py +6 -0
- package/app/views/__pycache__/__init__.cpython-312.pyc +0 -0
- package/app/views/__pycache__/view_chatstream.cpython-312.pyc +0 -0
- package/app/views/__pycache__/view_healthcheck.cpython-312.pyc +0 -0
- package/app/views/__pycache__/view_runtime.cpython-312.pyc +0 -0
- package/app/views/view_chatstream.py +53 -0
- package/app/views/view_healthcheck.py +14 -0
- package/app/views/view_runtime.py +72 -0
- package/app/wsgi.py +37 -0
- package/ci.yml +14 -0
- package/deploy/autoconf/templates/env.j2 +25 -0
- package/deploy/autoconf.yml +15 -0
- package/deploy/scripts/healthcheck.sh +0 -0
- package/deploy/scripts/requirements.txt +53 -0
- package/deploy/scripts/start.sh +75 -0
- package/deploy/scripts/stop.sh +31 -0
- package/knowledge/.gitkeep +0 -0
- package/knowledge/000001-biz-offline-85b99bd43b-v1.md +88 -0
- package/knowledge/000002-biz-offline-717e8d823e-v1.md +90 -0
- package/knowledge/000003-biz-offline-c963227cc8-v1.md +84 -0
- package/knowledge/000004-biz-offline-2a5868e7da-v1.md +92 -0
- package/knowledge/000005-biz-offline-f9d9cf1a88-v1.md +79 -0
- package/knowledge/000006-biz-offline-c4fa2df3bd-v1.md +77 -0
- package/knowledge/000007-biz-offline-78304b70ca-v1.md +76 -0
- package/knowledge/000008-biz-offline-987ae67b35-v1.md +75 -0
- package/knowledge/000009-biz-offline-4d656bcea3-v1.md +85 -0
- package/knowledge/000010-sop-offline-a9e1050719-v1.md +100 -0
- package/knowledge/000011-biz-offline-5de0624891-v1.md +86 -0
- package/knowledge/000012-biz-offline-7dfacccba3-v1.md +82 -0
- package/knowledge/000013-biz-offline-5e1d29d2ed-v1.md +81 -0
- package/knowledge/000014-biz-offline-1d0ed8b841-v1.md +68 -0
- package/knowledge/000015-biz-offline-8a1376ee3e-v1.md +78 -0
- package/knowledge/000016-biz-offline-c8bfc2aa08-v1.md +99 -0
- package/knowledge/000017-biz-offline-9dffb28032-v1.md +88 -0
- package/knowledge/000018-biz-offline-f935bc9a6a-v1.md +80 -0
- package/knowledge/000019-biz-offline-858b3ecd89-v1.md +86 -0
- package/knowledge/000020-biz-offline-65cb5c4f40-v1.md +113 -0
- package/knowledge/000021-biz-offline-1bf211639c-v1.md +148 -0
- package/knowledge/000022-biz-offline-8c5a637879-v1.md +140 -0
- package/knowledge/000023-biz-offline-fe872b8712-v1.md +188 -0
- package/knowledge/000024-biz-offline-a85010c500-v1.md +133 -0
- package/knowledge/000025-biz-offline-8af58a3638-v1.md +136 -0
- package/knowledge/000026-biz-offline-6754102e93-v1.md +142 -0
- package/knowledge/000027-biz-offline-ea2e5ca5f9-v1.md +150 -0
- package/knowledge/000028-scenario-offline-dab45cebb4-v1.md +136 -0
- package/knowledge/000029-scenario-offline-5b8ae5ea9f-v1.md +143 -0
- package/knowledge/000030-scenario-offline-9a82d42f3f-v1.md +136 -0
- package/knowledge/000031-scenario-offline-cc2edc0197-v1.md +122 -0
- package/knowledge/000032-scenario-offline-e5f6e5cbfa-v1.md +122 -0
- package/knowledge/000033-scenario-offline-e1955849aa-v1.md +135 -0
- package/knowledge/000034-scenario-offline-3a13d49a3a-v1.md +138 -0
- package/knowledge/000035-scenario-offline-fd5560211f-v1.md +147 -0
- package/knowledge/000036-scenario-offline-function-call-mock-v1.md +134 -0
- package/package.json +18 -0
- package/requirements.txt +53 -0
- package/tools/prompts.yaml +10 -0
- package/tools/tool_definitions.yaml +303 -0
|
@@ -0,0 +1,866 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import json
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
try:
|
|
6
|
+
import pysqlite3
|
|
7
|
+
import sys
|
|
8
|
+
sys.modules["sqlite3"] = pysqlite3
|
|
9
|
+
except ImportError:
|
|
10
|
+
pass
|
|
11
|
+
|
|
12
|
+
import sqlite3
|
|
13
|
+
from datetime import datetime
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
16
|
+
|
|
17
|
+
import yaml
|
|
18
|
+
|
|
19
|
+
from zai import ZhipuAiClient
|
|
20
|
+
|
|
21
|
+
from loguru import logger
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _preview(text: str, limit: int = 300) -> str:
|
|
25
|
+
return str(text).replace("\n", " ")[:limit]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class RagService:
|
|
29
|
+
"""轻量 RAG 服务:加载 Markdown 知识库,写入 Chroma,并按问题检索。"""
|
|
30
|
+
|
|
31
|
+
_TOKEN_SPLIT_RE = re.compile(r"[\s,,、;;//||()()《》【】\\-]+")
|
|
32
|
+
_CODE_PATTERNS = [
|
|
33
|
+
re.compile(r"(?<![A-Za-z0-9])[a-zA-Z]+[0-9]+(?:\.[0-9A-Za-z]+)+"),
|
|
34
|
+
re.compile(r"(?<![A-Za-z0-9])[a-zA-Z]+[0-9]{2,}"),
|
|
35
|
+
re.compile(r"(?<![A-Za-z0-9])[0-9]{6}-[a-zA-Z0-9-]+-v[0-9]+"),
|
|
36
|
+
re.compile(r"(?<![A-Za-z0-9])[a-zA-Z]+-offline-[a-fA-F0-9]{8,}-v[0-9]+"),
|
|
37
|
+
re.compile(r"(?<![A-Za-z0-9])[A-Z]\.[0-9]+(?:\.[0-9]+)?"),
|
|
38
|
+
re.compile(r"(?<![A-Za-z0-9])[a-z][A-Za-z0-9]*[A-Z][A-Za-z0-9]*"),
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
def __init__(self, config):
    """Read the RAG settings from ``config`` and open the collection if enabled.

    ``config`` is subscripted by key, so a missing key raises whatever the
    mapping raises. Every numeric setting goes through ``_positive_int`` and
    therefore raises ``ValueError`` when it is below 1.
    """
    self._config = config
    self._enabled = config["RAG_ENABLED"]
    self._top_k = self._positive_int(config["RAG_TOP_K"])
    self._collection = None

    # Storage locations are resolved against the third ancestor directory of
    # this source file.
    project_root = Path(__file__).resolve().parents[2]
    self._root = project_root
    self._persist_dir = project_root / config["RAG_PERSIST_DIR"]
    self._knowledge_dir = project_root / config["RAG_KNOWLEDGE_DIR"]

    self._collection_name = config["RAG_COLLECTION"]
    self._rebuild_on_startup = config["RAG_REBUILD_ON_STARTUP"]
    self._embedding_model = config["RAG_EMBEDDING_MODEL"]

    # Validated in the listed order so the first invalid setting is the one
    # that gets reported.
    numeric_settings = (
        ("_embedding_max_chars", "RAG_EMBEDDING_MAX_CHARS"),
        ("_embedding_batch_size", "RAG_EMBEDDING_BATCH_SIZE"),
        ("_semantic_candidate_k", "RAG_SEMANTIC_CANDIDATE_K"),
        ("_context_k", "RAG_CONTEXT_K"),
        ("_exact_context_k", "RAG_EXACT_CONTEXT_K"),
        ("_exact_per_file_context_k", "RAG_EXACT_PER_FILE_CONTEXT_K"),
        ("_per_file_context_k", "RAG_PER_FILE_CONTEXT_K"),
        ("_chunk_size", "RAG_CHUNK_SIZE"),
    )
    for attribute, key in numeric_settings:
        setattr(self, attribute, self._positive_int(config[key]))

    self._chroma = None
    self._client = ZhipuAiClient(
        api_key=config["LLM_KEY"],
        base_url=config["LLM_URL"],
    )

    # The collection is only opened eagerly when RAG is switched on.
    if self._enabled:
        self._init_collection()
|
|
67
|
+
|
|
68
|
+
def search(self, question: str, top_k: Optional[int] = None) -> Tuple[str, List[Dict]]:
    """Retrieve evidence for ``question`` and return ``(context, sources)``.

    Exact-term hits, their neighbouring chunks and their related items are
    computed first; a vector query against the collection then supplies the
    semantic candidates. All hits are merged and ranked, formatted into a
    context string, and one source entry is produced per ranked hit.

    Returns ``("", [])`` when the collection is unavailable, or when it is
    still empty after a rebuild attempt. Any exception raised while counting,
    rebuilding or querying is logged and treated as "no semantic candidates";
    the remaining hits are still ranked.
    """
    if not self._ensure_collection():
        logger.info(f"RAG 未启用或集合不可用,跳过检索。question={_preview(question, 120)}")
        return "", []

    exact_terms = self._extract_exact_terms(question)
    exact_hits = self._exact_match_documents(question, exact_terms)
    neighbor_hits = self._expand_neighbors(exact_hits)
    related_hits = self._expand_related_items(exact_hits)

    try:
        if self._collection.count() == 0:
            logger.info("RAG 集合为空,开始重建知识库。")
            self.rebuild()
            if self._collection.count() == 0:
                logger.info(f"RAG 重建后仍无可用文档。question={_preview(question, 120)}")
                return "", []
        response = self._collection.query(
            query_embeddings=[self._embed(question)],
            n_results=self._semantic_candidate_count(),
            include=["documents", "metadatas", "distances"],
        )
    except Exception:
        logger.exception(f"RAG 检索异常,已降级为空上下文。question={_preview(question, 120)}")
        response = {}

    def first_row(field):
        # Only one query embedding is sent, so only the first row is read.
        return response.get(field, [[]])[0]

    semantic_hits = self._semantic_hits(
        question,
        first_row("documents"),
        first_row("metadatas"),
        first_row("distances"),
        first_row("ids"),
    )
    ranked = self._merge_and_rank(
        question, exact_terms, exact_hits, neighbor_hits, related_hits, semantic_hits, top_k
    )
    ranked_metadata = [hit["metadata"] for hit in ranked]
    sources = [self._source(meta) for meta in ranked_metadata]
    context = self._format_evidence_context(ranked)
    logger.info(f"RAG 检索完成。question={_preview(question, 120)} hit_count={len(ranked)} exact_terms={exact_terms} sources={sources} context_preview={_preview(context)}")
    return context, sources
|
|
103
|
+
|
|
104
|
+
def knowledge_search(self, query: str, top_k: Optional[Any] = None) -> Tuple[Dict[str, Any], str]:
    """Run a knowledge-base search and return ``(payload, status)``.

    ``query`` is coerced to a stripped string. A blank query short-circuits
    with an empty payload and the status ``"query_empty"``; otherwise the
    payload carries the query, the context, the sources and the tool names
    extracted from those sources, with the status ``"success"``.
    """
    cleaned_query = str(query or "").strip()
    if cleaned_query:
        limit = self._parse_optional_top_k(top_k)
        context, sources = self.search(cleaned_query, top_k=limit)
        payload = {
            "query": cleaned_query,
            "context": context,
            "sources": sources,
            "related_tools": self._extract_related_tools(sources),
        }
        return payload, "success"

    return {"context": "", "sources": [], "related_tools": []}, "query_empty"
|
|
116
|
+
|
|
117
|
+
def _ensure_collection(self) -> bool:
|
|
118
|
+
if self._collection:
|
|
119
|
+
return True
|
|
120
|
+
if not self._enabled:
|
|
121
|
+
return False
|
|
122
|
+
self._init_collection()
|
|
123
|
+
return self._collection is not None
|
|
124
|
+
|
|
125
|
+
@staticmethod
|
|
126
|
+
def _positive_int(value) -> int:
|
|
127
|
+
parsed = int(value)
|
|
128
|
+
if parsed < 1:
|
|
129
|
+
raise ValueError(f"配置值必须为正整数: {value}")
|
|
130
|
+
return parsed
|
|
131
|
+
|
|
132
|
+
def _parse_optional_top_k(self, value: Optional[Any]) -> Optional[int]:
|
|
133
|
+
if value in (None, ""):
|
|
134
|
+
return None
|
|
135
|
+
parsed = self._positive_int(value)
|
|
136
|
+
return min(parsed, self._context_limit())
|
|
137
|
+
|
|
138
|
+
@staticmethod
|
|
139
|
+
def _extract_related_tools(sources: List[Dict]) -> List[Dict[str, str]]:
|
|
140
|
+
related_tools: Dict[str, Dict[str, str]] = {}
|
|
141
|
+
for source in sources:
|
|
142
|
+
raw = source.get("related_items", "")
|
|
143
|
+
if not raw:
|
|
144
|
+
continue
|
|
145
|
+
text = raw if isinstance(raw, str) else json.dumps(raw, ensure_ascii=False)
|
|
146
|
+
for name in sorted(set(re.findall(r"\b[a-zA-Z][a-zA-Z0-9_]{2,}\b", text))):
|
|
147
|
+
if "_" not in name:
|
|
148
|
+
continue
|
|
149
|
+
related_tools.setdefault(name, {
|
|
150
|
+
"name": name,
|
|
151
|
+
"reason": "知识库关联能力提示;是否可执行以当前 tool definition 为准。",
|
|
152
|
+
})
|
|
153
|
+
return list(related_tools.values())
|
|
154
|
+
|
|
155
|
+
def _init_collection(self):
    """Open (or create) the persistent Chroma collection.

    When ``chromadb`` cannot be imported a warning is logged and the method
    returns without touching the current state. When rebuild-on-startup is
    set and the rebuild raises, the failure is logged and the collection
    handle is reset to ``None``.
    """
    try:
        import chromadb
    except ImportError:
        logger.warning("未安装 chromadb,RAG 检索不可用。")
        return

    self._chroma = chromadb.PersistentClient(path=str(self._persist_dir))
    self._collection = self._chroma.get_or_create_collection(self._collection_name)

    if not self._rebuild_on_startup:
        return
    try:
        self.rebuild()
    except Exception:
        logger.exception("RAG 初始化重建失败,已关闭当前集合。")
        self._collection = None
|
|
170
|
+
|
|
171
|
+
def rebuild(self) -> int:
    """Embed every knowledge chunk and upsert it into the collection.

    Chunks are embedded in batches of ``_embedding_batch_size``. A failing
    batch is logged with details of its first chunk and the exception is
    re-raised.

    Returns:
        The number of chunks written, or 0 when no documents were loaded.

    Raises:
        RuntimeError: when the number of embeddings differs from the number
            of chunks.
    """
    docs = self._load_documents()
    if not docs:
        logger.info("RAG 未加载到知识库文档,跳过重建。")
        return 0

    total = len(docs)
    step = self._embedding_batch_size
    vectors = []
    logger.info(f"RAG 开始生成文档向量。doc_chunks={total}")

    for offset in range(0, total, step):
        batch = docs[offset:offset + step]
        upto = offset + len(batch)
        try:
            head_meta = batch[0]["metadata"]
            logger.info(
                f"RAG 批量生成文档向量中。progress={offset + 1}-{upto}/{total} "
                f"batch_size={len(batch)} "
                f"first_file={head_meta.get('file_path', '')} "
                f"first_chunk_index={head_meta.get('chunk_index', '')} "
                f"first_title={head_meta.get('title', '')}"
            )
            texts = [entry["content"] for entry in batch]
            vectors.extend(self._embed_batch(texts))
            logger.info(
                f"RAG 批量生成文档向量完成。progress={upto}/{total} "
                f"batch_size={len(batch)}"
            )
        except Exception:
            failed_head = batch[0]
            failed_meta = failed_head["metadata"]
            logger.exception(
                f"RAG 批量文档向量生成失败。progress={offset + 1}-{upto}/{total} "
                f"first_file={failed_meta.get('file_path', '')} "
                f"first_chunk_index={failed_meta.get('chunk_index', '')} "
                f"first_title={failed_meta.get('title', '')} "
                f"first_content_preview={_preview(failed_head['content'], 120)}"
            )
            raise

    if len(vectors) != total:
        raise RuntimeError(f"RAG 文档向量数量不匹配。docs={total} embeddings={len(vectors)}")

    self._collection.upsert(
        ids=[entry["id"] for entry in docs],
        documents=[entry["content"] for entry in docs],
        metadatas=[entry["metadata"] for entry in docs],
        embeddings=vectors,
    )
    logger.info(f"RAG 知识库重建完成。doc_chunks={total}")
    return total
|
|
215
|
+
|
|
216
|
+
def force_rebuild(self) -> Dict:
|
|
217
|
+
if not self._enabled:
|
|
218
|
+
return {
|
|
219
|
+
"enabled": False,
|
|
220
|
+
"status": "disabled",
|
|
221
|
+
"message": "RAG 未启用,未执行重建。",
|
|
222
|
+
}
|
|
223
|
+
if not self._ensure_collection():
|
|
224
|
+
return {
|
|
225
|
+
"enabled": True,
|
|
226
|
+
"status": "unavailable",
|
|
227
|
+
"message": "RAG 集合不可用,无法重建。",
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
logger.info(f"RAG 开始强制重建,将删除并重建集合。collection={self._collection_name}")
|
|
231
|
+
self._chroma.delete_collection(self._collection_name)
|
|
232
|
+
self._collection = self._chroma.get_or_create_collection(self._collection_name)
|
|
233
|
+
rebuilt_chunks = self.rebuild()
|
|
234
|
+
status = self.check()
|
|
235
|
+
status.update({
|
|
236
|
+
"status": "rebuilt",
|
|
237
|
+
"rebuilt_chunks": rebuilt_chunks,
|
|
238
|
+
})
|
|
239
|
+
return status
|
|
240
|
+
|
|
241
|
+
def check(self) -> Dict:
|
|
242
|
+
docs = self._load_documents()
|
|
243
|
+
expected_ids = {d["id"] for d in docs}
|
|
244
|
+
source_files = [str(path) for path in self._knowledge_dir.rglob("*.md")]
|
|
245
|
+
active_files = {str(d["metadata"].get("file_path", "")) for d in docs}
|
|
246
|
+
|
|
247
|
+
database = self._database_info()
|
|
248
|
+
result = {
|
|
249
|
+
"enabled": self._enabled,
|
|
250
|
+
"status": "unknown",
|
|
251
|
+
"collection": self._collection_name,
|
|
252
|
+
"is_synced": False,
|
|
253
|
+
"needs_rebuild": True,
|
|
254
|
+
"last_build_at": database.get("last_embedding_created_at"),
|
|
255
|
+
"database_updated_at": database.get("modified_at"),
|
|
256
|
+
"item_counts": {
|
|
257
|
+
"source_files": len(source_files),
|
|
258
|
+
"active_source_files": len(active_files),
|
|
259
|
+
"expected_chunks": len(docs),
|
|
260
|
+
"vector_chunks": 0,
|
|
261
|
+
"missing_chunks": len(expected_ids),
|
|
262
|
+
"stale_chunks": 0,
|
|
263
|
+
},
|
|
264
|
+
"config": {
|
|
265
|
+
"embedding_model": self._embedding_model,
|
|
266
|
+
"top_k": self._top_k,
|
|
267
|
+
"chunk_size": self._chunk_size,
|
|
268
|
+
},
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
if not self._ensure_collection():
|
|
272
|
+
result.update({
|
|
273
|
+
"status": "disabled" if not self._enabled else "unavailable",
|
|
274
|
+
"message": "RAG 未启用或集合不可用。",
|
|
275
|
+
})
|
|
276
|
+
return result
|
|
277
|
+
|
|
278
|
+
vector_count = self._collection.count()
|
|
279
|
+
result["item_counts"]["vector_chunks"] = vector_count
|
|
280
|
+
stored_ids = set()
|
|
281
|
+
if vector_count:
|
|
282
|
+
stored = self._collection.get(limit=vector_count, include=[])
|
|
283
|
+
stored_ids = set(stored.get("ids", []))
|
|
284
|
+
|
|
285
|
+
missing_ids = expected_ids - stored_ids
|
|
286
|
+
stale_ids = stored_ids - expected_ids
|
|
287
|
+
is_synced = not missing_ids and not stale_ids and vector_count == len(expected_ids)
|
|
288
|
+
result.update({
|
|
289
|
+
"status": "ready" if vector_count else "empty",
|
|
290
|
+
"is_synced": is_synced,
|
|
291
|
+
"needs_rebuild": not is_synced,
|
|
292
|
+
})
|
|
293
|
+
result["item_counts"].update({
|
|
294
|
+
"missing_chunks": len(missing_ids),
|
|
295
|
+
"stale_chunks": len(stale_ids),
|
|
296
|
+
})
|
|
297
|
+
return result
|
|
298
|
+
|
|
299
|
+
def _database_info(self) -> Dict:
|
|
300
|
+
db_path = self._persist_dir / "chroma.sqlite3"
|
|
301
|
+
info = {
|
|
302
|
+
"path": str(db_path),
|
|
303
|
+
"exists": db_path.exists(),
|
|
304
|
+
}
|
|
305
|
+
if not db_path.exists():
|
|
306
|
+
return info
|
|
307
|
+
|
|
308
|
+
stat = db_path.stat()
|
|
309
|
+
info.update({
|
|
310
|
+
"size_bytes": stat.st_size,
|
|
311
|
+
"modified_at": datetime.fromtimestamp(stat.st_mtime).isoformat(timespec="seconds"),
|
|
312
|
+
})
|
|
313
|
+
try:
|
|
314
|
+
with sqlite3.connect(str(db_path)) as conn:
|
|
315
|
+
row = conn.execute("select min(created_at), max(created_at) from embeddings").fetchone()
|
|
316
|
+
info.update({
|
|
317
|
+
"first_embedding_created_at": row[0] if row else None,
|
|
318
|
+
"last_embedding_created_at": row[1] if row else None,
|
|
319
|
+
})
|
|
320
|
+
except sqlite3.Error:
|
|
321
|
+
logger.exception(f"读取 Chroma SQLite 元信息失败。path={db_path}")
|
|
322
|
+
return info
|
|
323
|
+
|
|
324
|
+
def _load_documents(self) -> List[Dict]:
|
|
325
|
+
docs = []
|
|
326
|
+
for path in self._knowledge_dir.rglob("*.md"):
|
|
327
|
+
metadata, body = self._read_markdown(path)
|
|
328
|
+
if metadata.get("status", "active") != "active":
|
|
329
|
+
continue
|
|
330
|
+
for idx, chunk in enumerate(self._split(body)):
|
|
331
|
+
item_meta = self._clean_metadata({**metadata, "file_path": str(path), "chunk_index": idx})
|
|
332
|
+
searchable = self._searchable_content(item_meta, chunk)
|
|
333
|
+
docs.append({
|
|
334
|
+
"id": self._chunk_id(path, idx, chunk),
|
|
335
|
+
"content": searchable,
|
|
336
|
+
"metadata": item_meta,
|
|
337
|
+
})
|
|
338
|
+
return docs
|
|
339
|
+
|
|
340
|
+
def _semantic_candidate_count(self) -> int:
|
|
341
|
+
return self._semantic_candidate_k
|
|
342
|
+
|
|
343
|
+
def _context_limit(self) -> int:
|
|
344
|
+
return self._context_k
|
|
345
|
+
|
|
346
|
+
def _semantic_hits(
|
|
347
|
+
self,
|
|
348
|
+
question: str,
|
|
349
|
+
docs: List[str],
|
|
350
|
+
metas: List[Dict],
|
|
351
|
+
distances: List[float],
|
|
352
|
+
ids: List[str],
|
|
353
|
+
) -> List[Dict]:
|
|
354
|
+
hits = []
|
|
355
|
+
for idx, (doc, meta) in enumerate(zip(docs, metas)):
|
|
356
|
+
distance = distances[idx] if idx < len(distances) else 1.0
|
|
357
|
+
hits.append({
|
|
358
|
+
"id": ids[idx] if idx < len(ids) else self._chunk_id_from_metadata(meta, doc),
|
|
359
|
+
"document": self._strip_search_prefix(doc),
|
|
360
|
+
"metadata": meta,
|
|
361
|
+
"distance": float(distance),
|
|
362
|
+
"score": (1.0 - float(distance)) + self._metadata_match_score(question, [], meta),
|
|
363
|
+
"match_type": "semantic",
|
|
364
|
+
"match_terms": [],
|
|
365
|
+
"confidence": "medium",
|
|
366
|
+
"usage_hint": "语义相似证据,可用于补充业务背景、相邻规范或相似场景。",
|
|
367
|
+
})
|
|
368
|
+
return hits
|
|
369
|
+
|
|
370
|
+
@staticmethod
def _chunk_id_from_metadata(metadata: Dict, document: str) -> str:
    """Compute a chunk id from a hit's metadata and its document text.

    ``file_path`` and ``chunk_index`` are read from ``metadata`` (both
    default to the empty string) and passed on to ``_chunk_id``.
    """
    return RagService._chunk_id(
        Path(str(metadata.get("file_path", ""))),
        metadata.get("chunk_index", ""),
        document,
    )
|
|
375
|
+
|
|
376
|
+
def _extract_exact_terms(self, question: str) -> List[str]:
|
|
377
|
+
terms = []
|
|
378
|
+
for pattern in self._CODE_PATTERNS:
|
|
379
|
+
terms.extend(match.group(0) for match in pattern.finditer(question))
|
|
380
|
+
|
|
381
|
+
for token in self._TOKEN_SPLIT_RE.split(question):
|
|
382
|
+
token = token.strip()
|
|
383
|
+
if len(token) < 3:
|
|
384
|
+
continue
|
|
385
|
+
if re.fullmatch(r"[A-Za-z]+", token) and len(token) < 6:
|
|
386
|
+
continue
|
|
387
|
+
if re.search(r"[A-Za-z0-9]", token) and not self._is_generic_ascii_token(token):
|
|
388
|
+
terms.append(token)
|
|
389
|
+
|
|
390
|
+
normalized = []
|
|
391
|
+
seen = set()
|
|
392
|
+
for term in terms:
|
|
393
|
+
term = term.strip("::,,。.;;()()[]【】")
|
|
394
|
+
key = term.lower()
|
|
395
|
+
if len(term) >= 3 and key not in seen:
|
|
396
|
+
seen.add(key)
|
|
397
|
+
normalized.append(term)
|
|
398
|
+
return normalized
|
|
399
|
+
|
|
400
|
+
@staticmethod
|
|
401
|
+
def _is_generic_ascii_token(token: str) -> bool:
|
|
402
|
+
return token.lower() in {
|
|
403
|
+
"json",
|
|
404
|
+
"http",
|
|
405
|
+
"https",
|
|
406
|
+
"api",
|
|
407
|
+
"xml",
|
|
408
|
+
"true",
|
|
409
|
+
"false",
|
|
410
|
+
"none",
|
|
411
|
+
"null",
|
|
412
|
+
"biz",
|
|
413
|
+
"offline",
|
|
414
|
+
}
|
|
415
|
+
|
|
416
|
+
def _exact_match_documents(self, question: str, exact_terms: List[str]) -> List[Dict]:
|
|
417
|
+
hits = []
|
|
418
|
+
for doc in self._load_documents():
|
|
419
|
+
metadata = doc["metadata"]
|
|
420
|
+
document = self._strip_search_prefix(doc["content"])
|
|
421
|
+
metadata_text = self._metadata_text(metadata)
|
|
422
|
+
metadata_terms = self._matched_terms(metadata_text, exact_terms)
|
|
423
|
+
body_terms = self._matched_terms(document, exact_terms)
|
|
424
|
+
name_terms = self._matched_name_terms(question, metadata)
|
|
425
|
+
matched_terms = sorted(set(metadata_terms + body_terms + name_terms), key=str.lower)
|
|
426
|
+
if not matched_terms:
|
|
427
|
+
continue
|
|
428
|
+
score = (
|
|
429
|
+
3.0
|
|
430
|
+
+ self._metadata_match_score(question, matched_terms, metadata)
|
|
431
|
+
+ min(len(metadata_terms) * 0.5, 2.0)
|
|
432
|
+
+ min(len(body_terms) * 1.5, 4.5)
|
|
433
|
+
+ min(len(name_terms) * 1.2, 3.6)
|
|
434
|
+
)
|
|
435
|
+
hits.append({
|
|
436
|
+
"id": doc["id"],
|
|
437
|
+
"document": document,
|
|
438
|
+
"metadata": metadata,
|
|
439
|
+
"distance": None,
|
|
440
|
+
"score": score,
|
|
441
|
+
"match_type": "exact",
|
|
442
|
+
"match_terms": matched_terms,
|
|
443
|
+
"body_match_terms": body_terms,
|
|
444
|
+
"name_match_terms": name_terms,
|
|
445
|
+
"confidence": "high",
|
|
446
|
+
"usage_hint": "高置信精确命中,优先用于确定具体对象、字段、格式、编号或阈值。",
|
|
447
|
+
})
|
|
448
|
+
return self._prefer_direct_exact_hits(hits)
|
|
449
|
+
|
|
450
|
+
@staticmethod
|
|
451
|
+
def _prefer_direct_exact_hits(hits: List[Dict]) -> List[Dict]:
|
|
452
|
+
direct_by_file = {}
|
|
453
|
+
for hit in hits:
|
|
454
|
+
file_path = hit["metadata"].get("file_path", "")
|
|
455
|
+
if hit.get("body_match_terms") or hit.get("name_match_terms"):
|
|
456
|
+
direct_by_file[file_path] = True
|
|
457
|
+
|
|
458
|
+
filtered = [
|
|
459
|
+
hit
|
|
460
|
+
for hit in hits
|
|
461
|
+
if hit.get("body_match_terms")
|
|
462
|
+
or hit.get("name_match_terms")
|
|
463
|
+
or not direct_by_file.get(hit["metadata"].get("file_path", ""))
|
|
464
|
+
]
|
|
465
|
+
return sorted(filtered, key=lambda item: item["score"], reverse=True)
|
|
466
|
+
|
|
467
|
+
@staticmethod
|
|
468
|
+
def _matched_name_terms(question: str, metadata: Dict) -> List[str]:
|
|
469
|
+
candidates = [
|
|
470
|
+
str(metadata.get("title", "")),
|
|
471
|
+
str(metadata.get("source_section", "")),
|
|
472
|
+
]
|
|
473
|
+
terms = []
|
|
474
|
+
for candidate in candidates:
|
|
475
|
+
candidate = candidate.strip()
|
|
476
|
+
if len(candidate) >= 6 and candidate in question:
|
|
477
|
+
terms.append(candidate)
|
|
478
|
+
section_name = re.sub(r"^[A-Z]\.[0-9]+(?:\.[0-9]+)?\s*", "", candidate).strip()
|
|
479
|
+
if len(section_name) >= 6 and section_name in question:
|
|
480
|
+
terms.append(section_name)
|
|
481
|
+
return sorted(set(terms), key=len, reverse=True)
|
|
482
|
+
|
|
483
|
+
def _expand_neighbors(self, exact_hits: List[Dict]) -> List[Dict]:
|
|
484
|
+
if not exact_hits:
|
|
485
|
+
return []
|
|
486
|
+
|
|
487
|
+
docs = self._load_documents()
|
|
488
|
+
by_location = {
|
|
489
|
+
(doc["metadata"].get("file_path", ""), int(doc["metadata"].get("chunk_index", 0))): doc
|
|
490
|
+
for doc in docs
|
|
491
|
+
}
|
|
492
|
+
neighbors = []
|
|
493
|
+
seen = {hit["id"] for hit in exact_hits}
|
|
494
|
+
for hit in exact_hits[:3]:
|
|
495
|
+
metadata = hit["metadata"]
|
|
496
|
+
file_path = metadata.get("file_path", "")
|
|
497
|
+
try:
|
|
498
|
+
chunk_index = int(metadata.get("chunk_index", 0))
|
|
499
|
+
except (TypeError, ValueError):
|
|
500
|
+
continue
|
|
501
|
+
for offset in (-1, 1):
|
|
502
|
+
doc = by_location.get((file_path, chunk_index + offset))
|
|
503
|
+
if not doc or doc["id"] in seen:
|
|
504
|
+
continue
|
|
505
|
+
seen.add(doc["id"])
|
|
506
|
+
neighbors.append({
|
|
507
|
+
"id": doc["id"],
|
|
508
|
+
"document": self._strip_search_prefix(doc["content"]),
|
|
509
|
+
"metadata": doc["metadata"],
|
|
510
|
+
"distance": None,
|
|
511
|
+
"score": hit["score"] - 0.4,
|
|
512
|
+
"match_type": "neighbor_context",
|
|
513
|
+
"match_terms": hit.get("match_terms", []),
|
|
514
|
+
"confidence": "medium",
|
|
515
|
+
"usage_hint": "精确命中文档的邻近片段,可用于补充上下文、边界和使用要求。",
|
|
516
|
+
})
|
|
517
|
+
return neighbors
|
|
518
|
+
|
|
519
|
+
def _expand_related_items(self, exact_hits: List[Dict]) -> List[Dict]:
|
|
520
|
+
if not exact_hits:
|
|
521
|
+
return []
|
|
522
|
+
|
|
523
|
+
docs = self._load_documents()
|
|
524
|
+
relation_docs = {}
|
|
525
|
+
for doc in docs:
|
|
526
|
+
metadata = doc["metadata"]
|
|
527
|
+
if int(metadata.get("chunk_index", 0)) != 0:
|
|
528
|
+
continue
|
|
529
|
+
category = str(metadata.get("category", "")).strip()
|
|
530
|
+
for title in (metadata.get("subcategory", ""), metadata.get("title", "")):
|
|
531
|
+
title = str(title).strip()
|
|
532
|
+
if title:
|
|
533
|
+
relation_docs[(category, title)] = doc
|
|
534
|
+
|
|
535
|
+
related = []
|
|
536
|
+
seen = {hit["id"] for hit in exact_hits}
|
|
537
|
+
for hit in exact_hits[:3]:
|
|
538
|
+
metadata = hit["metadata"]
|
|
539
|
+
source_category = str(metadata.get("category", "")).strip()
|
|
540
|
+
for item in self._related_items(metadata):
|
|
541
|
+
target_category = str(item.get("大类标题") or item.get("category") or source_category).strip()
|
|
542
|
+
target_title = str(item.get("小类标题") or item.get("subcategory") or item.get("title") or "").strip()
|
|
543
|
+
if not target_title:
|
|
544
|
+
continue
|
|
545
|
+
doc = relation_docs.get((target_category, target_title)) or relation_docs.get(
|
|
546
|
+
(source_category, target_title))
|
|
547
|
+
if not doc:
|
|
548
|
+
doc = self._find_related_doc(docs, target_category or source_category, target_title)
|
|
549
|
+
if not doc or doc["id"] in seen:
|
|
550
|
+
continue
|
|
551
|
+
seen.add(doc["id"])
|
|
552
|
+
related.append({
|
|
553
|
+
"id": doc["id"],
|
|
554
|
+
"document": self._strip_search_prefix(doc["content"]),
|
|
555
|
+
"metadata": doc["metadata"],
|
|
556
|
+
"distance": None,
|
|
557
|
+
"score": hit["score"] - 0.2,
|
|
558
|
+
"match_type": "relation_context",
|
|
559
|
+
"match_terms": hit.get("match_terms", []),
|
|
560
|
+
"confidence": "medium",
|
|
561
|
+
"usage_hint": "知识小类关系证据,可用于说明同一知识大类下小类之间的关联、边界和相似场景。",
|
|
562
|
+
})
|
|
563
|
+
return related
|
|
564
|
+
|
|
565
|
+
@staticmethod
|
|
566
|
+
def _related_items(metadata: Dict) -> List[Dict]:
|
|
567
|
+
raw = metadata.get("related_items", "")
|
|
568
|
+
if not raw:
|
|
569
|
+
return []
|
|
570
|
+
if isinstance(raw, list):
|
|
571
|
+
return [item for item in raw if isinstance(item, dict)]
|
|
572
|
+
if isinstance(raw, str):
|
|
573
|
+
try:
|
|
574
|
+
parsed = json.loads(raw)
|
|
575
|
+
except json.JSONDecodeError:
|
|
576
|
+
return []
|
|
577
|
+
return [item for item in parsed if isinstance(item, dict)] if isinstance(parsed, list) else []
|
|
578
|
+
return []
|
|
579
|
+
|
|
580
|
+
@staticmethod
|
|
581
|
+
def _find_related_doc(docs: List[Dict], category: str, title: str) -> Dict:
|
|
582
|
+
title_lower = title.lower()
|
|
583
|
+
for doc in docs:
|
|
584
|
+
metadata = doc["metadata"]
|
|
585
|
+
if int(metadata.get("chunk_index", 0)) != 0:
|
|
586
|
+
continue
|
|
587
|
+
doc_category = str(metadata.get("category", "")).strip()
|
|
588
|
+
if category and doc_category and category != doc_category:
|
|
589
|
+
continue
|
|
590
|
+
candidates = [
|
|
591
|
+
str(metadata.get("subcategory", "")).strip(),
|
|
592
|
+
str(metadata.get("title", "")).strip(),
|
|
593
|
+
str(metadata.get("source_section", "")).strip(),
|
|
594
|
+
]
|
|
595
|
+
if any(title_lower in candidate.lower() or candidate.lower() in title_lower for candidate in candidates if
|
|
596
|
+
candidate):
|
|
597
|
+
return doc
|
|
598
|
+
return {}
|
|
599
|
+
|
|
600
|
+
def _merge_and_rank(
    self,
    question: str,
    exact_terms: List[str],
    exact_hits: List[Dict],
    expanded_hits: List[Dict],
    relation_hits: List[Dict],
    semantic_hits: List[Dict],
    top_k: Optional[int] = None,
) -> List[Dict]:
    """Merge the four hit lists by id, rescore them and return the top hits.

    Hits are processed in the order semantic, relation, expanded, exact.
    Each hit is copied, its ``match_terms`` are extended with the exact
    terms found in its metadata and document text, and the metadata match
    score is added to its ``score``. For duplicate ids the hit with the
    higher ``_hit_priority`` is kept; otherwise the kept hit absorbs the
    other's match terms and the larger of the two scores.

    The merged hits are sorted by priority, highest first, and passed
    through ``_diversify_hits`` with ``top_k`` (or ``_context_limit()``
    when ``top_k`` is falsy) as the limit.
    """
    by_id = {}
    for bucket in (semantic_hits, relation_hits, expanded_hits, exact_hits):
        for hit in bucket:
            candidate = dict(hit)  # work on a copy so the caller's dict is untouched
            evidence = self._evidence_text(candidate["metadata"], candidate["document"])
            found_terms = self._matched_terms(evidence, exact_terms)
            candidate["match_terms"] = sorted(
                set(candidate.get("match_terms", []) + found_terms),
                key=str.lower,
            )
            bonus = self._metadata_match_score(
                question, candidate["match_terms"], candidate["metadata"]
            )
            candidate["score"] = candidate.get("score", 0.0) + bonus

            current = by_id.get(candidate["id"])
            if current is not None and not (
                self._hit_priority(candidate) > self._hit_priority(current)
            ):
                # The stored hit wins: fold the loser's terms and score into it.
                current["match_terms"] = sorted(
                    set(current.get("match_terms", []) + candidate["match_terms"]),
                    key=str.lower,
                )
                current["score"] = max(
                    current.get("score", 0.0), candidate.get("score", 0.0)
                )
                continue
            by_id[candidate["id"]] = candidate

    ordered = sorted(by_id.values(), key=self._hit_priority, reverse=True)
    return self._diversify_hits(ordered, top_k or self._context_limit())
|
|
628
|
+
|
|
629
|
+
@staticmethod
|
|
630
|
+
def _hit_priority(item: Dict) -> Tuple[float, int]:
|
|
631
|
+
type_bonus = {
|
|
632
|
+
"exact": 3,
|
|
633
|
+
"neighbor_context": 2,
|
|
634
|
+
"relation_context": 2,
|
|
635
|
+
"semantic": 1,
|
|
636
|
+
}.get(item.get("match_type"), 0)
|
|
637
|
+
return float(item.get("score", 0.0)), type_bonus
|
|
638
|
+
|
|
639
|
+
def _diversify_hits(self, ranked: List[Dict], limit: int) -> List[Dict]:
|
|
640
|
+
selected = []
|
|
641
|
+
per_file = {}
|
|
642
|
+
exact_per_file = {}
|
|
643
|
+
exact_selected = 0
|
|
644
|
+
exact_limit = self._exact_context_k
|
|
645
|
+
exact_file_limit = self._exact_per_file_context_k
|
|
646
|
+
per_file_limit = self._per_file_context_k
|
|
647
|
+
|
|
648
|
+
for item in ranked:
|
|
649
|
+
file_path = item["metadata"].get("file_path", "")
|
|
650
|
+
match_type = item.get("match_type")
|
|
651
|
+
if match_type == "exact" and exact_selected >= exact_limit:
|
|
652
|
+
continue
|
|
653
|
+
if match_type == "exact" and exact_per_file.get(file_path, 0) >= exact_file_limit:
|
|
654
|
+
continue
|
|
655
|
+
if per_file.get(file_path, 0) >= per_file_limit and match_type != "exact":
|
|
656
|
+
continue
|
|
657
|
+
selected.append(item)
|
|
658
|
+
per_file[file_path] = per_file.get(file_path, 0) + 1
|
|
659
|
+
if match_type == "exact":
|
|
660
|
+
exact_selected += 1
|
|
661
|
+
exact_per_file[file_path] = exact_per_file.get(file_path, 0) + 1
|
|
662
|
+
if len(selected) >= limit:
|
|
663
|
+
break
|
|
664
|
+
return selected
|
|
665
|
+
|
|
666
|
+
def _metadata_match_score(self, question: str, matched_terms: List[str], metadata: Dict) -> float:
|
|
667
|
+
score = 0.0
|
|
668
|
+
weighted_fields = [
|
|
669
|
+
("kb_id", 2.0),
|
|
670
|
+
("title", 1.4),
|
|
671
|
+
("source_section", 1.2),
|
|
672
|
+
("tags", 1.6),
|
|
673
|
+
("business_modules", 0.8),
|
|
674
|
+
("source_doc", 0.4),
|
|
675
|
+
("doc_type", 0.5),
|
|
676
|
+
("domain", 0.4),
|
|
677
|
+
("category", 0.3),
|
|
678
|
+
("category_keywords", 0.3),
|
|
679
|
+
("subcategory", 0.7),
|
|
680
|
+
("related_items", 0.8),
|
|
681
|
+
("related_categories", 0.2),
|
|
682
|
+
("relation_notes", 0.2),
|
|
683
|
+
]
|
|
684
|
+
question_lower = question.lower()
|
|
685
|
+
for field, weight in weighted_fields:
|
|
686
|
+
text = str(metadata.get(field, ""))
|
|
687
|
+
text_lower = text.lower()
|
|
688
|
+
for term in matched_terms:
|
|
689
|
+
if term.lower() in text_lower:
|
|
690
|
+
score += weight
|
|
691
|
+
for token in self._TOKEN_SPLIT_RE.split(text):
|
|
692
|
+
token = token.strip()
|
|
693
|
+
if len(token) >= 2 and token.lower() in question_lower:
|
|
694
|
+
score += min(weight, 0.12)
|
|
695
|
+
return min(score, 8.0)
|
|
696
|
+
|
|
697
|
+
@staticmethod
|
|
698
|
+
def _matched_terms(text: str, terms: List[str]) -> List[str]:
|
|
699
|
+
lower_text = text.lower()
|
|
700
|
+
return [term for term in terms if term.lower() in lower_text]
|
|
701
|
+
|
|
702
|
+
@staticmethod
def _evidence_text(metadata: Dict, document: str) -> str:
    """Return the metadata text and the document joined by a newline."""
    header = RagService._metadata_text(metadata)
    return "{}\n{}".format(header, document)
|
|
705
|
+
|
|
706
|
+
@staticmethod
|
|
707
|
+
def _metadata_text(metadata: Dict) -> str:
|
|
708
|
+
metadata_text = " ".join([
|
|
709
|
+
str(metadata.get("kb_id", "")),
|
|
710
|
+
str(metadata.get("title", "")),
|
|
711
|
+
str(metadata.get("source_doc", "")),
|
|
712
|
+
str(metadata.get("source_section", "")),
|
|
713
|
+
str(metadata.get("doc_type", "")),
|
|
714
|
+
str(metadata.get("domain", "")),
|
|
715
|
+
str(metadata.get("tags", "")),
|
|
716
|
+
str(metadata.get("business_modules", "")),
|
|
717
|
+
str(metadata.get("category", "")),
|
|
718
|
+
str(metadata.get("category_keywords", "")),
|
|
719
|
+
str(metadata.get("subcategory", "")),
|
|
720
|
+
str(metadata.get("related_items", "")),
|
|
721
|
+
str(metadata.get("related_categories", "")),
|
|
722
|
+
str(metadata.get("relation_notes", "")),
|
|
723
|
+
str(metadata.get("file_path", "")),
|
|
724
|
+
])
|
|
725
|
+
return metadata_text
|
|
726
|
+
|
|
727
|
+
@staticmethod
|
|
728
|
+
def _format_evidence_context(ranked: List[Dict]) -> str:
|
|
729
|
+
group_names = {
|
|
730
|
+
"exact": "高置信精确命中",
|
|
731
|
+
"neighbor_context": "精确命中邻近上下文",
|
|
732
|
+
"relation_context": "知识小类关系上下文",
|
|
733
|
+
"semantic": "相关语义召回",
|
|
734
|
+
}
|
|
735
|
+
groups = []
|
|
736
|
+
for match_type, group_name in group_names.items():
|
|
737
|
+
items = [item for item in ranked if item.get("match_type") == match_type]
|
|
738
|
+
if not items:
|
|
739
|
+
continue
|
|
740
|
+
blocks = [f"【{group_name}】"]
|
|
741
|
+
for item in items:
|
|
742
|
+
metadata = item["metadata"]
|
|
743
|
+
terms = "、".join(item.get("match_terms", [])) or "无显式代码词命中"
|
|
744
|
+
blocks.append(
|
|
745
|
+
f"[{len(blocks)}] 标题:{metadata.get('title', '')}\n"
|
|
746
|
+
f"匹配类型:{item.get('match_type', '')}\n"
|
|
747
|
+
f"置信级别:{item.get('confidence', '')}\n"
|
|
748
|
+
f"命中词:{terms}\n"
|
|
749
|
+
f"使用建议:{item.get('usage_hint', '')}\n"
|
|
750
|
+
f"文档类型:{metadata.get('doc_type', '')}\n"
|
|
751
|
+
f"知识领域:{metadata.get('domain', '')}\n"
|
|
752
|
+
f"知识大类:{metadata.get('category', '')}\n"
|
|
753
|
+
f"关键词:{metadata.get('category_keywords', '')}\n"
|
|
754
|
+
f"知识小类:{metadata.get('subcategory', '')}\n"
|
|
755
|
+
f"关联信息:{metadata.get('related_items', '')}\n"
|
|
756
|
+
f"来源:{metadata.get('source_doc', '')} {metadata.get('source_section', '')}\n"
|
|
757
|
+
f"内容:{item.get('document', '')}"
|
|
758
|
+
)
|
|
759
|
+
groups.append("\n\n".join(blocks))
|
|
760
|
+
return "\n\n".join(groups)
|
|
761
|
+
|
|
762
|
+
@staticmethod
|
|
763
|
+
def _searchable_content(metadata: Dict, chunk: str) -> str:
|
|
764
|
+
prefix = "\n".join([
|
|
765
|
+
f"知识库ID:{metadata.get('kb_id', '')}",
|
|
766
|
+
f"标题:{metadata.get('title', '')}",
|
|
767
|
+
f"来源文档:{metadata.get('source_doc', '')}",
|
|
768
|
+
f"来源章节:{metadata.get('source_section', '')}",
|
|
769
|
+
f"文档类型:{metadata.get('doc_type', '')}",
|
|
770
|
+
f"知识领域:{metadata.get('domain', '')}",
|
|
771
|
+
f"标签:{metadata.get('tags', '')}",
|
|
772
|
+
f"业务模块:{metadata.get('business_modules', '')}",
|
|
773
|
+
f"知识大类:{metadata.get('category', '')}",
|
|
774
|
+
f"关键词:{metadata.get('category_keywords', '')}",
|
|
775
|
+
f"知识小类:{metadata.get('subcategory', '')}",
|
|
776
|
+
f"关联信息:{metadata.get('related_items', '')}",
|
|
777
|
+
]).strip()
|
|
778
|
+
return f"{prefix}\n\n---CONTENT---\n{chunk}" if prefix else chunk
|
|
779
|
+
|
|
780
|
+
@staticmethod
|
|
781
|
+
def _strip_search_prefix(document: str) -> str:
|
|
782
|
+
marker = "---CONTENT---\n"
|
|
783
|
+
if marker in document:
|
|
784
|
+
return document.split(marker, 1)[1]
|
|
785
|
+
return document
|
|
786
|
+
|
|
787
|
+
@staticmethod
|
|
788
|
+
def _read_markdown(path: Path) -> Tuple[Dict, str]:
|
|
789
|
+
text = path.read_text(encoding="utf-8")
|
|
790
|
+
match = re.match(r"^---\n(.*?)\n---\n(.*)$", text, re.S)
|
|
791
|
+
if not match:
|
|
792
|
+
return {"title": path.stem, "status": "active"}, text
|
|
793
|
+
return yaml.safe_load(match.group(1)) or {}, match.group(2).strip()
|
|
794
|
+
|
|
795
|
+
def _split(self, text: str) -> List[str]:
|
|
796
|
+
max_chars = self._chunk_size
|
|
797
|
+
parts = re.split(r"\n(?=##\s+)", text)
|
|
798
|
+
chunks = []
|
|
799
|
+
for part in parts:
|
|
800
|
+
part = part.strip()
|
|
801
|
+
if not part:
|
|
802
|
+
continue
|
|
803
|
+
if len(part) <= max_chars:
|
|
804
|
+
chunks.append(part)
|
|
805
|
+
else:
|
|
806
|
+
chunks.extend(part[i:i + max_chars] for i in range(0, len(part), max_chars))
|
|
807
|
+
return chunks
|
|
808
|
+
|
|
809
|
+
def _embed(self, text: str) -> List[float]:
    """Request an embedding for ``text`` from the configured client.

    The input is truncated to ``self._embedding_max_chars`` characters. The
    first entry of the response data is used; it may be a dict or an object
    exposing an ``embedding`` attribute.
    """
    response = self._client.embeddings.create(
        model=self._embedding_model,
        input=text[: self._embedding_max_chars],
    )
    first = response.data[0]
    if isinstance(first, dict):
        return first["embedding"]
    return first.embedding
|
|
816
|
+
|
|
817
|
+
def _embed_batch(self, texts: List[str]) -> List[List[float]]:
    """Request embeddings for several texts in a single client call.

    Every text is truncated to ``self._embedding_max_chars`` characters.
    Response entries may be dicts or objects exposing an ``embedding``
    attribute.

    Raises:
        RuntimeError: If the number of returned embeddings differs from the
            number of input texts.
    """
    truncated = [text[: self._embedding_max_chars] for text in texts]
    response = self._client.embeddings.create(
        model=self._embedding_model,
        input=truncated,
    )
    embeddings = []
    for entry in response.data:
        if isinstance(entry, dict):
            embeddings.append(entry["embedding"])
        else:
            embeddings.append(entry.embedding)
    if len(embeddings) == len(texts):
        return embeddings
    raise RuntimeError(f"embedding batch 返回数量不匹配。expected={len(texts)} actual={len(embeddings)}")
|
|
829
|
+
|
|
830
|
+
@staticmethod
|
|
831
|
+
def _clean_metadata(metadata: Dict) -> Dict:
|
|
832
|
+
cleaned = {}
|
|
833
|
+
for key, value in metadata.items():
|
|
834
|
+
if isinstance(value, (list, dict)):
|
|
835
|
+
cleaned[key] = json.dumps(value, ensure_ascii=False)
|
|
836
|
+
elif value is None:
|
|
837
|
+
cleaned[key] = ""
|
|
838
|
+
elif isinstance(value, (str, int, float, bool)):
|
|
839
|
+
cleaned[key] = value
|
|
840
|
+
else:
|
|
841
|
+
cleaned[key] = str(value)
|
|
842
|
+
return cleaned
|
|
843
|
+
|
|
844
|
+
@staticmethod
|
|
845
|
+
def _chunk_id(path: Path, idx: int, chunk: str) -> str:
|
|
846
|
+
digest = hashlib.md5(f"{path}:{idx}:{chunk}".encode("utf-8")).hexdigest()
|
|
847
|
+
return digest
|
|
848
|
+
|
|
849
|
+
@staticmethod
|
|
850
|
+
def _source(metadata: Dict) -> Dict:
|
|
851
|
+
return {
|
|
852
|
+
"kb_id": metadata.get("kb_id", ""),
|
|
853
|
+
"title": metadata.get("title", ""),
|
|
854
|
+
"doc_type": metadata.get("doc_type", ""),
|
|
855
|
+
"domain": metadata.get("domain", ""),
|
|
856
|
+
"category": metadata.get("category", ""),
|
|
857
|
+
"category_keywords": metadata.get("category_keywords", ""),
|
|
858
|
+
"source_doc_description": metadata.get("source_doc_description", ""),
|
|
859
|
+
"subcategory": metadata.get("subcategory", ""),
|
|
860
|
+
"related_items": metadata.get("related_items", ""),
|
|
861
|
+
"related_categories": metadata.get("related_categories", ""),
|
|
862
|
+
"relation_notes": metadata.get("relation_notes", ""),
|
|
863
|
+
"source_doc": metadata.get("source_doc", ""),
|
|
864
|
+
"source_section": metadata.get("source_section", ""),
|
|
865
|
+
"file_path": metadata.get("file_path", ""),
|
|
866
|
+
}
|