union-app-chat-stream 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108) hide show
  1. package/.gitignore +16 -0
  2. package/PROJECT_OVERVIEW.md +187 -0
  3. package/app/.env +63 -0
  4. package/app/.env.dev +63 -0
  5. package/app/.env.prod.bj11 +63 -0
  6. package/app/.env.prod.sh20 +63 -0
  7. package/app/.env.prod.sz31 +63 -0
  8. package/app/.env.test.bj12 +63 -0
  9. package/app/__init__.py +42 -0
  10. package/app/__pycache__/__init__.cpython-312.pyc +0 -0
  11. package/app/__pycache__/authenticated_user.cpython-312.pyc +0 -0
  12. package/app/__pycache__/extensions.cpython-312.pyc +0 -0
  13. package/app/__pycache__/wsgi.cpython-312.pyc +0 -0
  14. package/app/authenticated_user.py +77 -0
  15. package/app/config/__pycache__/config_loader.cpython-312.pyc +0 -0
  16. package/app/config/__pycache__/env_config.cpython-312.pyc +0 -0
  17. package/app/config/__pycache__/logger_config.cpython-312.pyc +0 -0
  18. package/app/config/env_config.py +96 -0
  19. package/app/config/logger_config.py +46 -0
  20. package/app/manager/__init__.py +4 -0
  21. package/app/manager/__pycache__/__init__.cpython-312.pyc +0 -0
  22. package/app/manager/__pycache__/chatstream_manager.cpython-312.pyc +0 -0
  23. package/app/manager/__pycache__/prompts.cpython-312.pyc +0 -0
  24. package/app/manager/__pycache__/runtime_manager.cpython-312.pyc +0 -0
  25. package/app/manager/__pycache__/toolcall_manager.cpython-312.pyc +0 -0
  26. package/app/manager/chatstream_manager.py +90 -0
  27. package/app/manager/prompts.py +62 -0
  28. package/app/manager/runtime_manager.py +552 -0
  29. package/app/models/__pycache__/schemas.cpython-312.pyc +0 -0
  30. package/app/models/schemas.py +30 -0
  31. package/app/service/__init__.py +4 -0
  32. package/app/service/__pycache__/__init__.cpython-312.pyc +0 -0
  33. package/app/service/__pycache__/chat_service.cpython-312.pyc +0 -0
  34. package/app/service/__pycache__/llm_service.cpython-312.pyc +0 -0
  35. package/app/service/__pycache__/rag_service.cpython-312.pyc +0 -0
  36. package/app/service/__pycache__/tool_call_service.cpython-312.pyc +0 -0
  37. package/app/service/__pycache__/union_service.cpython-312.pyc +0 -0
  38. package/app/service/chat_service.py +228 -0
  39. package/app/service/llm_service.py +214 -0
  40. package/app/service/rag_service.py +866 -0
  41. package/app/service/union_service.py +201 -0
  42. package/app/utils/__init__.py +5 -0
  43. package/app/utils/__pycache__/__init__.cpython-312.pyc +0 -0
  44. package/app/utils/__pycache__/common_utils.cpython-312.pyc +0 -0
  45. package/app/utils/__pycache__/debug_context.cpython-312.pyc +0 -0
  46. package/app/utils/__pycache__/function_utils.cpython-312.pyc +0 -0
  47. package/app/utils/__pycache__/jwt_utils.cpython-312.pyc +0 -0
  48. package/app/utils/common_utils.py +169 -0
  49. package/app/utils/debug_context.py +16 -0
  50. package/app/utils/function_utils.py +274 -0
  51. package/app/utils/jwt_utils.py +39 -0
  52. package/app/views/__init__.py +6 -0
  53. package/app/views/__pycache__/__init__.cpython-312.pyc +0 -0
  54. package/app/views/__pycache__/view_chatstream.cpython-312.pyc +0 -0
  55. package/app/views/__pycache__/view_healthcheck.cpython-312.pyc +0 -0
  56. package/app/views/__pycache__/view_runtime.cpython-312.pyc +0 -0
  57. package/app/views/view_chatstream.py +53 -0
  58. package/app/views/view_healthcheck.py +14 -0
  59. package/app/views/view_runtime.py +72 -0
  60. package/app/wsgi.py +37 -0
  61. package/ci.yml +14 -0
  62. package/deploy/autoconf/templates/env.j2 +25 -0
  63. package/deploy/autoconf.yml +15 -0
  64. package/deploy/scripts/healthcheck.sh +0 -0
  65. package/deploy/scripts/requirements.txt +53 -0
  66. package/deploy/scripts/start.sh +75 -0
  67. package/deploy/scripts/stop.sh +31 -0
  68. package/knowledge/.gitkeep +0 -0
  69. package/knowledge/000001-biz-offline-85b99bd43b-v1.md +88 -0
  70. package/knowledge/000002-biz-offline-717e8d823e-v1.md +90 -0
  71. package/knowledge/000003-biz-offline-c963227cc8-v1.md +84 -0
  72. package/knowledge/000004-biz-offline-2a5868e7da-v1.md +92 -0
  73. package/knowledge/000005-biz-offline-f9d9cf1a88-v1.md +79 -0
  74. package/knowledge/000006-biz-offline-c4fa2df3bd-v1.md +77 -0
  75. package/knowledge/000007-biz-offline-78304b70ca-v1.md +76 -0
  76. package/knowledge/000008-biz-offline-987ae67b35-v1.md +75 -0
  77. package/knowledge/000009-biz-offline-4d656bcea3-v1.md +85 -0
  78. package/knowledge/000010-sop-offline-a9e1050719-v1.md +100 -0
  79. package/knowledge/000011-biz-offline-5de0624891-v1.md +86 -0
  80. package/knowledge/000012-biz-offline-7dfacccba3-v1.md +82 -0
  81. package/knowledge/000013-biz-offline-5e1d29d2ed-v1.md +81 -0
  82. package/knowledge/000014-biz-offline-1d0ed8b841-v1.md +68 -0
  83. package/knowledge/000015-biz-offline-8a1376ee3e-v1.md +78 -0
  84. package/knowledge/000016-biz-offline-c8bfc2aa08-v1.md +99 -0
  85. package/knowledge/000017-biz-offline-9dffb28032-v1.md +88 -0
  86. package/knowledge/000018-biz-offline-f935bc9a6a-v1.md +80 -0
  87. package/knowledge/000019-biz-offline-858b3ecd89-v1.md +86 -0
  88. package/knowledge/000020-biz-offline-65cb5c4f40-v1.md +113 -0
  89. package/knowledge/000021-biz-offline-1bf211639c-v1.md +148 -0
  90. package/knowledge/000022-biz-offline-8c5a637879-v1.md +140 -0
  91. package/knowledge/000023-biz-offline-fe872b8712-v1.md +188 -0
  92. package/knowledge/000024-biz-offline-a85010c500-v1.md +133 -0
  93. package/knowledge/000025-biz-offline-8af58a3638-v1.md +136 -0
  94. package/knowledge/000026-biz-offline-6754102e93-v1.md +142 -0
  95. package/knowledge/000027-biz-offline-ea2e5ca5f9-v1.md +150 -0
  96. package/knowledge/000028-scenario-offline-dab45cebb4-v1.md +136 -0
  97. package/knowledge/000029-scenario-offline-5b8ae5ea9f-v1.md +143 -0
  98. package/knowledge/000030-scenario-offline-9a82d42f3f-v1.md +136 -0
  99. package/knowledge/000031-scenario-offline-cc2edc0197-v1.md +122 -0
  100. package/knowledge/000032-scenario-offline-e5f6e5cbfa-v1.md +122 -0
  101. package/knowledge/000033-scenario-offline-e1955849aa-v1.md +135 -0
  102. package/knowledge/000034-scenario-offline-3a13d49a3a-v1.md +138 -0
  103. package/knowledge/000035-scenario-offline-fd5560211f-v1.md +147 -0
  104. package/knowledge/000036-scenario-offline-function-call-mock-v1.md +134 -0
  105. package/package.json +18 -0
  106. package/requirements.txt +53 -0
  107. package/tools/prompts.yaml +10 -0
  108. package/tools/tool_definitions.yaml +303 -0
@@ -0,0 +1,866 @@
1
+ import hashlib
2
+ import json
3
+ import re
4
+
5
+ try:
6
+ import pysqlite3
7
+ import sys
8
+ sys.modules["sqlite3"] = pysqlite3
9
+ except ImportError:
10
+ pass
11
+
12
+ import sqlite3
13
+ from datetime import datetime
14
+ from pathlib import Path
15
+ from typing import Any, Dict, List, Optional, Tuple
16
+
17
+ import yaml
18
+
19
+ from zai import ZhipuAiClient
20
+
21
+ from loguru import logger
22
+
23
+
24
+ def _preview(text: str, limit: int = 300) -> str:
25
+ return str(text).replace("\n", " ")[:limit]
26
+
27
+
28
+ class RagService:
29
+ """轻量 RAG 服务:加载 Markdown 知识库,写入 Chroma,并按问题检索。"""
30
+
31
+ _TOKEN_SPLIT_RE = re.compile(r"[\s,,、;;//||()()《》【】\\-]+")
32
+ _CODE_PATTERNS = [
33
+ re.compile(r"(?<![A-Za-z0-9])[a-zA-Z]+[0-9]+(?:\.[0-9A-Za-z]+)+"),
34
+ re.compile(r"(?<![A-Za-z0-9])[a-zA-Z]+[0-9]{2,}"),
35
+ re.compile(r"(?<![A-Za-z0-9])[0-9]{6}-[a-zA-Z0-9-]+-v[0-9]+"),
36
+ re.compile(r"(?<![A-Za-z0-9])[a-zA-Z]+-offline-[a-fA-F0-9]{8,}-v[0-9]+"),
37
+ re.compile(r"(?<![A-Za-z0-9])[A-Z]\.[0-9]+(?:\.[0-9]+)?"),
38
+ re.compile(r"(?<![A-Za-z0-9])[a-z][A-Za-z0-9]*[A-Z][A-Za-z0-9]*"),
39
+ ]
40
+
41
def __init__(self, config):
    """Read the RAG settings from ``config`` and open the collection when enabled.

    ``config`` is indexed with ``[]`` for every setting, so a missing key
    surfaces as whatever the mapping raises.  Numeric settings are validated
    by ``_positive_int`` and raise ``ValueError`` when below 1.
    """
    self._config = config
    self._enabled = config["RAG_ENABLED"]
    self._top_k = self._positive_int(config["RAG_TOP_K"])
    self._collection = None

    # Storage and knowledge paths are resolved relative to the directory
    # two levels above the one containing this file.
    project_root = Path(__file__).resolve().parents[2]
    self._root = project_root
    self._persist_dir = project_root / config["RAG_PERSIST_DIR"]
    self._knowledge_dir = project_root / config["RAG_KNOWLEDGE_DIR"]

    self._collection_name = config["RAG_COLLECTION"]
    self._rebuild_on_startup = config["RAG_REBUILD_ON_STARTUP"]
    self._embedding_model = config["RAG_EMBEDDING_MODEL"]

    # Validated in this fixed order so the first invalid setting is the one reported.
    positive_int_settings = (
        ("_embedding_max_chars", "RAG_EMBEDDING_MAX_CHARS"),
        ("_embedding_batch_size", "RAG_EMBEDDING_BATCH_SIZE"),
        ("_semantic_candidate_k", "RAG_SEMANTIC_CANDIDATE_K"),
        ("_context_k", "RAG_CONTEXT_K"),
        ("_exact_context_k", "RAG_EXACT_CONTEXT_K"),
        ("_exact_per_file_context_k", "RAG_EXACT_PER_FILE_CONTEXT_K"),
        ("_per_file_context_k", "RAG_PER_FILE_CONTEXT_K"),
        ("_chunk_size", "RAG_CHUNK_SIZE"),
    )
    for attribute, config_key in positive_int_settings:
        setattr(self, attribute, self._positive_int(config[config_key]))

    self._chroma = None
    self._client = ZhipuAiClient(
        api_key=config["LLM_KEY"],
        base_url=config["LLM_URL"],
    )
    if self._enabled:
        self._init_collection()
67
+
68
def search(self, question: str, top_k: Optional[int] = None) -> Tuple[str, List[Dict]]:
    """Retrieve evidence for ``question`` and return ``(context, sources)``.

    Exact-term hits, their neighbouring chunks and their related items are
    gathered first; a vector query against the collection then adds semantic
    candidates.  All hits are merged and ranked by ``_merge_and_rank`` and
    rendered with ``_format_evidence_context``.

    Returns ``("", [])`` when the collection is unavailable or still empty
    after a rebuild.  Any exception raised by the vector query (or by the
    rebuild it triggers) is logged, and the search continues with no
    semantic hits.
    """
    if not self._ensure_collection():
        logger.info(f"RAG 未启用或集合不可用,跳过检索。question={_preview(question, 120)}")
        return "", []

    exact_terms = self._extract_exact_terms(question)
    exact_hits = self._exact_match_documents(question, exact_terms)
    expanded_hits = self._expand_neighbors(exact_hits)
    relation_hits = self._expand_related_items(exact_hits)

    try:
        if self._collection.count() == 0:
            # Lazily populate an empty collection before querying it.
            logger.info("RAG 集合为空,开始重建知识库。")
            self.rebuild()
            if self._collection.count() == 0:
                logger.info(f"RAG 重建后仍无可用文档。question={_preview(question, 120)}")
                return "", []
        result = self._collection.query(
            query_embeddings=[self._embed(question)],
            n_results=self._semantic_candidate_count(),
            include=["documents", "metadatas", "distances"],
        )
    except Exception:
        logger.exception(f"RAG 检索异常,已降级为空上下文。question={_preview(question, 120)}")
        result = {}

    # Each result field is a list holding one inner list per query; only
    # one query is sent, so the first inner list is taken.
    docs, metas, distances, ids = (
        result.get(field, [[]])[0]
        for field in ("documents", "metadatas", "distances", "ids")
    )
    semantic_hits = self._semantic_hits(question, docs, metas, distances, ids)
    ranked = self._merge_and_rank(
        question, exact_terms, exact_hits, expanded_hits, relation_hits, semantic_hits, top_k
    )
    sources = [self._source(item["metadata"]) for item in ranked]
    context = self._format_evidence_context(ranked)
    logger.info(f"RAG 检索完成。question={_preview(question, 120)} hit_count={len(ranked)} exact_terms={exact_terms} sources={sources} context_preview={_preview(context)}")
    return context, sources
103
+
104
+ def knowledge_search(self, query: str, top_k: Optional[Any] = None) -> Tuple[Dict[str, Any], str]:
105
+ query = str(query or "").strip()
106
+ if not query:
107
+ return {"context": "", "sources": [], "related_tools": []}, "query_empty"
108
+
109
+ context, sources = self.search(query, top_k=self._parse_optional_top_k(top_k))
110
+ return {
111
+ "query": query,
112
+ "context": context,
113
+ "sources": sources,
114
+ "related_tools": self._extract_related_tools(sources),
115
+ }, "success"
116
+
117
+ def _ensure_collection(self) -> bool:
118
+ if self._collection:
119
+ return True
120
+ if not self._enabled:
121
+ return False
122
+ self._init_collection()
123
+ return self._collection is not None
124
+
125
+ @staticmethod
126
+ def _positive_int(value) -> int:
127
+ parsed = int(value)
128
+ if parsed < 1:
129
+ raise ValueError(f"配置值必须为正整数: {value}")
130
+ return parsed
131
+
132
+ def _parse_optional_top_k(self, value: Optional[Any]) -> Optional[int]:
133
+ if value in (None, ""):
134
+ return None
135
+ parsed = self._positive_int(value)
136
+ return min(parsed, self._context_limit())
137
+
138
+ @staticmethod
139
+ def _extract_related_tools(sources: List[Dict]) -> List[Dict[str, str]]:
140
+ related_tools: Dict[str, Dict[str, str]] = {}
141
+ for source in sources:
142
+ raw = source.get("related_items", "")
143
+ if not raw:
144
+ continue
145
+ text = raw if isinstance(raw, str) else json.dumps(raw, ensure_ascii=False)
146
+ for name in sorted(set(re.findall(r"\b[a-zA-Z][a-zA-Z0-9_]{2,}\b", text))):
147
+ if "_" not in name:
148
+ continue
149
+ related_tools.setdefault(name, {
150
+ "name": name,
151
+ "reason": "知识库关联能力提示;是否可执行以当前 tool definition 为准。",
152
+ })
153
+ return list(related_tools.values())
154
+
155
def _init_collection(self):
    """Open (or create) the persistent Chroma collection.

    When ``chromadb`` cannot be imported a warning is logged and the
    collection stays unset.  With rebuild-on-startup enabled the knowledge
    base is rebuilt right away; if that rebuild raises, the error is logged
    and ``_collection`` is reset to ``None``.
    """
    try:
        import chromadb
    except ImportError:
        logger.warning("未安装 chromadb,RAG 检索不可用。")
        return

    self._chroma = chromadb.PersistentClient(path=str(self._persist_dir))
    self._collection = self._chroma.get_or_create_collection(self._collection_name)
    if not self._rebuild_on_startup:
        return
    try:
        self.rebuild()
    except Exception:
        logger.exception("RAG 初始化重建失败,已关闭当前集合。")
        self._collection = None
170
+
171
def rebuild(self) -> int:
    """Embed every loaded knowledge chunk and upsert it into the collection.

    Chunks come from ``_load_documents``.  Embeddings are requested through
    ``_embed_batch`` in slices of ``_embedding_batch_size``.  Entries are
    upserted by id; this method does not delete anything from the collection.

    Returns:
        The number of chunks written, or 0 when no documents were loaded.

    Raises:
        RuntimeError: if the number of embeddings differs from the number
            of chunks.
        Exception: any error raised while embedding a batch is logged with
            the batch position and re-raised.
    """
    docs = self._load_documents()
    if not docs:
        logger.info("RAG 未加载到知识库文档,跳过重建。")
        return 0
    embeddings = []
    logger.info(f"RAG 开始生成文档向量。doc_chunks={len(docs)}")
    for start in range(0, len(docs), self._embedding_batch_size):
        batch_docs = docs[start:start + self._embedding_batch_size]
        # The last batch may be shorter than the configured batch size.
        batch_end = start + len(batch_docs)
        try:
            logger.info(
                f"RAG 批量生成文档向量中。progress={start + 1}-{batch_end}/{len(docs)} "
                f"batch_size={len(batch_docs)} "
                f"first_file={batch_docs[0]['metadata'].get('file_path', '')} "
                f"first_chunk_index={batch_docs[0]['metadata'].get('chunk_index', '')} "
                f"first_title={batch_docs[0]['metadata'].get('title', '')}"
            )
            embeddings.extend(self._embed_batch([doc["content"] for doc in batch_docs]))
            logger.info(
                f"RAG 批量生成文档向量完成。progress={batch_end}/{len(docs)} "
                f"batch_size={len(batch_docs)}"
            )
        except Exception:
            # Log enough about the failing batch to locate it, then let the
            # caller decide how to handle the failure.
            first_doc = batch_docs[0]
            first_metadata = first_doc["metadata"]
            logger.exception(
                f"RAG 批量文档向量生成失败。progress={start + 1}-{batch_end}/{len(docs)} "
                f"first_file={first_metadata.get('file_path', '')} "
                f"first_chunk_index={first_metadata.get('chunk_index', '')} "
                f"first_title={first_metadata.get('title', '')} "
                f"first_content_preview={_preview(first_doc['content'], 120)}"
            )
            raise
    # Guard against an embedding backend that returns fewer or more vectors
    # than inputs.
    if len(embeddings) != len(docs):
        raise RuntimeError(f"RAG 文档向量数量不匹配。docs={len(docs)} embeddings={len(embeddings)}")
    self._collection.upsert(
        ids=[d["id"] for d in docs],
        documents=[d["content"] for d in docs],
        metadatas=[d["metadata"] for d in docs],
        embeddings=embeddings,
    )
    logger.info(f"RAG 知识库重建完成。doc_chunks={len(docs)}")
    return len(docs)
215
+
216
+ def force_rebuild(self) -> Dict:
217
+ if not self._enabled:
218
+ return {
219
+ "enabled": False,
220
+ "status": "disabled",
221
+ "message": "RAG 未启用,未执行重建。",
222
+ }
223
+ if not self._ensure_collection():
224
+ return {
225
+ "enabled": True,
226
+ "status": "unavailable",
227
+ "message": "RAG 集合不可用,无法重建。",
228
+ }
229
+
230
+ logger.info(f"RAG 开始强制重建,将删除并重建集合。collection={self._collection_name}")
231
+ self._chroma.delete_collection(self._collection_name)
232
+ self._collection = self._chroma.get_or_create_collection(self._collection_name)
233
+ rebuilt_chunks = self.rebuild()
234
+ status = self.check()
235
+ status.update({
236
+ "status": "rebuilt",
237
+ "rebuilt_chunks": rebuilt_chunks,
238
+ })
239
+ return status
240
+
241
+ def check(self) -> Dict:
242
+ docs = self._load_documents()
243
+ expected_ids = {d["id"] for d in docs}
244
+ source_files = [str(path) for path in self._knowledge_dir.rglob("*.md")]
245
+ active_files = {str(d["metadata"].get("file_path", "")) for d in docs}
246
+
247
+ database = self._database_info()
248
+ result = {
249
+ "enabled": self._enabled,
250
+ "status": "unknown",
251
+ "collection": self._collection_name,
252
+ "is_synced": False,
253
+ "needs_rebuild": True,
254
+ "last_build_at": database.get("last_embedding_created_at"),
255
+ "database_updated_at": database.get("modified_at"),
256
+ "item_counts": {
257
+ "source_files": len(source_files),
258
+ "active_source_files": len(active_files),
259
+ "expected_chunks": len(docs),
260
+ "vector_chunks": 0,
261
+ "missing_chunks": len(expected_ids),
262
+ "stale_chunks": 0,
263
+ },
264
+ "config": {
265
+ "embedding_model": self._embedding_model,
266
+ "top_k": self._top_k,
267
+ "chunk_size": self._chunk_size,
268
+ },
269
+ }
270
+
271
+ if not self._ensure_collection():
272
+ result.update({
273
+ "status": "disabled" if not self._enabled else "unavailable",
274
+ "message": "RAG 未启用或集合不可用。",
275
+ })
276
+ return result
277
+
278
+ vector_count = self._collection.count()
279
+ result["item_counts"]["vector_chunks"] = vector_count
280
+ stored_ids = set()
281
+ if vector_count:
282
+ stored = self._collection.get(limit=vector_count, include=[])
283
+ stored_ids = set(stored.get("ids", []))
284
+
285
+ missing_ids = expected_ids - stored_ids
286
+ stale_ids = stored_ids - expected_ids
287
+ is_synced = not missing_ids and not stale_ids and vector_count == len(expected_ids)
288
+ result.update({
289
+ "status": "ready" if vector_count else "empty",
290
+ "is_synced": is_synced,
291
+ "needs_rebuild": not is_synced,
292
+ })
293
+ result["item_counts"].update({
294
+ "missing_chunks": len(missing_ids),
295
+ "stale_chunks": len(stale_ids),
296
+ })
297
+ return result
298
+
299
+ def _database_info(self) -> Dict:
300
+ db_path = self._persist_dir / "chroma.sqlite3"
301
+ info = {
302
+ "path": str(db_path),
303
+ "exists": db_path.exists(),
304
+ }
305
+ if not db_path.exists():
306
+ return info
307
+
308
+ stat = db_path.stat()
309
+ info.update({
310
+ "size_bytes": stat.st_size,
311
+ "modified_at": datetime.fromtimestamp(stat.st_mtime).isoformat(timespec="seconds"),
312
+ })
313
+ try:
314
+ with sqlite3.connect(str(db_path)) as conn:
315
+ row = conn.execute("select min(created_at), max(created_at) from embeddings").fetchone()
316
+ info.update({
317
+ "first_embedding_created_at": row[0] if row else None,
318
+ "last_embedding_created_at": row[1] if row else None,
319
+ })
320
+ except sqlite3.Error:
321
+ logger.exception(f"读取 Chroma SQLite 元信息失败。path={db_path}")
322
+ return info
323
+
324
+ def _load_documents(self) -> List[Dict]:
325
+ docs = []
326
+ for path in self._knowledge_dir.rglob("*.md"):
327
+ metadata, body = self._read_markdown(path)
328
+ if metadata.get("status", "active") != "active":
329
+ continue
330
+ for idx, chunk in enumerate(self._split(body)):
331
+ item_meta = self._clean_metadata({**metadata, "file_path": str(path), "chunk_index": idx})
332
+ searchable = self._searchable_content(item_meta, chunk)
333
+ docs.append({
334
+ "id": self._chunk_id(path, idx, chunk),
335
+ "content": searchable,
336
+ "metadata": item_meta,
337
+ })
338
+ return docs
339
+
340
+ def _semantic_candidate_count(self) -> int:
341
+ return self._semantic_candidate_k
342
+
343
+ def _context_limit(self) -> int:
344
+ return self._context_k
345
+
346
+ def _semantic_hits(
347
+ self,
348
+ question: str,
349
+ docs: List[str],
350
+ metas: List[Dict],
351
+ distances: List[float],
352
+ ids: List[str],
353
+ ) -> List[Dict]:
354
+ hits = []
355
+ for idx, (doc, meta) in enumerate(zip(docs, metas)):
356
+ distance = distances[idx] if idx < len(distances) else 1.0
357
+ hits.append({
358
+ "id": ids[idx] if idx < len(ids) else self._chunk_id_from_metadata(meta, doc),
359
+ "document": self._strip_search_prefix(doc),
360
+ "metadata": meta,
361
+ "distance": float(distance),
362
+ "score": (1.0 - float(distance)) + self._metadata_match_score(question, [], meta),
363
+ "match_type": "semantic",
364
+ "match_terms": [],
365
+ "confidence": "medium",
366
+ "usage_hint": "语义相似证据,可用于补充业务背景、相邻规范或相似场景。",
367
+ })
368
+ return hits
369
+
370
@staticmethod
def _chunk_id_from_metadata(metadata: Dict, document: str) -> str:
    """Recompute a chunk id from the ``file_path`` and ``chunk_index`` stored in metadata.

    Missing values fall back to an empty path and an empty index before
    being handed to ``_chunk_id``.
    """
    source_path = Path(str(metadata.get("file_path", "")))
    position = metadata.get("chunk_index", "")
    return RagService._chunk_id(source_path, position, document)
375
+
376
+ def _extract_exact_terms(self, question: str) -> List[str]:
377
+ terms = []
378
+ for pattern in self._CODE_PATTERNS:
379
+ terms.extend(match.group(0) for match in pattern.finditer(question))
380
+
381
+ for token in self._TOKEN_SPLIT_RE.split(question):
382
+ token = token.strip()
383
+ if len(token) < 3:
384
+ continue
385
+ if re.fullmatch(r"[A-Za-z]+", token) and len(token) < 6:
386
+ continue
387
+ if re.search(r"[A-Za-z0-9]", token) and not self._is_generic_ascii_token(token):
388
+ terms.append(token)
389
+
390
+ normalized = []
391
+ seen = set()
392
+ for term in terms:
393
+ term = term.strip("::,,。.;;()()[]【】")
394
+ key = term.lower()
395
+ if len(term) >= 3 and key not in seen:
396
+ seen.add(key)
397
+ normalized.append(term)
398
+ return normalized
399
+
400
+ @staticmethod
401
+ def _is_generic_ascii_token(token: str) -> bool:
402
+ return token.lower() in {
403
+ "json",
404
+ "http",
405
+ "https",
406
+ "api",
407
+ "xml",
408
+ "true",
409
+ "false",
410
+ "none",
411
+ "null",
412
+ "biz",
413
+ "offline",
414
+ }
415
+
416
+ def _exact_match_documents(self, question: str, exact_terms: List[str]) -> List[Dict]:
417
+ hits = []
418
+ for doc in self._load_documents():
419
+ metadata = doc["metadata"]
420
+ document = self._strip_search_prefix(doc["content"])
421
+ metadata_text = self._metadata_text(metadata)
422
+ metadata_terms = self._matched_terms(metadata_text, exact_terms)
423
+ body_terms = self._matched_terms(document, exact_terms)
424
+ name_terms = self._matched_name_terms(question, metadata)
425
+ matched_terms = sorted(set(metadata_terms + body_terms + name_terms), key=str.lower)
426
+ if not matched_terms:
427
+ continue
428
+ score = (
429
+ 3.0
430
+ + self._metadata_match_score(question, matched_terms, metadata)
431
+ + min(len(metadata_terms) * 0.5, 2.0)
432
+ + min(len(body_terms) * 1.5, 4.5)
433
+ + min(len(name_terms) * 1.2, 3.6)
434
+ )
435
+ hits.append({
436
+ "id": doc["id"],
437
+ "document": document,
438
+ "metadata": metadata,
439
+ "distance": None,
440
+ "score": score,
441
+ "match_type": "exact",
442
+ "match_terms": matched_terms,
443
+ "body_match_terms": body_terms,
444
+ "name_match_terms": name_terms,
445
+ "confidence": "high",
446
+ "usage_hint": "高置信精确命中,优先用于确定具体对象、字段、格式、编号或阈值。",
447
+ })
448
+ return self._prefer_direct_exact_hits(hits)
449
+
450
+ @staticmethod
451
+ def _prefer_direct_exact_hits(hits: List[Dict]) -> List[Dict]:
452
+ direct_by_file = {}
453
+ for hit in hits:
454
+ file_path = hit["metadata"].get("file_path", "")
455
+ if hit.get("body_match_terms") or hit.get("name_match_terms"):
456
+ direct_by_file[file_path] = True
457
+
458
+ filtered = [
459
+ hit
460
+ for hit in hits
461
+ if hit.get("body_match_terms")
462
+ or hit.get("name_match_terms")
463
+ or not direct_by_file.get(hit["metadata"].get("file_path", ""))
464
+ ]
465
+ return sorted(filtered, key=lambda item: item["score"], reverse=True)
466
+
467
+ @staticmethod
468
+ def _matched_name_terms(question: str, metadata: Dict) -> List[str]:
469
+ candidates = [
470
+ str(metadata.get("title", "")),
471
+ str(metadata.get("source_section", "")),
472
+ ]
473
+ terms = []
474
+ for candidate in candidates:
475
+ candidate = candidate.strip()
476
+ if len(candidate) >= 6 and candidate in question:
477
+ terms.append(candidate)
478
+ section_name = re.sub(r"^[A-Z]\.[0-9]+(?:\.[0-9]+)?\s*", "", candidate).strip()
479
+ if len(section_name) >= 6 and section_name in question:
480
+ terms.append(section_name)
481
+ return sorted(set(terms), key=len, reverse=True)
482
+
483
+ def _expand_neighbors(self, exact_hits: List[Dict]) -> List[Dict]:
484
+ if not exact_hits:
485
+ return []
486
+
487
+ docs = self._load_documents()
488
+ by_location = {
489
+ (doc["metadata"].get("file_path", ""), int(doc["metadata"].get("chunk_index", 0))): doc
490
+ for doc in docs
491
+ }
492
+ neighbors = []
493
+ seen = {hit["id"] for hit in exact_hits}
494
+ for hit in exact_hits[:3]:
495
+ metadata = hit["metadata"]
496
+ file_path = metadata.get("file_path", "")
497
+ try:
498
+ chunk_index = int(metadata.get("chunk_index", 0))
499
+ except (TypeError, ValueError):
500
+ continue
501
+ for offset in (-1, 1):
502
+ doc = by_location.get((file_path, chunk_index + offset))
503
+ if not doc or doc["id"] in seen:
504
+ continue
505
+ seen.add(doc["id"])
506
+ neighbors.append({
507
+ "id": doc["id"],
508
+ "document": self._strip_search_prefix(doc["content"]),
509
+ "metadata": doc["metadata"],
510
+ "distance": None,
511
+ "score": hit["score"] - 0.4,
512
+ "match_type": "neighbor_context",
513
+ "match_terms": hit.get("match_terms", []),
514
+ "confidence": "medium",
515
+ "usage_hint": "精确命中文档的邻近片段,可用于补充上下文、边界和使用要求。",
516
+ })
517
+ return neighbors
518
+
519
+ def _expand_related_items(self, exact_hits: List[Dict]) -> List[Dict]:
520
+ if not exact_hits:
521
+ return []
522
+
523
+ docs = self._load_documents()
524
+ relation_docs = {}
525
+ for doc in docs:
526
+ metadata = doc["metadata"]
527
+ if int(metadata.get("chunk_index", 0)) != 0:
528
+ continue
529
+ category = str(metadata.get("category", "")).strip()
530
+ for title in (metadata.get("subcategory", ""), metadata.get("title", "")):
531
+ title = str(title).strip()
532
+ if title:
533
+ relation_docs[(category, title)] = doc
534
+
535
+ related = []
536
+ seen = {hit["id"] for hit in exact_hits}
537
+ for hit in exact_hits[:3]:
538
+ metadata = hit["metadata"]
539
+ source_category = str(metadata.get("category", "")).strip()
540
+ for item in self._related_items(metadata):
541
+ target_category = str(item.get("大类标题") or item.get("category") or source_category).strip()
542
+ target_title = str(item.get("小类标题") or item.get("subcategory") or item.get("title") or "").strip()
543
+ if not target_title:
544
+ continue
545
+ doc = relation_docs.get((target_category, target_title)) or relation_docs.get(
546
+ (source_category, target_title))
547
+ if not doc:
548
+ doc = self._find_related_doc(docs, target_category or source_category, target_title)
549
+ if not doc or doc["id"] in seen:
550
+ continue
551
+ seen.add(doc["id"])
552
+ related.append({
553
+ "id": doc["id"],
554
+ "document": self._strip_search_prefix(doc["content"]),
555
+ "metadata": doc["metadata"],
556
+ "distance": None,
557
+ "score": hit["score"] - 0.2,
558
+ "match_type": "relation_context",
559
+ "match_terms": hit.get("match_terms", []),
560
+ "confidence": "medium",
561
+ "usage_hint": "知识小类关系证据,可用于说明同一知识大类下小类之间的关联、边界和相似场景。",
562
+ })
563
+ return related
564
+
565
+ @staticmethod
566
+ def _related_items(metadata: Dict) -> List[Dict]:
567
+ raw = metadata.get("related_items", "")
568
+ if not raw:
569
+ return []
570
+ if isinstance(raw, list):
571
+ return [item for item in raw if isinstance(item, dict)]
572
+ if isinstance(raw, str):
573
+ try:
574
+ parsed = json.loads(raw)
575
+ except json.JSONDecodeError:
576
+ return []
577
+ return [item for item in parsed if isinstance(item, dict)] if isinstance(parsed, list) else []
578
+ return []
579
+
580
+ @staticmethod
581
+ def _find_related_doc(docs: List[Dict], category: str, title: str) -> Dict:
582
+ title_lower = title.lower()
583
+ for doc in docs:
584
+ metadata = doc["metadata"]
585
+ if int(metadata.get("chunk_index", 0)) != 0:
586
+ continue
587
+ doc_category = str(metadata.get("category", "")).strip()
588
+ if category and doc_category and category != doc_category:
589
+ continue
590
+ candidates = [
591
+ str(metadata.get("subcategory", "")).strip(),
592
+ str(metadata.get("title", "")).strip(),
593
+ str(metadata.get("source_section", "")).strip(),
594
+ ]
595
+ if any(title_lower in candidate.lower() or candidate.lower() in title_lower for candidate in candidates if
596
+ candidate):
597
+ return doc
598
+ return {}
599
+
600
+ def _merge_and_rank(
601
+ self,
602
+ question: str,
603
+ exact_terms: List[str],
604
+ exact_hits: List[Dict],
605
+ expanded_hits: List[Dict],
606
+ relation_hits: List[Dict],
607
+ semantic_hits: List[Dict],
608
+ top_k: Optional[int] = None,
609
+ ) -> List[Dict]:
610
+ merged = {}
611
+ for item in [*semantic_hits, *relation_hits, *expanded_hits, *exact_hits]:
612
+ item = dict(item)
613
+ searchable = self._evidence_text(item["metadata"], item["document"])
614
+ matched_terms = self._matched_terms(searchable, exact_terms)
615
+ item["match_terms"] = sorted(set(item.get("match_terms", []) + matched_terms), key=str.lower)
616
+ item["score"] = item.get("score", 0.0) + self._metadata_match_score(question, item["match_terms"],
617
+ item["metadata"])
618
+ existing = merged.get(item["id"])
619
+ if existing is None or self._hit_priority(item) > self._hit_priority(existing):
620
+ merged[item["id"]] = item
621
+ elif existing is not None:
622
+ existing["match_terms"] = sorted(set(existing.get("match_terms", []) + item["match_terms"]),
623
+ key=str.lower)
624
+ existing["score"] = max(existing.get("score", 0.0), item.get("score", 0.0))
625
+
626
+ ranked = sorted(merged.values(), key=lambda item: self._hit_priority(item), reverse=True)
627
+ return self._diversify_hits(ranked, top_k or self._context_limit())
628
+
629
+ @staticmethod
630
+ def _hit_priority(item: Dict) -> Tuple[float, int]:
631
+ type_bonus = {
632
+ "exact": 3,
633
+ "neighbor_context": 2,
634
+ "relation_context": 2,
635
+ "semantic": 1,
636
+ }.get(item.get("match_type"), 0)
637
+ return float(item.get("score", 0.0)), type_bonus
638
+
639
+ def _diversify_hits(self, ranked: List[Dict], limit: int) -> List[Dict]:
640
+ selected = []
641
+ per_file = {}
642
+ exact_per_file = {}
643
+ exact_selected = 0
644
+ exact_limit = self._exact_context_k
645
+ exact_file_limit = self._exact_per_file_context_k
646
+ per_file_limit = self._per_file_context_k
647
+
648
+ for item in ranked:
649
+ file_path = item["metadata"].get("file_path", "")
650
+ match_type = item.get("match_type")
651
+ if match_type == "exact" and exact_selected >= exact_limit:
652
+ continue
653
+ if match_type == "exact" and exact_per_file.get(file_path, 0) >= exact_file_limit:
654
+ continue
655
+ if per_file.get(file_path, 0) >= per_file_limit and match_type != "exact":
656
+ continue
657
+ selected.append(item)
658
+ per_file[file_path] = per_file.get(file_path, 0) + 1
659
+ if match_type == "exact":
660
+ exact_selected += 1
661
+ exact_per_file[file_path] = exact_per_file.get(file_path, 0) + 1
662
+ if len(selected) >= limit:
663
+ break
664
+ return selected
665
+
666
+ def _metadata_match_score(self, question: str, matched_terms: List[str], metadata: Dict) -> float:
667
+ score = 0.0
668
+ weighted_fields = [
669
+ ("kb_id", 2.0),
670
+ ("title", 1.4),
671
+ ("source_section", 1.2),
672
+ ("tags", 1.6),
673
+ ("business_modules", 0.8),
674
+ ("source_doc", 0.4),
675
+ ("doc_type", 0.5),
676
+ ("domain", 0.4),
677
+ ("category", 0.3),
678
+ ("category_keywords", 0.3),
679
+ ("subcategory", 0.7),
680
+ ("related_items", 0.8),
681
+ ("related_categories", 0.2),
682
+ ("relation_notes", 0.2),
683
+ ]
684
+ question_lower = question.lower()
685
+ for field, weight in weighted_fields:
686
+ text = str(metadata.get(field, ""))
687
+ text_lower = text.lower()
688
+ for term in matched_terms:
689
+ if term.lower() in text_lower:
690
+ score += weight
691
+ for token in self._TOKEN_SPLIT_RE.split(text):
692
+ token = token.strip()
693
+ if len(token) >= 2 and token.lower() in question_lower:
694
+ score += min(weight, 0.12)
695
+ return min(score, 8.0)
696
+
697
+ @staticmethod
698
+ def _matched_terms(text: str, terms: List[str]) -> List[str]:
699
+ lower_text = text.lower()
700
+ return [term for term in terms if term.lower() in lower_text]
701
+
702
@staticmethod
def _evidence_text(metadata: Dict, document: str) -> str:
    """Return the flattened metadata text followed by the document, newline-separated."""
    header = RagService._metadata_text(metadata)
    return f"{header}\n{document}"
705
+
706
+ @staticmethod
707
+ def _metadata_text(metadata: Dict) -> str:
708
+ metadata_text = " ".join([
709
+ str(metadata.get("kb_id", "")),
710
+ str(metadata.get("title", "")),
711
+ str(metadata.get("source_doc", "")),
712
+ str(metadata.get("source_section", "")),
713
+ str(metadata.get("doc_type", "")),
714
+ str(metadata.get("domain", "")),
715
+ str(metadata.get("tags", "")),
716
+ str(metadata.get("business_modules", "")),
717
+ str(metadata.get("category", "")),
718
+ str(metadata.get("category_keywords", "")),
719
+ str(metadata.get("subcategory", "")),
720
+ str(metadata.get("related_items", "")),
721
+ str(metadata.get("related_categories", "")),
722
+ str(metadata.get("relation_notes", "")),
723
+ str(metadata.get("file_path", "")),
724
+ ])
725
+ return metadata_text
726
+
727
+ @staticmethod
728
+ def _format_evidence_context(ranked: List[Dict]) -> str:
729
+ group_names = {
730
+ "exact": "高置信精确命中",
731
+ "neighbor_context": "精确命中邻近上下文",
732
+ "relation_context": "知识小类关系上下文",
733
+ "semantic": "相关语义召回",
734
+ }
735
+ groups = []
736
+ for match_type, group_name in group_names.items():
737
+ items = [item for item in ranked if item.get("match_type") == match_type]
738
+ if not items:
739
+ continue
740
+ blocks = [f"【{group_name}】"]
741
+ for item in items:
742
+ metadata = item["metadata"]
743
+ terms = "、".join(item.get("match_terms", [])) or "无显式代码词命中"
744
+ blocks.append(
745
+ f"[{len(blocks)}] 标题:{metadata.get('title', '')}\n"
746
+ f"匹配类型:{item.get('match_type', '')}\n"
747
+ f"置信级别:{item.get('confidence', '')}\n"
748
+ f"命中词:{terms}\n"
749
+ f"使用建议:{item.get('usage_hint', '')}\n"
750
+ f"文档类型:{metadata.get('doc_type', '')}\n"
751
+ f"知识领域:{metadata.get('domain', '')}\n"
752
+ f"知识大类:{metadata.get('category', '')}\n"
753
+ f"关键词:{metadata.get('category_keywords', '')}\n"
754
+ f"知识小类:{metadata.get('subcategory', '')}\n"
755
+ f"关联信息:{metadata.get('related_items', '')}\n"
756
+ f"来源:{metadata.get('source_doc', '')} {metadata.get('source_section', '')}\n"
757
+ f"内容:{item.get('document', '')}"
758
+ )
759
+ groups.append("\n\n".join(blocks))
760
+ return "\n\n".join(groups)
761
+
762
+ @staticmethod
763
+ def _searchable_content(metadata: Dict, chunk: str) -> str:
764
+ prefix = "\n".join([
765
+ f"知识库ID:{metadata.get('kb_id', '')}",
766
+ f"标题:{metadata.get('title', '')}",
767
+ f"来源文档:{metadata.get('source_doc', '')}",
768
+ f"来源章节:{metadata.get('source_section', '')}",
769
+ f"文档类型:{metadata.get('doc_type', '')}",
770
+ f"知识领域:{metadata.get('domain', '')}",
771
+ f"标签:{metadata.get('tags', '')}",
772
+ f"业务模块:{metadata.get('business_modules', '')}",
773
+ f"知识大类:{metadata.get('category', '')}",
774
+ f"关键词:{metadata.get('category_keywords', '')}",
775
+ f"知识小类:{metadata.get('subcategory', '')}",
776
+ f"关联信息:{metadata.get('related_items', '')}",
777
+ ]).strip()
778
+ return f"{prefix}\n\n---CONTENT---\n{chunk}" if prefix else chunk
779
+
780
+ @staticmethod
781
+ def _strip_search_prefix(document: str) -> str:
782
+ marker = "---CONTENT---\n"
783
+ if marker in document:
784
+ return document.split(marker, 1)[1]
785
+ return document
786
+
787
+ @staticmethod
788
+ def _read_markdown(path: Path) -> Tuple[Dict, str]:
789
+ text = path.read_text(encoding="utf-8")
790
+ match = re.match(r"^---\n(.*?)\n---\n(.*)$", text, re.S)
791
+ if not match:
792
+ return {"title": path.stem, "status": "active"}, text
793
+ return yaml.safe_load(match.group(1)) or {}, match.group(2).strip()
794
+
795
+ def _split(self, text: str) -> List[str]:
796
+ max_chars = self._chunk_size
797
+ parts = re.split(r"\n(?=##\s+)", text)
798
+ chunks = []
799
+ for part in parts:
800
+ part = part.strip()
801
+ if not part:
802
+ continue
803
+ if len(part) <= max_chars:
804
+ chunks.append(part)
805
+ else:
806
+ chunks.extend(part[i:i + max_chars] for i in range(0, len(part), max_chars))
807
+ return chunks
808
+
809
+ def _embed(self, text: str) -> List[float]:
810
+ response = self._client.embeddings.create(
811
+ model=self._embedding_model,
812
+ input=text[: self._embedding_max_chars],
813
+ )
814
+ item = response.data[0]
815
+ return item["embedding"] if isinstance(item, dict) else item.embedding
816
+
817
+ def _embed_batch(self, texts: List[str]) -> List[List[float]]:
818
+ response = self._client.embeddings.create(
819
+ model=self._embedding_model,
820
+ input=[text[: self._embedding_max_chars] for text in texts],
821
+ )
822
+ embeddings = [
823
+ item["embedding"] if isinstance(item, dict) else item.embedding
824
+ for item in response.data
825
+ ]
826
+ if len(embeddings) != len(texts):
827
+ raise RuntimeError(f"embedding batch 返回数量不匹配。expected={len(texts)} actual={len(embeddings)}")
828
+ return embeddings
829
+
830
+ @staticmethod
831
+ def _clean_metadata(metadata: Dict) -> Dict:
832
+ cleaned = {}
833
+ for key, value in metadata.items():
834
+ if isinstance(value, (list, dict)):
835
+ cleaned[key] = json.dumps(value, ensure_ascii=False)
836
+ elif value is None:
837
+ cleaned[key] = ""
838
+ elif isinstance(value, (str, int, float, bool)):
839
+ cleaned[key] = value
840
+ else:
841
+ cleaned[key] = str(value)
842
+ return cleaned
843
+
844
+ @staticmethod
845
+ def _chunk_id(path: Path, idx: int, chunk: str) -> str:
846
+ digest = hashlib.md5(f"{path}:{idx}:{chunk}".encode("utf-8")).hexdigest()
847
+ return digest
848
+
849
+ @staticmethod
850
+ def _source(metadata: Dict) -> Dict:
851
+ return {
852
+ "kb_id": metadata.get("kb_id", ""),
853
+ "title": metadata.get("title", ""),
854
+ "doc_type": metadata.get("doc_type", ""),
855
+ "domain": metadata.get("domain", ""),
856
+ "category": metadata.get("category", ""),
857
+ "category_keywords": metadata.get("category_keywords", ""),
858
+ "source_doc_description": metadata.get("source_doc_description", ""),
859
+ "subcategory": metadata.get("subcategory", ""),
860
+ "related_items": metadata.get("related_items", ""),
861
+ "related_categories": metadata.get("related_categories", ""),
862
+ "relation_notes": metadata.get("relation_notes", ""),
863
+ "source_doc": metadata.get("source_doc", ""),
864
+ "source_section": metadata.get("source_section", ""),
865
+ "file_path": metadata.get("file_path", ""),
866
+ }