union_kb_ingest 1.0.10 → 1.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +5 -30
- package/app_config.py +21 -33
- package/config/config.yaml +1 -1
- package/ingest.py +148 -37
- package/input/function/.gitkeep +1 -0
- package/input/function/tools.yaml +27 -0
- package/normalizer.py +92 -403
- package/package.json +3 -2
- package/requirements.txt +2 -5
- package/writer.py +1 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Simon
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
CHANGED
|
@@ -12,23 +12,11 @@
|
|
|
12
12
|
|
|
13
13
|
启用大模型时,工具只会读取 `prompts/知识库建立规范.md` 作为格式和质量约束,并由代码按当前片段、辅助上下文和输出 JSON 结构组装生成提示词。模型需依据原文语义判断业务场景、模块、角色、标签和风险等级;代码中的启发式生成只作为未启用大模型时的兜底,不使用预设业务关键词去指导大模型输出。
|
|
14
14
|
|
|
15
|
-
##
|
|
16
|
-
|
|
17
|
-
不建议把这些依赖加入项目根 `requirements.txt`。离线机器单独安装即可:
|
|
15
|
+
## 安装依赖
|
|
18
16
|
|
|
19
17
|
```bash
|
|
20
18
|
python -m pip install -r requirements.txt
|
|
21
19
|
```
|
|
22
|
-
|
|
23
|
-
解析层不使用 OCR,不加载本地视觉/版面模型,也不访问远程模型服务:
|
|
24
|
-
|
|
25
|
-
- PDF:使用 `docling-parse` 抽取 PDF 内嵌文本和文本行顺序;扫描件或图片型 PDF 不会识别。
|
|
26
|
-
- DOCX:使用 Docling 的 Word 后端转为 Markdown。
|
|
27
|
-
- 旧版 `.doc`:通过 LibreOffice `soffice` 转为 `.docx` 后再解析;不使用 OCR。
|
|
28
|
-
- Markdown / TXT:作为已文本化材料直接读取。
|
|
29
|
-
|
|
30
|
-
不要安装 `docling` 或 `docling-slim[standard]`,它们会引入 OCR、版面/表格模型、Torch/ONNXRuntime 等重依赖,并可能在运行时下载模型。内网机器建议为离线工具单独准备 Python 3.10+ 环境。
|
|
31
|
-
|
|
32
20
|
## 基本用法
|
|
33
21
|
|
|
34
22
|
把文件放入:
|
|
@@ -37,6 +25,8 @@ python -m pip install -r requirements.txt
|
|
|
37
25
|
input/
|
|
38
26
|
```
|
|
39
27
|
|
|
28
|
+
工具调用示例放在 `input/function/tools.yaml` 中,可按业务需要替换。
|
|
29
|
+
|
|
40
30
|
生成知识库文件:
|
|
41
31
|
|
|
42
32
|
```bash
|
|
@@ -67,7 +57,7 @@ python ingest.py validate
|
|
|
67
57
|
|
|
68
58
|
## 大模型配置
|
|
69
59
|
|
|
70
|
-
|
|
60
|
+
默认不强制调用大模型;未启用时会使用启发式模板生成知识库文件,但强烈建议启用大模型分析。
|
|
71
61
|
|
|
72
62
|
如果要启用大模型整理,修改 `config/config.yaml`:
|
|
73
63
|
|
|
@@ -75,7 +65,7 @@ python ingest.py validate
|
|
|
75
65
|
llm:
|
|
76
66
|
enabled: true
|
|
77
67
|
base_url: "https://open.bigmodel.cn/api/paas/v4/"
|
|
78
|
-
api_key: "
|
|
68
|
+
api_key: ""
|
|
79
69
|
model: "glm-4.7"
|
|
80
70
|
timeout_seconds: 120
|
|
81
71
|
max_tokens: 8192
|
|
@@ -87,19 +77,4 @@ draft:
|
|
|
87
77
|
outline_max_sections: 40
|
|
88
78
|
```
|
|
89
79
|
|
|
90
|
-
也可以继续使用环境变量覆盖配置文件:
|
|
91
|
-
|
|
92
|
-
```bash
|
|
93
|
-
export KB_LLM_ENABLED=true
|
|
94
|
-
export KB_LLM_BASE_URL="https://open.bigmodel.cn/api/paas/v4/"
|
|
95
|
-
export KB_LLM_API_KEY="your-zhipu-api-key"
|
|
96
|
-
export KB_LLM_MODEL="glm-4.7"
|
|
97
|
-
```
|
|
98
|
-
|
|
99
80
|
工具通过 Z.AI 新版 Python SDK 调用中文智谱开放平台 GLM,依赖固定为 `zai-sdk==0.2.2`,客户端固定使用官方中文写法 `from zai import ZhipuAiClient`,`base_url` 使用 `https://open.bigmodel.cn/api/paas/v4/`。工具不再包含旧 `zhipuai` SDK、国际版 `ZaiClient` 或 OpenAI 调用路径,也不 import 项目 `src` 代码。
|
|
100
|
-
|
|
101
|
-
## 与线上项目的关系
|
|
102
|
-
|
|
103
|
-
这个工具只产出符合规范的 `*.md` 文件到 `result/`,后续由线上知识库加载流程处理。
|
|
104
|
-
|
|
105
|
-
建议线上打包时排除整个 `tools/kb_ingest` 目录。
|
package/app_config.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
import os
|
|
4
3
|
from dataclasses import dataclass
|
|
5
4
|
from functools import lru_cache
|
|
6
5
|
from pathlib import Path
|
|
@@ -15,7 +14,7 @@ DEFAULT_CONFIG_PATH = CURRENT_DIR / "config" / "config.yaml"
|
|
|
15
14
|
|
|
16
15
|
@dataclass(frozen=True)
|
|
17
16
|
class LlmConfig:
|
|
18
|
-
"""LLM
|
|
17
|
+
"""LLM 调用配置。"""
|
|
19
18
|
enabled: bool = False
|
|
20
19
|
base_url: str = ""
|
|
21
20
|
api_key: str = ""
|
|
@@ -35,67 +34,56 @@ class DraftConfig:
|
|
|
35
34
|
|
|
36
35
|
@lru_cache(maxsize=1)
def get_llm_config() -> LlmConfig:
    """Build the LLM call settings from the ``llm`` section of the config.

    A missing section, or one that is not a mapping, is treated as empty so
    every field falls back to its default. The result is cached after the
    first call.
    """
    section = _read_config().get("llm", {})
    llm = section if isinstance(section, dict) else {}

    # Collected in the same order as the dataclass fields are declared.
    settings = {
        "enabled": _as_bool(llm.get("enabled"), False),
        "base_url": str(llm.get("base_url") or ""),
        "api_key": str(llm.get("api_key") or ""),
        "model": str(llm.get("model") or ""),
        "timeout_seconds": _as_int(llm.get("timeout_seconds"), 120),
        "max_tokens": _as_int(llm.get("max_tokens"), 4096),
        "temperature": _as_float(llm.get("temperature"), 0.1),
    }
    return LlmConfig(**settings)
|
|
52
51
|
|
|
53
52
|
|
|
54
53
|
@lru_cache(maxsize=1)
def get_draft_config() -> DraftConfig:
    """Build the draft-generation settings from the ``draft`` config section.

    A missing section, or one that is not a mapping, is treated as empty so
    every field falls back to its default. The result is cached after the
    first call.
    """
    section = _read_config().get("draft", {})
    draft = section if isinstance(section, dict) else {}

    max_chars = _as_int(draft.get("max_chars"), 3600)
    context_chars = _as_int(draft.get("context_chars"), 800)
    outline_max_sections = _as_int(draft.get("outline_max_sections"), 40)
    return DraftConfig(
        max_chars=max_chars,
        context_chars=context_chars,
        outline_max_sections=outline_max_sections,
    )
|
|
66
65
|
|
|
67
66
|
|
|
68
67
|
def _read_config() -> Dict[str, Any]:
    """Load the YAML configuration file and return it as a dictionary.

    Returns an empty dictionary when the file does not exist, is empty, or
    its top-level YAML value is not a mapping.
    """
    config_file = DEFAULT_CONFIG_PATH
    if config_file.exists():
        raw_text = config_file.read_text(encoding="utf-8")
        parsed = yaml.safe_load(raw_text)
        # Falsy documents (empty file, null, empty mapping) collapse to {}.
        if parsed and isinstance(parsed, dict):
            return parsed
    return {}
|
|
75
73
|
|
|
76
74
|
|
|
77
|
-
def
|
|
78
|
-
"""
|
|
79
|
-
value = os.environ.get(name)
|
|
80
|
-
if value is None:
|
|
81
|
-
return default
|
|
82
|
-
return _as_bool(value, default)
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
def _env_int(name: str, value: Any, default: int) -> int:
|
|
86
|
-
"""读取整数环境变量并回退到默认值。"""
|
|
87
|
-
raw = os.environ.get(name, value)
|
|
75
|
+
def _as_int(value: Any, default: int) -> int:
|
|
76
|
+
"""把配置值转换为整数。"""
|
|
88
77
|
try:
|
|
89
|
-
return int(
|
|
78
|
+
return int(value)
|
|
90
79
|
except (TypeError, ValueError):
|
|
91
80
|
return default
|
|
92
81
|
|
|
93
82
|
|
|
94
|
-
def
|
|
95
|
-
"""
|
|
96
|
-
raw = os.environ.get(name, value)
|
|
83
|
+
def _as_float(value: Any, default: float) -> float:
|
|
84
|
+
"""把配置值转换为浮点数。"""
|
|
97
85
|
try:
|
|
98
|
-
return float(
|
|
86
|
+
return float(value)
|
|
99
87
|
except (TypeError, ValueError):
|
|
100
88
|
return default
|
|
101
89
|
|
package/config/config.yaml
CHANGED
package/ingest.py
CHANGED
|
@@ -10,12 +10,14 @@ from pathlib import Path
|
|
|
10
10
|
from typing import List
|
|
11
11
|
from uuid import uuid4
|
|
12
12
|
|
|
13
|
+
import yaml
|
|
14
|
+
|
|
13
15
|
CURRENT_DIR = Path(__file__).resolve().parent
|
|
14
16
|
if str(CURRENT_DIR) not in sys.path:
|
|
15
17
|
sys.path.insert(0, str(CURRENT_DIR))
|
|
16
18
|
|
|
17
19
|
from app_config import get_draft_config
|
|
18
|
-
from normalizer import normalize_block
|
|
20
|
+
from normalizer import fallback_failed_block, normalize_block
|
|
19
21
|
from parser import iter_input_files, parse_document
|
|
20
22
|
from schemas import ParsedBlock
|
|
21
23
|
from splitter import split_blocks
|
|
@@ -58,6 +60,8 @@ def cmd_draft(args) -> int:
|
|
|
58
60
|
return 0
|
|
59
61
|
if action == "rebuild":
|
|
60
62
|
_clear_generated_files(output_dir)
|
|
63
|
+
elif action == "retry":
|
|
64
|
+
return _retry_failed_files(output_dir, status=args.status)
|
|
61
65
|
elif action == "resume":
|
|
62
66
|
resume_state = _load_progress(progress_path)
|
|
63
67
|
if not resume_state:
|
|
@@ -114,39 +118,22 @@ def cmd_draft(args) -> int:
|
|
|
114
118
|
try:
|
|
115
119
|
items = normalize_block(block, status=args.status)
|
|
116
120
|
except SystemExit as exc:
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
f"file={path.name} block={block_index + 1}/{len(blocks)} "
|
|
134
|
-
f"progress={progress_path}"
|
|
135
|
-
)
|
|
136
|
-
raise
|
|
137
|
-
for item in items:
|
|
138
|
-
source_order += 1
|
|
139
|
-
item.source_order = source_order
|
|
140
|
-
item.source_pages = sorted(set(block.pages))
|
|
141
|
-
item.source_trace = _source_trace(block)
|
|
142
|
-
write_item(
|
|
143
|
-
item,
|
|
144
|
-
output_dir,
|
|
145
|
-
source_title=Path(block.source_doc).stem,
|
|
146
|
-
timestamp=run_timestamp,
|
|
147
|
-
trace_id=run_trace_id,
|
|
148
|
-
)
|
|
149
|
-
total_items += 1
|
|
121
|
+
print(f"WARNING: block failed with SystemExit({exc.code}); writing failed fallback")
|
|
122
|
+
items = fallback_failed_block(block, status=args.status)
|
|
123
|
+
except Exception as exc:
|
|
124
|
+
print(f"WARNING: block failed with {type(exc).__name__}: {exc}; writing failed fallback")
|
|
125
|
+
items = fallback_failed_block(block, status=args.status)
|
|
126
|
+
if not items:
|
|
127
|
+
items = fallback_failed_block(block, status=args.status)
|
|
128
|
+
source_order, written = _write_items(
|
|
129
|
+
items,
|
|
130
|
+
output_dir,
|
|
131
|
+
block,
|
|
132
|
+
source_order=source_order,
|
|
133
|
+
run_timestamp=run_timestamp,
|
|
134
|
+
run_trace_id=run_trace_id,
|
|
135
|
+
)
|
|
136
|
+
total_items += written
|
|
150
137
|
_save_progress(
|
|
151
138
|
progress_path,
|
|
152
139
|
input_path=input_path,
|
|
@@ -178,20 +165,144 @@ def _list_effective_files(path: Path) -> list[Path]:
|
|
|
178
165
|
)
|
|
179
166
|
|
|
180
167
|
|
|
168
|
+
def _write_items(
    items,
    output_dir: Path,
    block: ParsedBlock,
    *,
    source_order: int,
    run_timestamp: str,
    run_trace_id: str,
) -> tuple[int, int]:
    """Stamp each item with its source metadata and write it to disk.

    Every item receives the next consecutive ``source_order`` value, the
    sorted unique pages of ``block`` and the block's source trace before it
    is handed to ``write_item``.

    Returns:
        A ``(last_source_order, written_count)`` tuple; with no items the
        incoming ``source_order`` is returned unchanged alongside ``0``.
    """
    first_order = source_order
    for position, entry in enumerate(items, start=1):
        source_order = first_order + position
        entry.source_order = source_order
        entry.source_pages = sorted(set(block.pages))
        entry.source_trace = _source_trace(block)
        write_item(
            entry,
            output_dir,
            source_title=Path(block.source_doc).stem,
            timestamp=run_timestamp,
            trace_id=run_trace_id,
        )
    return source_order, source_order - first_order
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _retry_failed_files(output_dir: Path, status: str) -> int:
    """Re-run normalization for output files whose name marks them as failed.

    A file is selected when its stem contains "failed" (case-insensitive).
    Its chunk source is rebuilt into a block and normalized again; when that
    raises or yields nothing, a failed fallback item is written instead. The
    old file is removed once replacement items have been written.

    Returns:
        Always ``0``.
    """
    failed_files = []
    for candidate in _list_effective_files(output_dir):
        if "failed" in candidate.stem.lower():
            failed_files.append(candidate)
    if len(failed_files) == 0:
        print("done. failed_files=0 retried=0")
        return 0

    run_timestamp = _make_timestamp()
    run_trace_id = uuid4().hex[:8]
    retried = succeeded = still_failed = 0
    for path in failed_files:
        block = _block_from_failed_file(path)
        if not block:
            print(f"WARNING: skipped failed file without chunk source: {path}")
            continue

        # Any failure leaves ``items`` empty so the fallback below takes over.
        items = None
        try:
            items = normalize_block(block, status=status)
        except SystemExit as exc:
            print(f"WARNING: retry failed with SystemExit({exc.code}); keeping failed fallback: {path}")
        except Exception as exc:
            print(f"WARNING: retry failed with {type(exc).__name__}: {exc}; keeping failed fallback: {path}")
        if not items:
            items = fallback_failed_block(block, status=status)

        # _write_items increments before assigning, so start one below the
        # block's recorded order.
        start_order = max(int(block.order or 0) - 1, 0)
        written = _write_items(
            items,
            output_dir,
            block,
            source_order=start_order,
            run_timestamp=run_timestamp,
            run_trace_id=run_trace_id,
        )[1]
        if not written:
            continue
        path.unlink()
        retried += 1
        if any(item.review_status == "failed" for item in items):
            still_failed += 1
        else:
            succeeded += 1
    print(f"done. failed_files={len(failed_files)} retried={retried} succeeded={succeeded} still_failed={still_failed}")
    return 0
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def _block_from_failed_file(path: Path) -> ParsedBlock | None:
    """Rebuild a ``ParsedBlock`` from a previously written failed file.

    The block content comes from the file's ``## failed_chunk_source``
    section and the remaining fields from its YAML front matter.

    Args:
        path: Failed output file to read (UTF-8).

    Returns:
        The reconstructed block, or ``None`` when the file carries no chunk
        source.
    """
    text = path.read_text(encoding="utf-8")
    metadata = _front_matter(text)
    source = _failed_chunk_source(text)
    if not source:
        return None

    def listed(key: str) -> list:
        # Tolerate a scalar where a list is expected instead of raising
        # (ints) or iterating a string character by character.
        value = metadata.get(key)
        if not value:
            return []
        if isinstance(value, (list, tuple)):
            return list(value)
        return [value]

    def text_of(key: str, fallback: str = "") -> str:
        return str(metadata.get(key) or fallback)

    # A malformed order must not abort the whole retry run.
    try:
        order = int(metadata.get("source_order") or 0)
    except (TypeError, ValueError):
        order = 0

    # isdecimal() accepts exactly the digit strings int() can parse, whereas
    # isdigit() also accepts characters such as superscripts that int() rejects.
    pages = [int(page) for page in listed("source_pages") if str(page).isdecimal()]

    return ParsedBlock(
        source_doc=text_of("source_doc", path.stem),
        source_section=text_of("source_section"),
        content=source,
        pages=pages,
        order=order,
        category=text_of("category"),
        category_keywords=[str(item) for item in listed("category_keywords")],
        source_doc_description=text_of("source_doc_description"),
        subcategory=text_of("subcategory"),
        category_path=[str(item) for item in listed("category_path")],
        related_categories=[str(item) for item in listed("related_categories")],
        relation_notes=[str(item) for item in listed("relation_notes")],
        related_items=metadata.get("related_items") or [],
    )
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _front_matter(text: str) -> dict:
    """Parse the leading YAML front matter of ``text``.

    The front matter must open with a ``---`` line at the very start and be
    closed by a later ``---`` line. Returns an empty dictionary when either
    delimiter is missing or the parsed YAML is empty or not a mapping.
    """
    opening = "---\n"
    if text[:len(opening)] != opening:
        return {}
    closing_at = text.find("\n---\n", len(opening))
    if closing_at == -1:
        return {}
    parsed = yaml.safe_load(text[len(opening):closing_at])
    if parsed and isinstance(parsed, dict):
        return parsed
    return {}
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def _failed_chunk_source(text: str) -> str:
|
|
275
|
+
marker = "## failed_chunk_source"
|
|
276
|
+
start = text.find(marker)
|
|
277
|
+
if start < 0:
|
|
278
|
+
return ""
|
|
279
|
+
source = text[start + len(marker):].strip()
|
|
280
|
+
if source.startswith("```"):
|
|
281
|
+
first_line = source.find("\n")
|
|
282
|
+
if first_line >= 0:
|
|
283
|
+
source = source[first_line + 1:]
|
|
284
|
+
if source.endswith("```"):
|
|
285
|
+
source = source[:-3]
|
|
286
|
+
return source.strip()
|
|
287
|
+
|
|
288
|
+
|
|
181
289
|
def _choose_existing_result_action(output_dir: Path, existing: list[Path]) -> str:
|
|
182
290
|
"""询问用户如何处理已有生成结果。"""
|
|
183
291
|
print(f"found {len(existing)} existing file(s) in {output_dir}.")
|
|
184
292
|
print("Choose how to continue:")
|
|
185
293
|
print("1. delete and rebuild")
|
|
186
294
|
print("2. resume from checkpoint")
|
|
187
|
-
print("3.
|
|
188
|
-
|
|
189
|
-
|
|
295
|
+
print("3. retry failed files")
|
|
296
|
+
print("4. exit")
|
|
297
|
+
answer = input("Select [1/2/3/4]: ").strip().lower().translate(
|
|
298
|
+
str.maketrans({"1": "1", "2": "2", "3": "3", "4": "4"})
|
|
190
299
|
)
|
|
191
300
|
if answer.startswith("1") or answer in {"d", "delete", "rebuild", "r"}:
|
|
192
301
|
return "rebuild"
|
|
193
302
|
if answer.startswith("2") or answer in {"resume", "continue", "c"}:
|
|
194
303
|
return "resume"
|
|
304
|
+
if answer.startswith("3") or answer in {"retry", "failed", "f"}:
|
|
305
|
+
return "retry"
|
|
195
306
|
return "exit"
|
|
196
307
|
|
|
197
308
|
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
tools:
|
|
2
|
+
- name: "query_member_change_announcements"
|
|
3
|
+
display_name: "查询成员机构变更通知"
|
|
4
|
+
description: "查询成员机构近期变更、变更报备、计划变更、执行评价以及影响范围。"
|
|
5
|
+
trigger: "当用户询问成员机构变更通知、变更报备、计划变更或执行评价时优先调用。"
|
|
6
|
+
input_schema:
|
|
7
|
+
type: "object"
|
|
8
|
+
properties:
|
|
9
|
+
orgCode:
|
|
10
|
+
type: "string"
|
|
11
|
+
description: "成员机构金融编码,通常由 resolve_member_org 工具获得。"
|
|
12
|
+
required: ["orgCode"]
|
|
13
|
+
output_schema:
|
|
14
|
+
type: "array"
|
|
15
|
+
description: "匹配到的成员机构变更通知列表。"
|
|
16
|
+
items:
|
|
17
|
+
type: "object"
|
|
18
|
+
properties:
|
|
19
|
+
member_code:
|
|
20
|
+
type: "string"
|
|
21
|
+
description: "成员机构编码。"
|
|
22
|
+
member_name:
|
|
23
|
+
type: "string"
|
|
24
|
+
description: "成员机构名称。"
|
|
25
|
+
change_type:
|
|
26
|
+
type: "string"
|
|
27
|
+
description: "变更类型。"
|
package/normalizer.py
CHANGED
|
@@ -15,8 +15,8 @@ from app_config import get_llm_config
|
|
|
15
15
|
from schemas import DOC_TYPES, KnowledgeItem, ParsedBlock
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
DEFAULT_DOMAIN = "
|
|
19
|
-
DEFAULT_OWNER = "
|
|
18
|
+
DEFAULT_DOMAIN = "通用业务"
|
|
19
|
+
DEFAULT_OWNER = "通用知识库"
|
|
20
20
|
CURRENT_DIR = Path(__file__).resolve().parent
|
|
21
21
|
KB_SPEC_PATH = CURRENT_DIR / "prompts" / "知识库建立规范.md"
|
|
22
22
|
TOOLS_PATH = CURRENT_DIR / "input" / "function" / "tools.yaml"
|
|
@@ -37,15 +37,23 @@ def normalize_block(block: ParsedBlock, status: str = "draft") -> List[Knowledge
|
|
|
37
37
|
items = _normalize_with_llm(block, status=status)
|
|
38
38
|
if items:
|
|
39
39
|
return [_postprocess_item(item, block) for item in items]
|
|
40
|
-
|
|
40
|
+
print("WARNING: model returned no valid knowledge items; using heuristic fallback")
|
|
41
|
+
return fallback_failed_block(block, status=status)
|
|
41
42
|
return [_postprocess_item(_normalize_heuristically(block, status=status), block)]
|
|
42
43
|
|
|
43
44
|
|
|
45
|
+
def fallback_failed_block(block: ParsedBlock, status: str = "draft") -> List[KnowledgeItem]:
    """Produce a single fallback item, marked as failed, using offline rules.

    The block is normalized heuristically, post-processed, and then flagged
    as an LLM failure. The result is always a one-element list.
    """
    heuristic_item = _normalize_heuristically(block, status=status)
    processed_item = _postprocess_item(heuristic_item, block)
    failed_item = _mark_llm_failed_item(processed_item, block)
    return [failed_item]
|
|
49
|
+
|
|
50
|
+
|
|
44
51
|
def _normalize_with_llm(block: ParsedBlock, status: str) -> List[KnowledgeItem]:
|
|
45
52
|
"""调用 LLM 生成条目并处理重试。"""
|
|
46
53
|
config = get_llm_config()
|
|
47
54
|
if not (config.base_url and config.api_key and config.model):
|
|
48
|
-
|
|
55
|
+
print("WARNING: missing base_url, api_key, or model; using heuristic fallback")
|
|
56
|
+
return []
|
|
49
57
|
|
|
50
58
|
base_prompt = _build_prompt(block, status)
|
|
51
59
|
started_at = time.monotonic()
|
|
@@ -61,22 +69,15 @@ def _normalize_with_llm(block: ParsedBlock, status: str) -> List[KnowledgeItem]:
|
|
|
61
69
|
return []
|
|
62
70
|
client = client_cls(api_key=config.api_key, base_url=config.base_url)
|
|
63
71
|
|
|
64
|
-
compact_retry = False
|
|
65
72
|
coverage_retry_count = 0
|
|
66
|
-
|
|
67
|
-
json_retry_feedback = ""
|
|
73
|
+
messages = _messages(base_prompt)
|
|
68
74
|
for attempt in range(1, LLM_MAX_RETRIES + 1):
|
|
69
|
-
prompt = _compact_retry_prompt(base_prompt) if compact_retry else base_prompt
|
|
70
|
-
if json_retry_feedback:
|
|
71
|
-
prompt = f"{prompt}\n\n{json_retry_feedback}"
|
|
72
|
-
if coverage_retry_feedback:
|
|
73
|
-
prompt = f"{prompt}\n\n{coverage_retry_feedback}"
|
|
74
75
|
try:
|
|
75
76
|
print(
|
|
76
77
|
"llm request: "
|
|
77
78
|
f"provider=zhipu base_url={config.base_url} attempt={attempt}/{LLM_MAX_RETRIES}"
|
|
78
79
|
)
|
|
79
|
-
response = _create_zhipu_completion(client, config,
|
|
80
|
+
response = _create_zhipu_completion(client, config, messages)
|
|
80
81
|
except Exception as exc:
|
|
81
82
|
elapsed = time.monotonic() - started_at
|
|
82
83
|
print(
|
|
@@ -85,7 +86,8 @@ def _normalize_with_llm(block: ParsedBlock, status: str) -> List[KnowledgeItem]:
|
|
|
85
86
|
f"after {elapsed:.1f}s detail={exc}"
|
|
86
87
|
)
|
|
87
88
|
if attempt >= LLM_MAX_RETRIES:
|
|
88
|
-
|
|
89
|
+
print(f"WARNING: request failed after {LLM_MAX_RETRIES} attempts: {type(exc).__name__}; using heuristic fallback")
|
|
90
|
+
return []
|
|
89
91
|
time.sleep(min(2 ** (attempt - 1), 30))
|
|
90
92
|
continue
|
|
91
93
|
|
|
@@ -107,14 +109,22 @@ def _normalize_with_llm(block: ParsedBlock, status: str) -> List[KnowledgeItem]:
|
|
|
107
109
|
f"response={_response_debug(response)}"
|
|
108
110
|
)
|
|
109
111
|
if attempt >= LLM_MAX_RETRIES:
|
|
110
|
-
|
|
112
|
+
print(f"WARNING: empty response content after {LLM_MAX_RETRIES} attempts; using heuristic fallback")
|
|
113
|
+
return []
|
|
114
|
+
_append_retry_messages(
|
|
115
|
+
messages,
|
|
116
|
+
content,
|
|
117
|
+
"重试补充要求:上一轮响应为空。请只返回一个 JSON object,根节点必须严格为 {\"items\": [...]}。",
|
|
118
|
+
)
|
|
111
119
|
time.sleep(min(2 ** (attempt - 1), 30))
|
|
112
120
|
continue
|
|
113
121
|
|
|
114
122
|
parse_result = _extract_json_with_diagnostics(content)
|
|
115
123
|
parsed = parse_result.value
|
|
116
124
|
if not parsed:
|
|
117
|
-
|
|
125
|
+
retry_prompt = _json_repair_retry_prompt(parse_result.error, content)
|
|
126
|
+
if _looks_truncated(content, finish_reason):
|
|
127
|
+
retry_prompt += "\n上一轮响应疑似被截断,本轮请压缩表述但保持 {\"items\": [...]} 根结构。"
|
|
118
128
|
print(
|
|
119
129
|
"llm parse failed: response is not valid JSON "
|
|
120
130
|
f"finish_reason={finish_reason or 'unknown'} "
|
|
@@ -123,15 +133,17 @@ def _normalize_with_llm(block: ParsedBlock, status: str) -> List[KnowledgeItem]:
|
|
|
123
133
|
f"preview={_preview(content)}"
|
|
124
134
|
)
|
|
125
135
|
if attempt >= LLM_MAX_RETRIES:
|
|
126
|
-
|
|
127
|
-
|
|
136
|
+
print(f"WARNING: response is not valid JSON after {LLM_MAX_RETRIES} attempts; using heuristic fallback")
|
|
137
|
+
return []
|
|
138
|
+
_append_retry_messages(messages, content, retry_prompt)
|
|
128
139
|
time.sleep(min(2 ** (attempt - 1), 30))
|
|
129
140
|
continue
|
|
130
|
-
json_retry_feedback = ""
|
|
131
141
|
|
|
132
142
|
raw_items = _coerce_raw_items(parsed)
|
|
133
143
|
if not isinstance(raw_items, list):
|
|
134
|
-
|
|
144
|
+
retry_prompt = _json_shape_retry_prompt(parsed)
|
|
145
|
+
if _looks_truncated(content, finish_reason):
|
|
146
|
+
retry_prompt += "\n上一轮响应疑似被截断,本轮请压缩表述但保持 {\"items\": [...]} 根结构。"
|
|
135
147
|
print(
|
|
136
148
|
"llm parse failed: JSON does not contain an items list "
|
|
137
149
|
f"finish_reason={finish_reason or 'unknown'} "
|
|
@@ -139,11 +151,11 @@ def _normalize_with_llm(block: ParsedBlock, status: str) -> List[KnowledgeItem]:
|
|
|
139
151
|
f"top_level={_json_shape(parsed)} preview={_preview(content)}"
|
|
140
152
|
)
|
|
141
153
|
if attempt >= LLM_MAX_RETRIES:
|
|
142
|
-
|
|
143
|
-
|
|
154
|
+
print(f"WARNING: JSON does not contain an items list after {LLM_MAX_RETRIES} attempts; using heuristic fallback")
|
|
155
|
+
return []
|
|
156
|
+
_append_retry_messages(messages, content, retry_prompt)
|
|
144
157
|
time.sleep(min(2 ** (attempt - 1), 30))
|
|
145
158
|
continue
|
|
146
|
-
json_retry_feedback = ""
|
|
147
159
|
|
|
148
160
|
items: List[KnowledgeItem] = []
|
|
149
161
|
for idx, raw in enumerate(raw_items, start=1):
|
|
@@ -154,37 +166,37 @@ def _normalize_with_llm(block: ParsedBlock, status: str) -> List[KnowledgeItem]:
|
|
|
154
166
|
if items:
|
|
155
167
|
coverage_issues = _source_fact_coverage_issues(block, items)
|
|
156
168
|
if coverage_issues:
|
|
157
|
-
high_relevance_issues = _high_relevance_coverage_issues(
|
|
158
|
-
client, config, block, items, coverage_issues
|
|
159
|
-
)
|
|
160
|
-
if not high_relevance_issues:
|
|
161
|
-
print(
|
|
162
|
-
"llm coverage warning ignored: "
|
|
163
|
-
"no highly relevant missing facts after relevance review"
|
|
164
|
-
)
|
|
165
|
-
return items
|
|
166
169
|
print(
|
|
167
170
|
"llm coverage failed: "
|
|
168
171
|
f"missing_facts={len(coverage_issues)} "
|
|
169
|
-
f"
|
|
170
|
-
f"preview={_preview(';'.join(high_relevance_issues[:3]))}"
|
|
172
|
+
f"preview={_preview(';'.join(coverage_issues[:3]))}"
|
|
171
173
|
)
|
|
172
174
|
if coverage_retry_count >= COVERAGE_MAX_RETRIES or attempt >= LLM_MAX_RETRIES:
|
|
173
175
|
print(
|
|
174
176
|
"WARNING: source fact coverage failed after "
|
|
175
177
|
f"{coverage_retry_count} coverage retries; releasing draft for manual review"
|
|
176
178
|
)
|
|
177
|
-
return _items_with_coverage_warning(items, block,
|
|
179
|
+
return _items_with_coverage_warning(items, block, coverage_issues)
|
|
178
180
|
coverage_retry_count += 1
|
|
179
|
-
|
|
181
|
+
_append_retry_messages(
|
|
182
|
+
messages,
|
|
183
|
+
content,
|
|
184
|
+
_coverage_retry_prompt(block, coverage_issues, items),
|
|
185
|
+
)
|
|
180
186
|
time.sleep(min(2 ** (attempt - 1), 30))
|
|
181
187
|
continue
|
|
182
188
|
return items
|
|
183
189
|
if attempt >= LLM_MAX_RETRIES:
|
|
184
|
-
|
|
190
|
+
print(f"WARNING: items list contained no valid objects after {LLM_MAX_RETRIES} attempts; using heuristic fallback")
|
|
191
|
+
return []
|
|
192
|
+
_append_retry_messages(
|
|
193
|
+
messages,
|
|
194
|
+
content,
|
|
195
|
+
"重试补充要求:上一轮 items 数组没有可用对象。请返回 {\"items\": [...]},items 中每个元素都必须是知识库条目对象。",
|
|
196
|
+
)
|
|
185
197
|
time.sleep(min(2 ** (attempt - 1), 30))
|
|
186
198
|
|
|
187
|
-
|
|
199
|
+
return []
|
|
188
200
|
|
|
189
201
|
|
|
190
202
|
def _system_message() -> str:
|
|
@@ -204,176 +216,17 @@ def _messages(prompt: str) -> List[Dict[str, str]]:
|
|
|
204
216
|
]
|
|
205
217
|
|
|
206
218
|
|
|
207
|
-
def _compact_retry_prompt(base_prompt: str) -> str:
|
|
208
|
-
"""构造输出截断后的紧凑重试提示。"""
|
|
209
|
-
return (
|
|
210
|
-
base_prompt
|
|
211
|
-
+ "\n\n重试补充要求:上一次输出疑似过长或结构不完整。"
|
|
212
|
-
"本次必须只生成 1 个 item,保留规范要求的正文 5 个章节,但不能丢失原文事实句。"
|
|
213
|
-
"不要省略 JSON 外层 items,不要输出多个条目,不要输出解释文字。"
|
|
214
|
-
)
|
|
215
219
|
|
|
216
220
|
|
|
217
|
-
def _high_relevance_coverage_issues(
|
|
218
|
-
client,
|
|
219
|
-
config,
|
|
220
|
-
block: ParsedBlock,
|
|
221
|
-
items: List[KnowledgeItem],
|
|
222
|
-
missing_facts: List[str],
|
|
223
|
-
) -> List[str]:
|
|
224
|
-
"""让 LLM 判断缺失事实是否与当前条目高度相关。"""
|
|
225
|
-
if not missing_facts:
|
|
226
|
-
return []
|
|
227
|
-
prompt = _coverage_relevance_prompt(block, items, missing_facts)
|
|
228
|
-
try:
|
|
229
|
-
response = _create_zhipu_completion(client, config, prompt)
|
|
230
|
-
content = _extract_response_content(response)
|
|
231
|
-
parsed = _extract_json_with_diagnostics(content).value
|
|
232
|
-
high_relevance = _high_relevance_facts_from_analysis(parsed, missing_facts)
|
|
233
|
-
if high_relevance is not None:
|
|
234
|
-
print(
|
|
235
|
-
"llm coverage relevance: "
|
|
236
|
-
f"missing_facts={len(missing_facts)} high_relevance={len(high_relevance)}"
|
|
237
|
-
)
|
|
238
|
-
return high_relevance
|
|
239
|
-
except Exception as exc:
|
|
240
|
-
print(f"llm coverage relevance failed: {type(exc).__name__} detail={exc}")
|
|
241
221
|
|
|
242
|
-
fallback = _fallback_high_relevance_coverage_issues(block, items, missing_facts)
|
|
243
|
-
print(
|
|
244
|
-
"llm coverage relevance fallback: "
|
|
245
|
-
f"missing_facts={len(missing_facts)} high_relevance={len(fallback)}"
|
|
246
|
-
)
|
|
247
|
-
return fallback
|
|
248
222
|
|
|
249
223
|
|
|
250
|
-
def _coverage_relevance_prompt(
|
|
251
|
-
block: ParsedBlock,
|
|
252
|
-
items: List[KnowledgeItem],
|
|
253
|
-
missing_facts: List[str],
|
|
254
|
-
) -> str:
|
|
255
|
-
"""构造缺失事实相关性判定提示。"""
|
|
256
|
-
fact_lines = "\n".join(f"- {fact}" for fact in missing_facts[:20])
|
|
257
|
-
current_items = "\n\n".join(
|
|
258
|
-
f"标题:{item.title}\n核心正文:{_core_sections_for_coverage(item.body)}"
|
|
259
|
-
for item in items
|
|
260
|
-
)
|
|
261
|
-
return f"""
|
|
262
|
-
请判断以下“覆盖校验缺失事实”是否与当前知识条目的主题极高相关。
|
|
263
|
-
|
|
264
|
-
判定规则:
|
|
265
|
-
1. 只有缺失事实是回答当前条目标题或核心正文所必须保留的定义、规则、阈值、条件、主体、简称、例外或限制时,才标记为“极高”。
|
|
266
|
-
2. 来源文件标题、章节标题、目录项、上级主题名称、页眉页脚、纯标签、仅用于定位的小标题,通常不是“极高”,除非它本身就是当前条目要解释的完整定义或规则。
|
|
267
|
-
3. 辅助上下文只用于理解位置和主题,不要把辅助上下文中独有的信息作为缺失事实依据。
|
|
268
|
-
4. 只能返回 JSON object,不要 Markdown 或解释文字。
|
|
269
|
-
|
|
270
|
-
返回格式:
|
|
271
|
-
{{
|
|
272
|
-
"facts": [
|
|
273
|
-
{{"fact": "必须原样复制待判断事实", "relevance": "极高|一般|低", "reason": "一句话原因"}}
|
|
274
|
-
]
|
|
275
|
-
}}
|
|
276
|
-
|
|
277
|
-
来源文档:{block.source_doc}
|
|
278
|
-
来源章节:{block.source_section or "全文"}
|
|
279
|
-
|
|
280
|
-
当前来源原文片段:
|
|
281
|
-
{_preview(block.content)[:4000] or "无"}
|
|
282
|
-
|
|
283
|
-
辅助上下文:
|
|
284
|
-
{_preview(block.context)[:2000] or "无"}
|
|
285
|
-
|
|
286
|
-
当前已生成条目:
|
|
287
|
-
{_preview(current_items)[:4000] or "无"}
|
|
288
|
-
|
|
289
|
-
待判断事实:
|
|
290
|
-
{fact_lines}
|
|
291
|
-
""".strip()
|
|
292
224
|
|
|
293
225
|
|
|
294
|
-
def _high_relevance_facts_from_analysis(parsed, missing_facts: List[str]):
|
|
295
|
-
"""从相关性判定 JSON 中提取极高相关事实。"""
|
|
296
|
-
if not isinstance(parsed, dict):
|
|
297
|
-
return None
|
|
298
|
-
raw_facts = parsed.get("facts")
|
|
299
|
-
if raw_facts is None and isinstance(parsed.get("results"), list):
|
|
300
|
-
raw_facts = parsed.get("results")
|
|
301
|
-
if raw_facts is None and isinstance(parsed.get("items"), list):
|
|
302
|
-
raw_facts = parsed.get("items")
|
|
303
|
-
if not isinstance(raw_facts, list):
|
|
304
|
-
return None
|
|
305
|
-
|
|
306
|
-
missing_by_norm = {_coverage_text(fact): fact for fact in missing_facts}
|
|
307
|
-
selected: List[str] = []
|
|
308
|
-
for raw in raw_facts:
|
|
309
|
-
if not isinstance(raw, dict):
|
|
310
|
-
continue
|
|
311
|
-
relevance = str(raw.get("relevance") or raw.get("关联度") or "").strip().lower()
|
|
312
|
-
if not ("极高" in relevance or "high" in relevance):
|
|
313
|
-
continue
|
|
314
|
-
fact = str(raw.get("fact") or raw.get("事实") or raw.get("text") or "").strip()
|
|
315
|
-
matched = _match_missing_fact(fact, missing_by_norm)
|
|
316
|
-
if matched and matched not in selected:
|
|
317
|
-
selected.append(matched)
|
|
318
|
-
return selected
|
|
319
226
|
|
|
320
227
|
|
|
321
|
-
def _match_missing_fact(fact: str, missing_by_norm: Dict[str, str]) -> str:
|
|
322
|
-
"""把模型返回事实匹配回原始缺失事实。"""
|
|
323
|
-
fact_norm = _coverage_text(fact)
|
|
324
|
-
if not fact_norm:
|
|
325
|
-
return ""
|
|
326
|
-
if fact_norm in missing_by_norm:
|
|
327
|
-
return missing_by_norm[fact_norm]
|
|
328
|
-
for missing_norm, missing in missing_by_norm.items():
|
|
329
|
-
if fact_norm in missing_norm or missing_norm in fact_norm:
|
|
330
|
-
return missing
|
|
331
|
-
return ""
|
|
332
228
|
|
|
333
229
|
|
|
334
|
-
def _fallback_high_relevance_coverage_issues(
|
|
335
|
-
block: ParsedBlock,
|
|
336
|
-
items: List[KnowledgeItem],
|
|
337
|
-
missing_facts: List[str],
|
|
338
|
-
) -> List[str]:
|
|
339
|
-
"""相关性判定失败时的保守兜底,过滤明显结构性标题。"""
|
|
340
|
-
return [
|
|
341
|
-
fact for fact in missing_facts
|
|
342
|
-
if not _looks_like_structural_missing_fact(block, items, fact)
|
|
343
|
-
]
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
def _looks_like_structural_missing_fact(
|
|
347
|
-
block: ParsedBlock,
|
|
348
|
-
items: List[KnowledgeItem],
|
|
349
|
-
fact: str,
|
|
350
|
-
) -> bool:
|
|
351
|
-
"""判断缺失事实是否只是标题、章节或定位信息。"""
|
|
352
|
-
fact_norm = _coverage_text(fact)
|
|
353
|
-
if not fact_norm:
|
|
354
|
-
return True
|
|
355
|
-
candidates = [
|
|
356
|
-
block.source_doc,
|
|
357
|
-
Path(block.source_doc).stem,
|
|
358
|
-
block.source_section,
|
|
359
|
-
block.category,
|
|
360
|
-
block.subcategory,
|
|
361
|
-
block.source_doc_description,
|
|
362
|
-
block.subcategory_description,
|
|
363
|
-
*block.category_path,
|
|
364
|
-
*block.related_categories,
|
|
365
|
-
*(item.title for item in items),
|
|
366
|
-
]
|
|
367
|
-
candidate_norms = {_coverage_text(value) for value in candidates if value}
|
|
368
|
-
if fact_norm in candidate_norms:
|
|
369
|
-
return True
|
|
370
|
-
if len(fact_norm) <= 30 and not re.search(
|
|
371
|
-
r"是|为|指|称|简称|英文|应|需|必须|不得|禁止|超过|低于|大于|小于|不少于|不超过|\d",
|
|
372
|
-
fact,
|
|
373
|
-
):
|
|
374
|
-
return True
|
|
375
|
-
return False
|
|
376
|
-
|
|
377
230
|
|
|
378
231
|
def _coverage_retry_prompt(
|
|
379
232
|
block: ParsedBlock,
|
|
@@ -427,17 +280,23 @@ def _json_shape_retry_prompt(parsed) -> str:
|
|
|
427
280
|
)
|
|
428
281
|
|
|
429
282
|
|
|
430
|
-
def
|
|
283
|
+
def _append_retry_messages(messages: List[Dict[str, str]], previous_content: str, feedback: str) -> None:
|
|
284
|
+
"""把坏返回和纠偏要求放进下一轮会话。"""
|
|
285
|
+
messages.append({"role": "assistant", "content": previous_content or ""})
|
|
286
|
+
messages.append({"role": "user", "content": feedback})
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def _create_zhipu_completion(client, config, messages: List[Dict[str, str]]):
|
|
431
290
|
"""发起一次非流式模型补全请求。"""
|
|
432
291
|
return client.chat.completions.create(
|
|
433
292
|
model=config.model,
|
|
434
|
-
messages=
|
|
293
|
+
messages=messages,
|
|
435
294
|
stream=False,
|
|
436
295
|
max_tokens=config.max_tokens,
|
|
437
296
|
temperature=config.temperature,
|
|
438
297
|
do_sample=False,
|
|
439
298
|
response_format={"type": "json_object"},
|
|
440
|
-
thinking={"type": "disabled", "clear_thinking":
|
|
299
|
+
thinking={"type": "disabled", "clear_thinking": False},
|
|
441
300
|
)
|
|
442
301
|
|
|
443
302
|
|
|
@@ -574,69 +433,35 @@ def _json_shape(value) -> str:
|
|
|
574
433
|
return type(value).__name__
|
|
575
434
|
|
|
576
435
|
|
|
577
|
-
def
|
|
578
|
-
"""
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
436
|
+
def _mark_llm_failed_item(item: KnowledgeItem, block: ParsedBlock) -> KnowledgeItem:
|
|
437
|
+
"""标记 LLM 失败后的离线兜底条目。"""
|
|
438
|
+
item.review_status = "failed"
|
|
439
|
+
item.body = _append_failed_chunk_source(item.body, block.content)
|
|
440
|
+
return item
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
def _append_failed_chunk_source(body: str, source: str) -> str:
|
|
444
|
+
"""在 failed 文件末尾保存原始 chunk。"""
|
|
445
|
+
warning = (
|
|
446
|
+
"## LLM 生成失败警告\n\n"
|
|
447
|
+
"WARNING: LLM 多次重试后仍未返回合格 JSON,本文件由离线规则兜底生成,需人工核对。\n\n"
|
|
448
|
+
"## failed_chunk_source\n\n"
|
|
449
|
+
"```text\n"
|
|
450
|
+
f"{source.strip()}\n"
|
|
451
|
+
"```"
|
|
452
|
+
)
|
|
453
|
+
return f"{body.strip()}\n\n{warning}".strip()
|
|
583
454
|
|
|
584
|
-
for key in (
|
|
585
|
-
"knowledge_items",
|
|
586
|
-
"records",
|
|
587
|
-
"data",
|
|
588
|
-
"payload",
|
|
589
|
-
"output",
|
|
590
|
-
"response",
|
|
591
|
-
"answer",
|
|
592
|
-
"content",
|
|
593
|
-
"message",
|
|
594
|
-
"result",
|
|
595
|
-
"results",
|
|
596
|
-
):
|
|
597
|
-
value = parsed.get(key)
|
|
598
|
-
if isinstance(value, list):
|
|
599
|
-
print(f"llm parse notice: using non-standard list field '{key}' as items")
|
|
600
|
-
return value
|
|
601
|
-
if isinstance(value, dict):
|
|
602
|
-
nested = _coerce_raw_items(value)
|
|
603
|
-
if isinstance(nested, list):
|
|
604
|
-
print(f"llm parse notice: using nested field '{key}' as items")
|
|
605
|
-
return nested
|
|
606
|
-
if isinstance(value, str) and value.strip():
|
|
607
|
-
nested = _extract_json_with_diagnostics(value)
|
|
608
|
-
if nested.value is not None:
|
|
609
|
-
nested_items = _coerce_raw_items(nested.value)
|
|
610
|
-
if isinstance(nested_items, list):
|
|
611
|
-
print(f"llm parse notice: parsed JSON string field '{key}' as items")
|
|
612
|
-
return nested_items
|
|
613
|
-
|
|
614
|
-
if _looks_like_single_item(parsed):
|
|
615
|
-
print("llm parse notice: wrapping single item object as items[0]")
|
|
616
|
-
return [parsed]
|
|
617
|
-
|
|
618
|
-
if isinstance(parsed, list):
|
|
619
|
-
print("llm parse notice: wrapping root array as items")
|
|
620
|
-
return parsed
|
|
621
455
|
|
|
456
|
+
def _coerce_raw_items(parsed):
|
|
457
|
+
"""只接受标准 {"items": [...]} 输出。"""
|
|
458
|
+
if isinstance(parsed, dict) and isinstance(parsed.get("items"), list):
|
|
459
|
+
return parsed["items"]
|
|
622
460
|
return None
|
|
623
461
|
|
|
624
462
|
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
required_signal = {"title", "body"}
|
|
628
|
-
item_fields = {
|
|
629
|
-
"title",
|
|
630
|
-
"doc_type",
|
|
631
|
-
"business_modules",
|
|
632
|
-
"source_version",
|
|
633
|
-
"risk_level",
|
|
634
|
-
"applicable_roles",
|
|
635
|
-
"tags",
|
|
636
|
-
"body",
|
|
637
|
-
"split_reason",
|
|
638
|
-
}
|
|
639
|
-
return required_signal.issubset(value.keys()) and len(item_fields.intersection(value.keys())) >= 4
|
|
463
|
+
|
|
464
|
+
|
|
640
465
|
|
|
641
466
|
|
|
642
467
|
def _looks_truncated(content: str, finish_reason: str) -> bool:
|
|
@@ -649,15 +474,6 @@ def _looks_truncated(content: str, finish_reason: str) -> bool:
|
|
|
649
474
|
return stripped.count("{") > stripped.count("}") or stripped.count("[") > stripped.count("]")
|
|
650
475
|
|
|
651
476
|
|
|
652
|
-
def _abort_llm(message: str, block: ParsedBlock) -> None:
|
|
653
|
-
"""输出失败告警并终止 LLM 生成。"""
|
|
654
|
-
print(
|
|
655
|
-
"ALERT: llm draft failed; aborting. "
|
|
656
|
-
f"reason={message} doc={block.source_doc} section={block.source_section}"
|
|
657
|
-
)
|
|
658
|
-
raise SystemExit(1)
|
|
659
|
-
|
|
660
|
-
|
|
661
477
|
def _build_prompt(block: ParsedBlock, status: str) -> str:
|
|
662
478
|
"""构造片段生成知识条目的完整提示。"""
|
|
663
479
|
spec = _read_kb_spec()
|
|
@@ -757,38 +573,23 @@ def _read_tools() -> List[Dict[str, Any]]:
|
|
|
757
573
|
|
|
758
574
|
def _extract_json_with_diagnostics(text: str) -> JsonParseResult:
|
|
759
575
|
"""提取 JSON 并保留失败诊断。"""
|
|
760
|
-
text = text.strip()
|
|
761
|
-
text = _strip_code_fence(text)
|
|
576
|
+
text = _strip_code_fence(text.strip())
|
|
762
577
|
errors: List[str] = []
|
|
763
578
|
try:
|
|
764
579
|
return JsonParseResult(value=json.loads(text))
|
|
765
580
|
except json.JSONDecodeError as exc:
|
|
766
581
|
errors.append(_json_error_message(exc, text))
|
|
767
582
|
|
|
768
|
-
repaired = _repair_json_text(text)
|
|
769
|
-
if repaired != text:
|
|
770
|
-
try:
|
|
771
|
-
print("llm parse notice: repaired unescaped quotes in JSON string fields")
|
|
772
|
-
return JsonParseResult(value=json.loads(repaired))
|
|
773
|
-
except json.JSONDecodeError as exc:
|
|
774
|
-
errors.append(_json_error_message(exc, repaired))
|
|
775
|
-
|
|
776
583
|
for candidate in _json_candidates(text):
|
|
777
584
|
try:
|
|
778
585
|
return JsonParseResult(value=json.loads(candidate))
|
|
779
586
|
except json.JSONDecodeError as exc:
|
|
780
587
|
errors.append(_json_error_message(exc, candidate))
|
|
781
|
-
repaired = _repair_json_text(candidate)
|
|
782
|
-
if repaired != candidate:
|
|
783
|
-
try:
|
|
784
|
-
print("llm parse notice: repaired unescaped quotes in JSON candidate")
|
|
785
|
-
return JsonParseResult(value=json.loads(repaired))
|
|
786
|
-
except json.JSONDecodeError as repair_exc:
|
|
787
|
-
errors.append(_json_error_message(repair_exc, repaired))
|
|
788
|
-
continue
|
|
789
588
|
return JsonParseResult(error=errors[-1] if errors else "no JSON object or array found")
|
|
790
589
|
|
|
791
590
|
|
|
591
|
+
|
|
592
|
+
|
|
792
593
|
def _json_error_message(exc: json.JSONDecodeError, text: str) -> str:
|
|
793
594
|
"""格式化 JSON 解析错误位置。"""
|
|
794
595
|
line = text.splitlines()[exc.lineno - 1] if 0 < exc.lineno <= len(text.splitlines()) else ""
|
|
@@ -823,130 +624,18 @@ def _json_candidates(text: str) -> List[str]:
|
|
|
823
624
|
return candidates
|
|
824
625
|
|
|
825
626
|
|
|
826
|
-
def _repair_json_text(text: str) -> str:
|
|
827
|
-
"""修复常见的 JSON 字符串引号问题。"""
|
|
828
|
-
for field in (
|
|
829
|
-
"title",
|
|
830
|
-
"category",
|
|
831
|
-
"category_description",
|
|
832
|
-
"source_version",
|
|
833
|
-
"risk_level",
|
|
834
|
-
"split_reason",
|
|
835
|
-
):
|
|
836
|
-
text = _repair_unescaped_quotes_in_json_field(text, field)
|
|
837
|
-
return text
|
|
838
627
|
|
|
839
628
|
|
|
840
|
-
def _repair_unescaped_quotes_in_json_field(text: str, field: str) -> str:
|
|
841
|
-
"""修复指定 JSON 字段中的未转义引号。"""
|
|
842
|
-
pattern = re.compile(rf'("{re.escape(field)}"\s*:\s*)"')
|
|
843
|
-
output = []
|
|
844
|
-
cursor = 0
|
|
845
|
-
while True:
|
|
846
|
-
match = pattern.search(text, cursor)
|
|
847
|
-
if not match:
|
|
848
|
-
output.append(text[cursor:])
|
|
849
|
-
break
|
|
850
629
|
|
|
851
|
-
value_start = match.end()
|
|
852
|
-
closing = _find_json_field_string_end(text, value_start)
|
|
853
|
-
if closing == -1:
|
|
854
|
-
output.append(text[cursor:])
|
|
855
|
-
break
|
|
856
630
|
|
|
857
|
-
raw_value = text[value_start:closing]
|
|
858
|
-
repaired_value = _escape_unescaped_quotes(raw_value)
|
|
859
|
-
output.append(text[cursor:value_start])
|
|
860
|
-
output.append(repaired_value)
|
|
861
|
-
output.append('"')
|
|
862
|
-
cursor = closing + 1
|
|
863
|
-
|
|
864
|
-
return "".join(output)
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
def _find_json_field_string_end(text: str, start: int) -> int:
|
|
868
|
-
"""定位 JSON 字符串字段的结束引号。"""
|
|
869
|
-
idx = start
|
|
870
|
-
escaped = False
|
|
871
|
-
while idx < len(text):
|
|
872
|
-
char = text[idx]
|
|
873
|
-
if escaped:
|
|
874
|
-
escaped = False
|
|
875
|
-
elif char == "\\":
|
|
876
|
-
escaped = True
|
|
877
|
-
elif char == '"' and _looks_like_json_value_end(text, idx):
|
|
878
|
-
return idx
|
|
879
|
-
idx += 1
|
|
880
|
-
return -1
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
def _looks_like_json_value_end(text: str, quote_idx: int) -> bool:
|
|
884
|
-
"""判断引号是否像 JSON 值结尾。"""
|
|
885
|
-
next_idx = _next_nonspace(text, quote_idx + 1)
|
|
886
|
-
if next_idx >= len(text):
|
|
887
|
-
return True
|
|
888
|
-
if text[next_idx] in "}]":
|
|
889
|
-
return True
|
|
890
|
-
if text[next_idx] != ",":
|
|
891
|
-
return False
|
|
892
631
|
|
|
893
|
-
following = _next_nonspace(text, next_idx + 1)
|
|
894
|
-
if following >= len(text):
|
|
895
|
-
return True
|
|
896
|
-
if text[following] in "}]":
|
|
897
|
-
return True
|
|
898
|
-
if text[following] != '"':
|
|
899
|
-
return False
|
|
900
632
|
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
def _next_nonspace(text: str, start: int) -> int:
|
|
909
|
-
"""查找下一个非空白字符位置。"""
|
|
910
|
-
idx = start
|
|
911
|
-
while idx < len(text) and text[idx].isspace():
|
|
912
|
-
idx += 1
|
|
913
|
-
return idx
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
def _find_plain_json_string_end(text: str, start: int) -> int:
|
|
917
|
-
"""定位普通 JSON 字符串结束位置。"""
|
|
918
|
-
idx = start
|
|
919
|
-
escaped = False
|
|
920
|
-
while idx < len(text):
|
|
921
|
-
char = text[idx]
|
|
922
|
-
if escaped:
|
|
923
|
-
escaped = False
|
|
924
|
-
elif char == "\\":
|
|
925
|
-
escaped = True
|
|
926
|
-
elif char == '"':
|
|
927
|
-
return idx
|
|
928
|
-
idx += 1
|
|
929
|
-
return -1
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
def _escape_unescaped_quotes(value: str) -> str:
|
|
933
|
-
"""转义字符串中的裸双引号。"""
|
|
934
|
-
output = []
|
|
935
|
-
escaped = False
|
|
936
|
-
for char in value:
|
|
937
|
-
if escaped:
|
|
938
|
-
output.append(char)
|
|
939
|
-
escaped = False
|
|
940
|
-
continue
|
|
941
|
-
if char == "\\":
|
|
942
|
-
output.append(char)
|
|
943
|
-
escaped = True
|
|
944
|
-
continue
|
|
945
|
-
if char == '"':
|
|
946
|
-
output.append('\\"')
|
|
947
|
-
continue
|
|
948
|
-
output.append(char)
|
|
949
|
-
return "".join(output)
|
|
633
|
+
|
|
634
|
+
|
|
635
|
+
|
|
636
|
+
|
|
637
|
+
|
|
638
|
+
|
|
950
639
|
|
|
951
640
|
|
|
952
641
|
def _preview(text: str) -> str:
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "union_kb_ingest",
|
|
3
|
-
"version": "1.0.10",
|
|
3
|
+
"version": "1.0.11",
|
|
4
4
|
"description": "Offline knowledge-base ingest helper for PDF, Word, Markdown and TXT documents.",
|
|
5
5
|
"bin": {
|
|
6
6
|
"union_kb_ingest": "bin/union_kb_ingest"
|
|
@@ -9,12 +9,13 @@
|
|
|
9
9
|
"README.md",
|
|
10
10
|
"requirements.txt",
|
|
11
11
|
"bin/union_kb_ingest",
|
|
12
|
-
"ArkKickidcService.java",
|
|
13
12
|
"*.py",
|
|
14
13
|
"config/config.yaml",
|
|
15
14
|
"prompts/",
|
|
16
15
|
"input/.gitkeep",
|
|
17
16
|
"input/pdf/.gitkeep",
|
|
17
|
+
"input/function/.gitkeep",
|
|
18
|
+
"input/function/tools.yaml",
|
|
18
19
|
"input/word/.gitkeep",
|
|
19
20
|
"parsed/.gitkeep",
|
|
20
21
|
"result/.gitkeep"
|
package/requirements.txt
CHANGED
|
@@ -1,9 +1,6 @@
|
|
|
1
|
-
# Offline-only optional dependencies. Do not add these to the app runtime unless needed.
|
|
2
1
|
pyyaml>=6.0.1
|
|
3
2
|
zai-sdk==0.2.2
|
|
4
|
-
sniffio>=1.3.0
|
|
5
3
|
|
|
6
|
-
# Docling
|
|
7
|
-
#
|
|
8
|
-
# layout/table ML models, torch/onnxruntime, and may try to download artifacts.
|
|
4
|
+
# Docling file-format backends only. Avoid docling-slim[standard]: it pulls OCR,
|
|
5
|
+
# layout/table ML models, torch/onnxruntime, and downloaded artifacts.
|
|
9
6
|
docling-slim[format-pdf-docling,format-docx,format-markdown]>=2.70.0; python_version >= "3.10"
|