union_kb_ingest 1.0.9 → 1.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +6 -31
- package/app_config.py +21 -33
- package/ingest.py +289 -25
- package/input/function/.gitkeep +1 -0
- package/input/function/tools.yaml +27 -0
- package/normalizer.py +99 -406
- package/package.json +3 -2
- package/parser.py +1 -1
- package/requirements.txt +2 -5
- package/writer.py +19 -3
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Simon
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
CHANGED
|
@@ -12,23 +12,11 @@
|
|
|
12
12
|
|
|
13
13
|
启用大模型时,工具只会读取 `prompts/知识库建立规范.md` 作为格式和质量约束,并由代码按当前片段、辅助上下文和输出 JSON 结构组装生成提示词。模型需依据原文语义判断业务场景、模块、角色、标签和风险等级;代码中的启发式生成只作为未启用大模型时的兜底,不使用预设业务关键词去指导大模型输出。
|
|
14
14
|
|
|
15
|
-
##
|
|
16
|
-
|
|
17
|
-
不建议把这些依赖加入项目根 `requirements.txt`。离线机器单独安装即可:
|
|
15
|
+
## 安装依赖
|
|
18
16
|
|
|
19
17
|
```bash
|
|
20
18
|
python -m pip install -r requirements.txt
|
|
21
19
|
```
|
|
22
|
-
|
|
23
|
-
解析层不使用 OCR,不加载本地视觉/版面模型,也不访问远程模型服务:
|
|
24
|
-
|
|
25
|
-
- PDF:使用 `docling-parse` 抽取 PDF 内嵌文本和文本行顺序;扫描件或图片型 PDF 不会识别。
|
|
26
|
-
- DOCX:使用 Docling 的 Word 后端转为 Markdown。
|
|
27
|
-
- 旧版 `.doc`:通过 LibreOffice `soffice` 转为 `.docx` 后再解析;不使用 OCR。
|
|
28
|
-
- Markdown / TXT:作为已文本化材料直接读取。
|
|
29
|
-
|
|
30
|
-
不要安装 `docling` 或 `docling-slim[standard]`,它们会引入 OCR、版面/表格模型、Torch/ONNXRuntime 等重依赖,并可能在运行时下载模型。内网机器建议为离线工具单独准备 Python 3.10+ 环境。
|
|
31
|
-
|
|
32
20
|
## 基本用法
|
|
33
21
|
|
|
34
22
|
把文件放入:
|
|
@@ -37,13 +25,15 @@ python -m pip install -r requirements.txt
|
|
|
37
25
|
input/
|
|
38
26
|
```
|
|
39
27
|
|
|
28
|
+
工具调用示例放在 `input/function/tools.yaml` 中,可按业务需要替换。
|
|
29
|
+
|
|
40
30
|
生成知识库文件:
|
|
41
31
|
|
|
42
32
|
```bash
|
|
43
33
|
python ingest.py draft
|
|
44
34
|
```
|
|
45
35
|
|
|
46
|
-
如果 `result/`
|
|
36
|
+
如果 `result/` 中已有生成文件,命令会提示选择删除重建、从断点继续、重试失败文件或退出。断点状态保存在 `result/.draft_progress.json`,大模型多次重试失败退出时会记录当前文件和片段位置,下次可选择从断点继续。
|
|
47
37
|
|
|
48
38
|
只解析为中间 Markdown:
|
|
49
39
|
|
|
@@ -67,7 +57,7 @@ python ingest.py validate
|
|
|
67
57
|
|
|
68
58
|
## 大模型配置
|
|
69
59
|
|
|
70
|
-
|
|
60
|
+
默认不强制调用大模型;未启用大模型时,会使用启发式模板生成知识库文件。强烈建议启用大模型分析。
|
|
71
61
|
|
|
72
62
|
如果要启用大模型整理,修改 `config/config.yaml`:
|
|
73
63
|
|
|
@@ -75,7 +65,7 @@ python ingest.py validate
|
|
|
75
65
|
llm:
|
|
76
66
|
enabled: true
|
|
77
67
|
base_url: "https://open.bigmodel.cn/api/paas/v4/"
|
|
78
|
-
api_key: "
|
|
68
|
+
api_key: ""
|
|
79
69
|
model: "glm-4.7"
|
|
80
70
|
timeout_seconds: 120
|
|
81
71
|
max_tokens: 8192
|
|
@@ -87,19 +77,4 @@ draft:
|
|
|
87
77
|
outline_max_sections: 40
|
|
88
78
|
```
|
|
89
79
|
|
|
90
|
-
也可以继续使用环境变量覆盖配置文件:
|
|
91
|
-
|
|
92
|
-
```bash
|
|
93
|
-
export KB_LLM_ENABLED=true
|
|
94
|
-
export KB_LLM_BASE_URL="https://open.bigmodel.cn/api/paas/v4/"
|
|
95
|
-
export KB_LLM_API_KEY="your-zhipu-api-key"
|
|
96
|
-
export KB_LLM_MODEL="glm-4.7"
|
|
97
|
-
```
|
|
98
|
-
|
|
99
80
|
工具通过 Z.AI 新版 Python SDK 调用中文智谱开放平台 GLM,依赖固定为 `zai-sdk==0.2.2`,客户端固定使用官方中文写法 `from zai import ZhipuAiClient`,`base_url` 使用 `https://open.bigmodel.cn/api/paas/v4/`。工具不再包含旧 `zhipuai` SDK、国际版 `ZaiClient` 或 OpenAI 调用路径,也不 import 项目 `src` 代码。
|
|
100
|
-
|
|
101
|
-
## 与线上项目的关系
|
|
102
|
-
|
|
103
|
-
这个工具只产出符合规范的 `*.md` 文件到 `result/`,后续由线上知识库加载流程处理。
|
|
104
|
-
|
|
105
|
-
建议线上打包时排除整个 `tools/kb_ingest` 目录。
|
package/app_config.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
import os
|
|
4
3
|
from dataclasses import dataclass
|
|
5
4
|
from functools import lru_cache
|
|
6
5
|
from pathlib import Path
|
|
@@ -15,7 +14,7 @@ DEFAULT_CONFIG_PATH = CURRENT_DIR / "config" / "config.yaml"
|
|
|
15
14
|
|
|
16
15
|
@dataclass(frozen=True)
|
|
17
16
|
class LlmConfig:
|
|
18
|
-
"""LLM
|
|
17
|
+
"""LLM 调用配置。"""
|
|
19
18
|
enabled: bool = False
|
|
20
19
|
base_url: str = ""
|
|
21
20
|
api_key: str = ""
|
|
@@ -35,67 +34,56 @@ class DraftConfig:
|
|
|
35
34
|
|
|
36
35
|
@lru_cache(maxsize=1)
|
|
37
36
|
def get_llm_config() -> LlmConfig:
|
|
38
|
-
"""
|
|
37
|
+
"""读取 LLM 配置。"""
|
|
39
38
|
raw = _read_config().get("llm", {})
|
|
40
39
|
if not isinstance(raw, dict):
|
|
41
40
|
raw = {}
|
|
42
41
|
|
|
43
42
|
return LlmConfig(
|
|
44
|
-
enabled=
|
|
45
|
-
base_url=
|
|
46
|
-
api_key=
|
|
47
|
-
model=
|
|
48
|
-
timeout_seconds=
|
|
49
|
-
max_tokens=
|
|
50
|
-
temperature=
|
|
43
|
+
enabled=_as_bool(raw.get("enabled"), False),
|
|
44
|
+
base_url=str(raw.get("base_url") or ""),
|
|
45
|
+
api_key=str(raw.get("api_key") or ""),
|
|
46
|
+
model=str(raw.get("model") or ""),
|
|
47
|
+
timeout_seconds=_as_int(raw.get("timeout_seconds"), 120),
|
|
48
|
+
max_tokens=_as_int(raw.get("max_tokens"), 4096),
|
|
49
|
+
temperature=_as_float(raw.get("temperature"), 0.1),
|
|
51
50
|
)
|
|
52
51
|
|
|
53
52
|
|
|
54
53
|
@lru_cache(maxsize=1)
|
|
55
54
|
def get_draft_config() -> DraftConfig:
|
|
56
|
-
"""
|
|
55
|
+
"""读取草稿生成配置。"""
|
|
57
56
|
raw = _read_config().get("draft", {})
|
|
58
57
|
if not isinstance(raw, dict):
|
|
59
58
|
raw = {}
|
|
60
59
|
|
|
61
60
|
return DraftConfig(
|
|
62
|
-
max_chars=
|
|
63
|
-
context_chars=
|
|
64
|
-
outline_max_sections=
|
|
61
|
+
max_chars=_as_int(raw.get("max_chars"), 3600),
|
|
62
|
+
context_chars=_as_int(raw.get("context_chars"), 800),
|
|
63
|
+
outline_max_sections=_as_int(raw.get("outline_max_sections"), 40),
|
|
65
64
|
)
|
|
66
65
|
|
|
67
66
|
|
|
68
67
|
def _read_config() -> Dict[str, Any]:
|
|
69
68
|
"""读取 YAML 配置文件并返回字典。"""
|
|
70
|
-
|
|
71
|
-
if not path.exists():
|
|
69
|
+
if not DEFAULT_CONFIG_PATH.exists():
|
|
72
70
|
return {}
|
|
73
|
-
data = yaml.safe_load(
|
|
71
|
+
data = yaml.safe_load(DEFAULT_CONFIG_PATH.read_text(encoding="utf-8")) or {}
|
|
74
72
|
return data if isinstance(data, dict) else {}
|
|
75
73
|
|
|
76
74
|
|
|
77
|
-
def
|
|
78
|
-
"""
|
|
79
|
-
value = os.environ.get(name)
|
|
80
|
-
if value is None:
|
|
81
|
-
return default
|
|
82
|
-
return _as_bool(value, default)
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
def _env_int(name: str, value: Any, default: int) -> int:
|
|
86
|
-
"""读取整数环境变量并回退到默认值。"""
|
|
87
|
-
raw = os.environ.get(name, value)
|
|
75
|
+
def _as_int(value: Any, default: int) -> int:
|
|
76
|
+
"""把配置值转换为整数。"""
|
|
88
77
|
try:
|
|
89
|
-
return int(
|
|
78
|
+
return int(value)
|
|
90
79
|
except (TypeError, ValueError):
|
|
91
80
|
return default
|
|
92
81
|
|
|
93
82
|
|
|
94
|
-
def
|
|
95
|
-
"""
|
|
96
|
-
raw = os.environ.get(name, value)
|
|
83
|
+
def _as_float(value: Any, default: float) -> float:
|
|
84
|
+
"""把配置值转换为浮点数。"""
|
|
97
85
|
try:
|
|
98
|
-
return float(
|
|
86
|
+
return float(value)
|
|
99
87
|
except (TypeError, ValueError):
|
|
100
88
|
return default
|
|
101
89
|
|
package/ingest.py
CHANGED
|
@@ -2,17 +2,22 @@
|
|
|
2
2
|
from __future__ import annotations
|
|
3
3
|
|
|
4
4
|
import argparse
|
|
5
|
+
import json
|
|
5
6
|
import sys
|
|
6
7
|
from dataclasses import replace
|
|
8
|
+
from datetime import datetime
|
|
7
9
|
from pathlib import Path
|
|
8
10
|
from typing import List
|
|
11
|
+
from uuid import uuid4
|
|
12
|
+
|
|
13
|
+
import yaml
|
|
9
14
|
|
|
10
15
|
CURRENT_DIR = Path(__file__).resolve().parent
|
|
11
16
|
if str(CURRENT_DIR) not in sys.path:
|
|
12
17
|
sys.path.insert(0, str(CURRENT_DIR))
|
|
13
18
|
|
|
14
19
|
from app_config import get_draft_config
|
|
15
|
-
from normalizer import normalize_block
|
|
20
|
+
from normalizer import fallback_failed_block, normalize_block
|
|
16
21
|
from parser import iter_input_files, parse_document
|
|
17
22
|
from schemas import ParsedBlock
|
|
18
23
|
from splitter import split_blocks
|
|
@@ -21,6 +26,7 @@ from writer import write_item
|
|
|
21
26
|
|
|
22
27
|
|
|
23
28
|
IGNORED_EXISTING_FILES = {".gitkeep", ".DS_Store"}
|
|
29
|
+
PROGRESS_FILENAME = ".draft_progress.json"
|
|
24
30
|
|
|
25
31
|
|
|
26
32
|
def cmd_parse(args) -> int:
|
|
@@ -45,21 +51,46 @@ def cmd_draft(args) -> int:
|
|
|
45
51
|
output_dir = Path(args.output)
|
|
46
52
|
|
|
47
53
|
existing = _list_effective_files(output_dir)
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
return 0
|
|
51
|
-
|
|
54
|
+
progress_path = output_dir / PROGRESS_FILENAME
|
|
55
|
+
resume_state = None
|
|
52
56
|
if existing:
|
|
53
|
-
|
|
57
|
+
action = _choose_existing_result_action(output_dir, existing)
|
|
58
|
+
if action == "exit":
|
|
59
|
+
print("aborted. existing files were kept.")
|
|
60
|
+
return 0
|
|
61
|
+
if action == "rebuild":
|
|
62
|
+
_clear_generated_files(output_dir)
|
|
63
|
+
elif action == "retry":
|
|
64
|
+
return _retry_failed_files(output_dir, status=args.status)
|
|
65
|
+
elif action == "resume":
|
|
66
|
+
resume_state = _load_progress(progress_path)
|
|
67
|
+
if not resume_state:
|
|
68
|
+
print(f"aborted. no usable checkpoint found at {progress_path}.")
|
|
69
|
+
return 1
|
|
54
70
|
|
|
55
71
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
56
72
|
|
|
57
|
-
|
|
58
|
-
|
|
73
|
+
run_timestamp = (
|
|
74
|
+
str(resume_state.get("run_timestamp"))
|
|
75
|
+
if resume_state
|
|
76
|
+
else _make_timestamp()
|
|
77
|
+
)
|
|
78
|
+
run_trace_id = (
|
|
79
|
+
str(resume_state.get("run_trace_id"))
|
|
80
|
+
if resume_state
|
|
81
|
+
else uuid4().hex[:8]
|
|
82
|
+
)
|
|
83
|
+
total_items = int(resume_state.get("total_items", 0)) if resume_state else 0
|
|
84
|
+
source_order = int(resume_state.get("source_order", 0)) if resume_state else 0
|
|
59
85
|
draft_config = get_draft_config()
|
|
60
86
|
max_chars = args.max_chars or draft_config.max_chars
|
|
61
87
|
files = iter_input_files(input_path)
|
|
62
|
-
|
|
88
|
+
start_file_index = int(resume_state.get("file_index", 0)) if resume_state else 0
|
|
89
|
+
start_block_index = int(resume_state.get("block_index", 0)) if resume_state else 0
|
|
90
|
+
|
|
91
|
+
for file_index, path in enumerate(files):
|
|
92
|
+
if file_index < start_file_index:
|
|
93
|
+
continue
|
|
63
94
|
parsed = parse_document(path)
|
|
64
95
|
blocks = split_blocks(parsed.blocks, max_chars=max_chars)
|
|
65
96
|
blocks = _attach_block_context(
|
|
@@ -67,15 +98,59 @@ def cmd_draft(args) -> int:
|
|
|
67
98
|
context_chars=draft_config.context_chars,
|
|
68
99
|
outline_max_sections=draft_config.outline_max_sections,
|
|
69
100
|
)
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
101
|
+
block_start = start_block_index if file_index == start_file_index else 0
|
|
102
|
+
for block_index, block in enumerate(blocks):
|
|
103
|
+
if block_index < block_start:
|
|
104
|
+
continue
|
|
105
|
+
_save_progress(
|
|
106
|
+
progress_path,
|
|
107
|
+
input_path=input_path,
|
|
108
|
+
output_dir=output_dir,
|
|
109
|
+
files=files,
|
|
110
|
+
run_timestamp=run_timestamp,
|
|
111
|
+
run_trace_id=run_trace_id,
|
|
112
|
+
source_order=source_order,
|
|
113
|
+
total_items=total_items,
|
|
114
|
+
file_index=file_index,
|
|
115
|
+
block_index=block_index,
|
|
116
|
+
status="running",
|
|
117
|
+
)
|
|
118
|
+
try:
|
|
119
|
+
items = normalize_block(block, status=args.status)
|
|
120
|
+
except SystemExit as exc:
|
|
121
|
+
print(f"WARNING: block failed with SystemExit({exc.code}); writing failed fallback")
|
|
122
|
+
items = fallback_failed_block(block, status=args.status)
|
|
123
|
+
except Exception as exc:
|
|
124
|
+
print(f"WARNING: block failed with {type(exc).__name__}: {exc}; writing failed fallback")
|
|
125
|
+
items = fallback_failed_block(block, status=args.status)
|
|
126
|
+
if not items:
|
|
127
|
+
items = fallback_failed_block(block, status=args.status)
|
|
128
|
+
source_order, written = _write_items(
|
|
129
|
+
items,
|
|
130
|
+
output_dir,
|
|
131
|
+
block,
|
|
132
|
+
source_order=source_order,
|
|
133
|
+
run_timestamp=run_timestamp,
|
|
134
|
+
run_trace_id=run_trace_id,
|
|
135
|
+
)
|
|
136
|
+
total_items += written
|
|
137
|
+
_save_progress(
|
|
138
|
+
progress_path,
|
|
139
|
+
input_path=input_path,
|
|
140
|
+
output_dir=output_dir,
|
|
141
|
+
files=files,
|
|
142
|
+
run_timestamp=run_timestamp,
|
|
143
|
+
run_trace_id=run_trace_id,
|
|
144
|
+
source_order=source_order,
|
|
145
|
+
total_items=total_items,
|
|
146
|
+
file_index=file_index,
|
|
147
|
+
block_index=block_index + 1,
|
|
148
|
+
status="running",
|
|
149
|
+
)
|
|
78
150
|
print(f"drafted: {path} blocks={len(blocks)}")
|
|
151
|
+
start_block_index = 0
|
|
152
|
+
if progress_path.exists():
|
|
153
|
+
progress_path.unlink()
|
|
79
154
|
print(f"done. files={len(files)} draft_items={total_items} output={output_dir}")
|
|
80
155
|
return 0
|
|
81
156
|
|
|
@@ -90,16 +165,205 @@ def _list_effective_files(path: Path) -> list[Path]:
|
|
|
90
165
|
)
|
|
91
166
|
|
|
92
167
|
|
|
93
|
-
def
|
|
168
|
+
def _write_items(
|
|
169
|
+
items,
|
|
94
170
|
output_dir: Path,
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
171
|
+
block: ParsedBlock,
|
|
172
|
+
*,
|
|
173
|
+
source_order: int,
|
|
174
|
+
run_timestamp: str,
|
|
175
|
+
run_trace_id: str,
|
|
176
|
+
) -> tuple[int, int]:
|
|
177
|
+
written = 0
|
|
178
|
+
for item in items:
|
|
179
|
+
source_order += 1
|
|
180
|
+
item.source_order = source_order
|
|
181
|
+
item.source_pages = sorted(set(block.pages))
|
|
182
|
+
item.source_trace = _source_trace(block)
|
|
183
|
+
write_item(
|
|
184
|
+
item,
|
|
185
|
+
output_dir,
|
|
186
|
+
source_title=Path(block.source_doc).stem,
|
|
187
|
+
timestamp=run_timestamp,
|
|
188
|
+
trace_id=run_trace_id,
|
|
189
|
+
)
|
|
190
|
+
written += 1
|
|
191
|
+
return source_order, written
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _retry_failed_files(output_dir: Path, status: str) -> int:
|
|
195
|
+
failed_files = [path for path in _list_effective_files(output_dir) if "failed" in path.stem.lower()]
|
|
196
|
+
if not failed_files:
|
|
197
|
+
print("done. failed_files=0 retried=0")
|
|
198
|
+
return 0
|
|
199
|
+
|
|
200
|
+
run_timestamp = _make_timestamp()
|
|
201
|
+
run_trace_id = uuid4().hex[:8]
|
|
202
|
+
retried = 0
|
|
203
|
+
succeeded = 0
|
|
204
|
+
still_failed = 0
|
|
205
|
+
for path in failed_files:
|
|
206
|
+
block = _block_from_failed_file(path)
|
|
207
|
+
if not block:
|
|
208
|
+
print(f"WARNING: skipped failed file without chunk source: {path}")
|
|
209
|
+
continue
|
|
210
|
+
try:
|
|
211
|
+
items = normalize_block(block, status=status)
|
|
212
|
+
except SystemExit as exc:
|
|
213
|
+
print(f"WARNING: retry failed with SystemExit({exc.code}); keeping failed fallback: {path}")
|
|
214
|
+
items = fallback_failed_block(block, status=status)
|
|
215
|
+
except Exception as exc:
|
|
216
|
+
print(f"WARNING: retry failed with {type(exc).__name__}: {exc}; keeping failed fallback: {path}")
|
|
217
|
+
items = fallback_failed_block(block, status=status)
|
|
218
|
+
if not items:
|
|
219
|
+
items = fallback_failed_block(block, status=status)
|
|
220
|
+
|
|
221
|
+
source_order = max(int(block.order or 0) - 1, 0)
|
|
222
|
+
_, written = _write_items(
|
|
223
|
+
items,
|
|
224
|
+
output_dir,
|
|
225
|
+
block,
|
|
226
|
+
source_order=source_order,
|
|
227
|
+
run_timestamp=run_timestamp,
|
|
228
|
+
run_trace_id=run_trace_id,
|
|
229
|
+
)
|
|
230
|
+
if written:
|
|
231
|
+
path.unlink()
|
|
232
|
+
retried += 1
|
|
233
|
+
if any(item.review_status == "failed" for item in items):
|
|
234
|
+
still_failed += 1
|
|
235
|
+
else:
|
|
236
|
+
succeeded += 1
|
|
237
|
+
print(f"done. failed_files={len(failed_files)} retried={retried} succeeded={succeeded} still_failed={still_failed}")
|
|
238
|
+
return 0
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def _block_from_failed_file(path: Path) -> ParsedBlock | None:
|
|
242
|
+
text = path.read_text(encoding="utf-8")
|
|
243
|
+
metadata = _front_matter(text)
|
|
244
|
+
source = _failed_chunk_source(text)
|
|
245
|
+
if not source:
|
|
246
|
+
return None
|
|
247
|
+
return ParsedBlock(
|
|
248
|
+
source_doc=str(metadata.get("source_doc") or path.stem),
|
|
249
|
+
source_section=str(metadata.get("source_section") or ""),
|
|
250
|
+
content=source,
|
|
251
|
+
pages=[int(page) for page in metadata.get("source_pages") or [] if str(page).isdigit()],
|
|
252
|
+
order=int(metadata.get("source_order") or 0),
|
|
253
|
+
category=str(metadata.get("category") or ""),
|
|
254
|
+
category_keywords=[str(item) for item in metadata.get("category_keywords") or []],
|
|
255
|
+
source_doc_description=str(metadata.get("source_doc_description") or ""),
|
|
256
|
+
subcategory=str(metadata.get("subcategory") or ""),
|
|
257
|
+
category_path=[str(item) for item in metadata.get("category_path") or []],
|
|
258
|
+
related_categories=[str(item) for item in metadata.get("related_categories") or []],
|
|
259
|
+
relation_notes=[str(item) for item in metadata.get("relation_notes") or []],
|
|
260
|
+
related_items=metadata.get("related_items") or [],
|
|
261
|
+
)
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _front_matter(text: str) -> dict:
|
|
265
|
+
if not text.startswith("---\n"):
|
|
266
|
+
return {}
|
|
267
|
+
end = text.find("\n---\n", 4)
|
|
268
|
+
if end < 0:
|
|
269
|
+
return {}
|
|
270
|
+
data = yaml.safe_load(text[4:end]) or {}
|
|
271
|
+
return data if isinstance(data, dict) else {}
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def _failed_chunk_source(text: str) -> str:
|
|
275
|
+
marker = "## failed_chunk_source"
|
|
276
|
+
start = text.find(marker)
|
|
277
|
+
if start < 0:
|
|
278
|
+
return ""
|
|
279
|
+
source = text[start + len(marker):].strip()
|
|
280
|
+
if source.startswith("```"):
|
|
281
|
+
first_line = source.find("\n")
|
|
282
|
+
if first_line >= 0:
|
|
283
|
+
source = source[first_line + 1:]
|
|
284
|
+
if source.endswith("```"):
|
|
285
|
+
source = source[:-3]
|
|
286
|
+
return source.strip()
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def _choose_existing_result_action(output_dir: Path, existing: list[Path]) -> str:
|
|
290
|
+
"""询问用户如何处理已有生成结果。"""
|
|
98
291
|
print(f"found {len(existing)} existing file(s) in {output_dir}.")
|
|
99
|
-
print("
|
|
100
|
-
print(
|
|
101
|
-
|
|
102
|
-
|
|
292
|
+
print("Choose how to continue:")
|
|
293
|
+
print("1. delete and rebuild")
|
|
294
|
+
print("2. resume from checkpoint")
|
|
295
|
+
print("3. retry failed files")
|
|
296
|
+
print("4. exit")
|
|
297
|
+
answer = input("Select [1/2/3/4]: ").strip().lower().translate(
|
|
298
|
+
str.maketrans({"1": "1", "2": "2", "3": "3", "4": "4"})
|
|
299
|
+
)
|
|
300
|
+
if answer.startswith("1") or answer in {"d", "delete", "rebuild", "r"}:
|
|
301
|
+
return "rebuild"
|
|
302
|
+
if answer.startswith("2") or answer in {"resume", "continue", "c"}:
|
|
303
|
+
return "resume"
|
|
304
|
+
if answer.startswith("3") or answer in {"retry", "failed", "f"}:
|
|
305
|
+
return "retry"
|
|
306
|
+
return "exit"
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def _load_progress(path: Path) -> dict | None:
|
|
310
|
+
"""读取断点续传状态。"""
|
|
311
|
+
if not path.exists():
|
|
312
|
+
return None
|
|
313
|
+
try:
|
|
314
|
+
data = json.loads(path.read_text(encoding="utf-8"))
|
|
315
|
+
except (OSError, json.JSONDecodeError) as exc:
|
|
316
|
+
print(f"WARNING: failed to read checkpoint: {exc}")
|
|
317
|
+
return None
|
|
318
|
+
if not isinstance(data, dict):
|
|
319
|
+
return None
|
|
320
|
+
return data
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
def _save_progress(
|
|
324
|
+
path: Path,
|
|
325
|
+
*,
|
|
326
|
+
input_path: Path,
|
|
327
|
+
output_dir: Path,
|
|
328
|
+
files: list[Path],
|
|
329
|
+
run_timestamp: str,
|
|
330
|
+
run_trace_id: str,
|
|
331
|
+
source_order: int,
|
|
332
|
+
total_items: int,
|
|
333
|
+
file_index: int,
|
|
334
|
+
block_index: int,
|
|
335
|
+
status: str,
|
|
336
|
+
error: str = "",
|
|
337
|
+
) -> None:
|
|
338
|
+
"""保存 draft 断点续传状态。"""
|
|
339
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
340
|
+
current_file = files[file_index] if 0 <= file_index < len(files) else None
|
|
341
|
+
payload = {
|
|
342
|
+
"version": 1,
|
|
343
|
+
"status": status,
|
|
344
|
+
"error": error,
|
|
345
|
+
"input_path": str(input_path),
|
|
346
|
+
"output_dir": str(output_dir),
|
|
347
|
+
"run_timestamp": run_timestamp,
|
|
348
|
+
"run_trace_id": run_trace_id,
|
|
349
|
+
"source_order": source_order,
|
|
350
|
+
"total_items": total_items,
|
|
351
|
+
"file_index": file_index,
|
|
352
|
+
"block_index": block_index,
|
|
353
|
+
"current_file": str(current_file) if current_file else "",
|
|
354
|
+
"current_file_name": current_file.name if current_file else "",
|
|
355
|
+
"files": [str(path) for path in files],
|
|
356
|
+
"updated_at": _make_timestamp(),
|
|
357
|
+
}
|
|
358
|
+
path.write_text(
|
|
359
|
+
json.dumps(payload, ensure_ascii=False, indent=2) + "\n",
|
|
360
|
+
encoding="utf-8",
|
|
361
|
+
)
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def _make_timestamp() -> str:
|
|
365
|
+
"""生成用于文件名和断点记录的本地时间戳。"""
|
|
366
|
+
return datetime.now().strftime("%Y%m%d%H%M%S")
|
|
103
367
|
|
|
104
368
|
|
|
105
369
|
def _clear_generated_files(*dirs: Path) -> None:
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
tools:
|
|
2
|
+
- name: "query_member_change_announcements"
|
|
3
|
+
display_name: "查询成员机构变更通知"
|
|
4
|
+
description: "查询成员机构近期变更、变更报备、计划变更、执行评价以及影响范围。"
|
|
5
|
+
trigger: "当用户询问成员机构变更通知、变更报备、计划变更或执行评价时优先调用。"
|
|
6
|
+
input_schema:
|
|
7
|
+
type: "object"
|
|
8
|
+
properties:
|
|
9
|
+
orgCode:
|
|
10
|
+
type: "string"
|
|
11
|
+
description: "成员机构金融编码,通常由 resolve_member_org 工具获得。"
|
|
12
|
+
required: ["orgCode"]
|
|
13
|
+
output_schema:
|
|
14
|
+
type: "array"
|
|
15
|
+
description: "匹配到的成员机构变更通知列表。"
|
|
16
|
+
items:
|
|
17
|
+
type: "object"
|
|
18
|
+
properties:
|
|
19
|
+
member_code:
|
|
20
|
+
type: "string"
|
|
21
|
+
description: "成员机构编码。"
|
|
22
|
+
member_name:
|
|
23
|
+
type: "string"
|
|
24
|
+
description: "成员机构名称。"
|
|
25
|
+
change_type:
|
|
26
|
+
type: "string"
|
|
27
|
+
description: "变更类型。"
|