union_kb_ingest 1.0.8 → 1.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/config/config.yaml +1 -1
- package/ingest.py +173 -20
- package/normalizer.py +7 -3
- package/package.json +1 -1
- package/parser.py +1 -1
- package/writer.py +18 -3
package/README.md
CHANGED
|
@@ -43,7 +43,7 @@ input/
|
|
|
43
43
|
python ingest.py draft
|
|
44
44
|
```
|
|
45
45
|
|
|
46
|
-
如果 `result/`
|
|
46
|
+
如果 `result/` 中已有生成文件,命令会提示选择删除重建、从断点继续或退出。断点状态保存在 `result/.draft_progress.json`,大模型多次重试失败退出时会记录当前文件和片段位置,下次可选择从断点继续。
|
|
47
47
|
|
|
48
48
|
只解析为中间 Markdown:
|
|
49
49
|
|
package/config/config.yaml
CHANGED
package/ingest.py
CHANGED
|
@@ -2,10 +2,13 @@
|
|
|
2
2
|
from __future__ import annotations
|
|
3
3
|
|
|
4
4
|
import argparse
|
|
5
|
+
import json
|
|
5
6
|
import sys
|
|
6
7
|
from dataclasses import replace
|
|
8
|
+
from datetime import datetime
|
|
7
9
|
from pathlib import Path
|
|
8
10
|
from typing import List
|
|
11
|
+
from uuid import uuid4
|
|
9
12
|
|
|
10
13
|
CURRENT_DIR = Path(__file__).resolve().parent
|
|
11
14
|
if str(CURRENT_DIR) not in sys.path:
|
|
@@ -21,6 +24,7 @@ from writer import write_item
|
|
|
21
24
|
|
|
22
25
|
|
|
23
26
|
IGNORED_EXISTING_FILES = {".gitkeep", ".DS_Store"}
|
|
27
|
+
PROGRESS_FILENAME = ".draft_progress.json"
|
|
24
28
|
|
|
25
29
|
|
|
26
30
|
def cmd_parse(args) -> int:
|
|
@@ -45,21 +49,44 @@ def cmd_draft(args) -> int:
|
|
|
45
49
|
output_dir = Path(args.output)
|
|
46
50
|
|
|
47
51
|
existing = _list_effective_files(output_dir)
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
return 0
|
|
51
|
-
|
|
52
|
+
progress_path = output_dir / PROGRESS_FILENAME
|
|
53
|
+
resume_state = None
|
|
52
54
|
if existing:
|
|
53
|
-
|
|
55
|
+
action = _choose_existing_result_action(output_dir, existing)
|
|
56
|
+
if action == "exit":
|
|
57
|
+
print("aborted. existing files were kept.")
|
|
58
|
+
return 0
|
|
59
|
+
if action == "rebuild":
|
|
60
|
+
_clear_generated_files(output_dir)
|
|
61
|
+
elif action == "resume":
|
|
62
|
+
resume_state = _load_progress(progress_path)
|
|
63
|
+
if not resume_state:
|
|
64
|
+
print(f"aborted. no usable checkpoint found at {progress_path}.")
|
|
65
|
+
return 1
|
|
54
66
|
|
|
55
67
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
56
68
|
|
|
57
|
-
|
|
58
|
-
|
|
69
|
+
run_timestamp = (
|
|
70
|
+
str(resume_state.get("run_timestamp"))
|
|
71
|
+
if resume_state
|
|
72
|
+
else _make_timestamp()
|
|
73
|
+
)
|
|
74
|
+
run_trace_id = (
|
|
75
|
+
str(resume_state.get("run_trace_id"))
|
|
76
|
+
if resume_state
|
|
77
|
+
else uuid4().hex[:8]
|
|
78
|
+
)
|
|
79
|
+
total_items = int(resume_state.get("total_items", 0)) if resume_state else 0
|
|
80
|
+
source_order = int(resume_state.get("source_order", 0)) if resume_state else 0
|
|
59
81
|
draft_config = get_draft_config()
|
|
60
82
|
max_chars = args.max_chars or draft_config.max_chars
|
|
61
83
|
files = iter_input_files(input_path)
|
|
62
|
-
|
|
84
|
+
start_file_index = int(resume_state.get("file_index", 0)) if resume_state else 0
|
|
85
|
+
start_block_index = int(resume_state.get("block_index", 0)) if resume_state else 0
|
|
86
|
+
|
|
87
|
+
for file_index, path in enumerate(files):
|
|
88
|
+
if file_index < start_file_index:
|
|
89
|
+
continue
|
|
63
90
|
parsed = parse_document(path)
|
|
64
91
|
blocks = split_blocks(parsed.blocks, max_chars=max_chars)
|
|
65
92
|
blocks = _attach_block_context(
|
|
@@ -67,15 +94,76 @@ def cmd_draft(args) -> int:
|
|
|
67
94
|
context_chars=draft_config.context_chars,
|
|
68
95
|
outline_max_sections=draft_config.outline_max_sections,
|
|
69
96
|
)
|
|
70
|
-
|
|
71
|
-
|
|
97
|
+
block_start = start_block_index if file_index == start_file_index else 0
|
|
98
|
+
for block_index, block in enumerate(blocks):
|
|
99
|
+
if block_index < block_start:
|
|
100
|
+
continue
|
|
101
|
+
_save_progress(
|
|
102
|
+
progress_path,
|
|
103
|
+
input_path=input_path,
|
|
104
|
+
output_dir=output_dir,
|
|
105
|
+
files=files,
|
|
106
|
+
run_timestamp=run_timestamp,
|
|
107
|
+
run_trace_id=run_trace_id,
|
|
108
|
+
source_order=source_order,
|
|
109
|
+
total_items=total_items,
|
|
110
|
+
file_index=file_index,
|
|
111
|
+
block_index=block_index,
|
|
112
|
+
status="running",
|
|
113
|
+
)
|
|
114
|
+
try:
|
|
115
|
+
items = normalize_block(block, status=args.status)
|
|
116
|
+
except SystemExit as exc:
|
|
117
|
+
_save_progress(
|
|
118
|
+
progress_path,
|
|
119
|
+
input_path=input_path,
|
|
120
|
+
output_dir=output_dir,
|
|
121
|
+
files=files,
|
|
122
|
+
run_timestamp=run_timestamp,
|
|
123
|
+
run_trace_id=run_trace_id,
|
|
124
|
+
source_order=source_order,
|
|
125
|
+
total_items=total_items,
|
|
126
|
+
file_index=file_index,
|
|
127
|
+
block_index=block_index,
|
|
128
|
+
status="failed",
|
|
129
|
+
error=f"SystemExit({exc.code})",
|
|
130
|
+
)
|
|
131
|
+
print(
|
|
132
|
+
"checkpoint saved. "
|
|
133
|
+
f"file={path.name} block={block_index + 1}/{len(blocks)} "
|
|
134
|
+
f"progress={progress_path}"
|
|
135
|
+
)
|
|
136
|
+
raise
|
|
137
|
+
for item in items:
|
|
72
138
|
source_order += 1
|
|
73
139
|
item.source_order = source_order
|
|
74
140
|
item.source_pages = sorted(set(block.pages))
|
|
75
141
|
item.source_trace = _source_trace(block)
|
|
76
|
-
write_item(
|
|
142
|
+
write_item(
|
|
143
|
+
item,
|
|
144
|
+
output_dir,
|
|
145
|
+
source_title=Path(block.source_doc).stem,
|
|
146
|
+
timestamp=run_timestamp,
|
|
147
|
+
trace_id=run_trace_id,
|
|
148
|
+
)
|
|
77
149
|
total_items += 1
|
|
150
|
+
_save_progress(
|
|
151
|
+
progress_path,
|
|
152
|
+
input_path=input_path,
|
|
153
|
+
output_dir=output_dir,
|
|
154
|
+
files=files,
|
|
155
|
+
run_timestamp=run_timestamp,
|
|
156
|
+
run_trace_id=run_trace_id,
|
|
157
|
+
source_order=source_order,
|
|
158
|
+
total_items=total_items,
|
|
159
|
+
file_index=file_index,
|
|
160
|
+
block_index=block_index + 1,
|
|
161
|
+
status="running",
|
|
162
|
+
)
|
|
78
163
|
print(f"drafted: {path} blocks={len(blocks)}")
|
|
164
|
+
start_block_index = 0
|
|
165
|
+
if progress_path.exists():
|
|
166
|
+
progress_path.unlink()
|
|
79
167
|
print(f"done. files={len(files)} draft_items={total_items} output={output_dir}")
|
|
80
168
|
return 0
|
|
81
169
|
|
|
@@ -90,16 +178,81 @@ def _list_effective_files(path: Path) -> list[Path]:
|
|
|
90
178
|
)
|
|
91
179
|
|
|
92
180
|
|
|
93
|
-
def
|
|
94
|
-
|
|
95
|
-
existing: list[Path],
|
|
96
|
-
) -> bool:
|
|
97
|
-
"""询问用户是否覆盖已有生成文件。"""
|
|
181
|
+
def _choose_existing_result_action(output_dir: Path, existing: list[Path]) -> str:
|
|
182
|
+
"""询问用户如何处理已有生成结果。"""
|
|
98
183
|
print(f"found {len(existing)} existing file(s) in {output_dir}.")
|
|
99
|
-
print("
|
|
100
|
-
print(
|
|
101
|
-
|
|
102
|
-
|
|
184
|
+
print("Choose how to continue:")
|
|
185
|
+
print("1. delete and rebuild")
|
|
186
|
+
print("2. resume from checkpoint")
|
|
187
|
+
print("3. exit")
|
|
188
|
+
answer = input("Select [1/2/3]: ").strip().lower().translate(
|
|
189
|
+
str.maketrans({"1": "1", "2": "2", "3": "3"})
|
|
190
|
+
)
|
|
191
|
+
if answer.startswith("1") or answer in {"d", "delete", "rebuild", "r"}:
|
|
192
|
+
return "rebuild"
|
|
193
|
+
if answer.startswith("2") or answer in {"resume", "continue", "c"}:
|
|
194
|
+
return "resume"
|
|
195
|
+
return "exit"
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def _load_progress(path: Path) -> dict | None:
|
|
199
|
+
"""读取断点续传状态。"""
|
|
200
|
+
if not path.exists():
|
|
201
|
+
return None
|
|
202
|
+
try:
|
|
203
|
+
data = json.loads(path.read_text(encoding="utf-8"))
|
|
204
|
+
except (OSError, json.JSONDecodeError) as exc:
|
|
205
|
+
print(f"WARNING: failed to read checkpoint: {exc}")
|
|
206
|
+
return None
|
|
207
|
+
if not isinstance(data, dict):
|
|
208
|
+
return None
|
|
209
|
+
return data
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def _save_progress(
    path: Path,
    *,
    input_path: Path,
    output_dir: Path,
    files: list[Path],
    run_timestamp: str,
    run_trace_id: str,
    source_order: int,
    total_items: int,
    file_index: int,
    block_index: int,
    status: str,
    error: str = "",
) -> None:
    """Persist the draft checkpoint so an interrupted run can be resumed.

    Args:
        path: Checkpoint file to write; its parent directory is created.
        input_path: Input location of the run, stored as a string.
        output_dir: Output directory of the run, stored as a string.
        files: Ordered input files of the run, stored as strings.
        run_timestamp: Timestamp string shared by the whole run.
        run_trace_id: Trace identifier shared by the whole run.
        source_order: Running item order counter reached so far.
        total_items: Number of items written so far.
        file_index: Index into ``files`` of the file being processed.
        block_index: Index of the block to process next within that file.
        status: Free-form state label recorded in the checkpoint.
        error: Optional error description recorded in the checkpoint.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # An out-of-range index simply records no current file.
    current_file = files[file_index] if 0 <= file_index < len(files) else None
    payload = {
        "version": 1,
        "status": status,
        "error": error,
        "input_path": str(input_path),
        "output_dir": str(output_dir),
        "run_timestamp": run_timestamp,
        "run_trace_id": run_trace_id,
        "source_order": source_order,
        "total_items": total_items,
        "file_index": file_index,
        "block_index": block_index,
        "current_file": str(current_file) if current_file is not None else "",
        "current_file_name": current_file.name if current_file is not None else "",
        "files": [str(file_path) for file_path in files],
        "updated_at": _make_timestamp(),
    }
    serialized = json.dumps(payload, ensure_ascii=False, indent=2) + "\n"
    # Write to a sibling temp file and rename it over the target. An
    # interruption mid-write then leaves the previous checkpoint intact
    # instead of a truncated JSON file that can no longer be resumed from.
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        tmp_path.write_text(serialized, encoding="utf-8")
        tmp_path.replace(path)
    finally:
        # After a successful replace the temp file is gone; this only
        # cleans up a partial temp file left by a failed write.
        if tmp_path.exists():
            tmp_path.unlink()
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def _make_timestamp() -> str:
|
|
254
|
+
"""生成用于文件名和断点记录的本地时间戳。"""
|
|
255
|
+
return datetime.now().strftime("%Y%m%d%H%M%S")
|
|
103
256
|
|
|
104
257
|
|
|
105
258
|
def _clear_generated_files(*dirs: Path) -> None:
|
package/normalizer.py
CHANGED
|
@@ -9,6 +9,8 @@ from functools import lru_cache
|
|
|
9
9
|
from pathlib import Path
|
|
10
10
|
from typing import Any, Dict, List
|
|
11
11
|
|
|
12
|
+
import yaml
|
|
13
|
+
|
|
12
14
|
from app_config import get_llm_config
|
|
13
15
|
from schemas import DOC_TYPES, KnowledgeItem, ParsedBlock
|
|
14
16
|
|
|
@@ -17,7 +19,7 @@ DEFAULT_DOMAIN = "网联清算业务"
|
|
|
17
19
|
DEFAULT_OWNER = "网联清算业务知识库"
|
|
18
20
|
CURRENT_DIR = Path(__file__).resolve().parent
|
|
19
21
|
KB_SPEC_PATH = CURRENT_DIR / "prompts" / "知识库建立规范.md"
|
|
20
|
-
TOOLS_PATH = CURRENT_DIR / "input" / "function" / "tools.
|
|
22
|
+
TOOLS_PATH = CURRENT_DIR / "input" / "function" / "tools.yaml"
|
|
21
23
|
LLM_MAX_RETRIES = 10
|
|
22
24
|
COVERAGE_MAX_RETRIES = 3
|
|
23
25
|
|
|
@@ -743,9 +745,11 @@ def _read_kb_spec() -> str:
|
|
|
743
745
|
def _read_tools() -> List[Dict[str, Any]]:
    """Load the locally maintained tool definitions from ``TOOLS_PATH``.

    The YAML document may be either a list of tool mappings or a mapping
    with a ``tools`` key holding that list. A missing file, a YAML parse
    error, or any other document shape yields an empty list; non-mapping
    entries are dropped.
    """
    try:
        text = TOOLS_PATH.read_text(encoding="utf-8")
        loaded = yaml.safe_load(text)
    except (FileNotFoundError, yaml.YAMLError):
        return []
    # Unwrap the ``{"tools": [...]}`` form; a bare list is used as-is.
    candidates = loaded.get("tools") if isinstance(loaded, dict) else loaded
    if isinstance(candidates, list):
        return [entry for entry in candidates if isinstance(entry, dict)]
    return []
|
package/package.json
CHANGED
package/parser.py
CHANGED
|
@@ -21,7 +21,7 @@ def iter_input_files(input_path: Path) -> List[Path]:
|
|
|
21
21
|
p for p in input_path.rglob("*")
|
|
22
22
|
if p.is_file() and p.suffix.lower() in SUPPORTED_EXTENSIONS
|
|
23
23
|
]
|
|
24
|
-
return sorted(files)
|
|
24
|
+
return sorted(files, key=lambda path: (path.name.lower(), str(path).lower()))
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
def parse_document(path: Path) -> ParsedDocument:
|
package/writer.py
CHANGED
|
@@ -8,10 +8,25 @@ import yaml
|
|
|
8
8
|
from schemas import KnowledgeItem
|
|
9
9
|
|
|
10
10
|
|
|
11
|
-
def write_item(
|
|
11
|
+
def write_item(
    item: KnowledgeItem,
    output_dir: Path,
    *,
    source_title: str | None = None,
    timestamp: str | None = None,
    trace_id: str | None = None,
) -> Path:
    """Render a knowledge item to Markdown and write it into ``output_dir``.

    The file name joins, with ``-``, the sanitized non-empty values of
    ``source_title``, ``timestamp``, ``trace_id``, the item's order prefix
    and its ``kb_id``, followed by the ``.md`` suffix.

    Returns:
        The path of the file that was written.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    name_pieces = (
        source_title,
        timestamp,
        trace_id,
        _order_prefix(item.source_order),
        item.kb_id,
    )
    # Empty or missing pieces are left out rather than producing "--".
    sanitized = [_safe_filename(piece) for piece in name_pieces if piece]
    stem = "-".join(sanitized)
    target = output_dir / f"{stem}.md"
    target.write_text(render_markdown(item), encoding="utf-8")
    return target
|
|
17
32
|
|
|
@@ -30,7 +45,7 @@ def render_markdown(item: KnowledgeItem) -> str:
|
|
|
30
45
|
def _safe_filename(value: str) -> str:
|
|
31
46
|
"""把标识符转换为安全文件名。"""
|
|
32
47
|
value = value.lower().strip()
|
|
33
|
-
value = re.sub(r"[
|
|
48
|
+
value = re.sub(r"[^\w._-]+", "-", value)
|
|
34
49
|
value = re.sub(r"-+", "-", value).strip("-")
|
|
35
50
|
return value or "kb-item"
|
|
36
51
|
|