union_kb_ingest 1.0.8 → 1.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -43,7 +43,7 @@ input/
43
43
  python ingest.py draft
44
44
  ```
45
45
 
46
- 如果 `result/` 中已有生成文件,命令会先询问是否覆盖。选择 `y` 后会清空 `result/` 中已有生成文件,再重新生成;选择其他内容会直接退出,避免多次生成结果相互影响。
46
+ 如果 `result/` 中已有生成文件,命令会提示选择删除重建、从断点继续或退出。断点状态保存在 `result/.draft_progress.json`,大模型多次重试失败退出时会记录当前文件和片段位置,下次可选择从断点继续。
47
47
 
48
48
  只解析为中间 Markdown:
49
49
 
@@ -3,7 +3,7 @@ llm:
3
3
  timeout_seconds: 120
4
4
  max_tokens: 8192
5
5
  temperature: 0.1
6
- api_key: "15f066c4509845038027ea5746524af5.w4CLSC6ODiKVC1wK"
6
+ api_key: ""
7
7
  model: "GLM-4.7-Flash"
8
8
  base_url: "https://open.bigmodel.cn/api/paas/v4/"
9
9
 
package/ingest.py CHANGED
@@ -2,10 +2,13 @@
2
2
  from __future__ import annotations
3
3
 
4
4
  import argparse
5
+ import json
5
6
  import sys
6
7
  from dataclasses import replace
8
+ from datetime import datetime
7
9
  from pathlib import Path
8
10
  from typing import List
11
+ from uuid import uuid4
9
12
 
10
13
  CURRENT_DIR = Path(__file__).resolve().parent
11
14
  if str(CURRENT_DIR) not in sys.path:
@@ -21,6 +24,7 @@ from writer import write_item
21
24
 
22
25
 
23
26
  IGNORED_EXISTING_FILES = {".gitkeep", ".DS_Store"}
27
+ PROGRESS_FILENAME = ".draft_progress.json"
24
28
 
25
29
 
26
30
  def cmd_parse(args) -> int:
@@ -45,21 +49,44 @@ def cmd_draft(args) -> int:
45
49
  output_dir = Path(args.output)
46
50
 
47
51
  existing = _list_effective_files(output_dir)
48
- if existing and not _confirm_overwrite(output_dir, existing):
49
- print("aborted. existing files were kept.")
50
- return 0
51
-
52
+ progress_path = output_dir / PROGRESS_FILENAME
53
+ resume_state = None
52
54
  if existing:
53
- _clear_generated_files(output_dir)
55
+ action = _choose_existing_result_action(output_dir, existing)
56
+ if action == "exit":
57
+ print("aborted. existing files were kept.")
58
+ return 0
59
+ if action == "rebuild":
60
+ _clear_generated_files(output_dir)
61
+ elif action == "resume":
62
+ resume_state = _load_progress(progress_path)
63
+ if not resume_state:
64
+ print(f"aborted. no usable checkpoint found at {progress_path}.")
65
+ return 1
54
66
 
55
67
  output_dir.mkdir(parents=True, exist_ok=True)
56
68
 
57
- total_items = 0
58
- source_order = 0
69
+ run_timestamp = (
70
+ str(resume_state.get("run_timestamp"))
71
+ if resume_state
72
+ else _make_timestamp()
73
+ )
74
+ run_trace_id = (
75
+ str(resume_state.get("run_trace_id"))
76
+ if resume_state
77
+ else uuid4().hex[:8]
78
+ )
79
+ total_items = int(resume_state.get("total_items", 0)) if resume_state else 0
80
+ source_order = int(resume_state.get("source_order", 0)) if resume_state else 0
59
81
  draft_config = get_draft_config()
60
82
  max_chars = args.max_chars or draft_config.max_chars
61
83
  files = iter_input_files(input_path)
62
- for path in files:
84
+ start_file_index = int(resume_state.get("file_index", 0)) if resume_state else 0
85
+ start_block_index = int(resume_state.get("block_index", 0)) if resume_state else 0
86
+
87
+ for file_index, path in enumerate(files):
88
+ if file_index < start_file_index:
89
+ continue
63
90
  parsed = parse_document(path)
64
91
  blocks = split_blocks(parsed.blocks, max_chars=max_chars)
65
92
  blocks = _attach_block_context(
@@ -67,15 +94,76 @@ def cmd_draft(args) -> int:
67
94
  context_chars=draft_config.context_chars,
68
95
  outline_max_sections=draft_config.outline_max_sections,
69
96
  )
70
- for block in blocks:
71
- for item in normalize_block(block, status=args.status):
97
+ block_start = start_block_index if file_index == start_file_index else 0
98
+ for block_index, block in enumerate(blocks):
99
+ if block_index < block_start:
100
+ continue
101
+ _save_progress(
102
+ progress_path,
103
+ input_path=input_path,
104
+ output_dir=output_dir,
105
+ files=files,
106
+ run_timestamp=run_timestamp,
107
+ run_trace_id=run_trace_id,
108
+ source_order=source_order,
109
+ total_items=total_items,
110
+ file_index=file_index,
111
+ block_index=block_index,
112
+ status="running",
113
+ )
114
+ try:
115
+ items = normalize_block(block, status=args.status)
116
+ except SystemExit as exc:
117
+ _save_progress(
118
+ progress_path,
119
+ input_path=input_path,
120
+ output_dir=output_dir,
121
+ files=files,
122
+ run_timestamp=run_timestamp,
123
+ run_trace_id=run_trace_id,
124
+ source_order=source_order,
125
+ total_items=total_items,
126
+ file_index=file_index,
127
+ block_index=block_index,
128
+ status="failed",
129
+ error=f"SystemExit({exc.code})",
130
+ )
131
+ print(
132
+ "checkpoint saved. "
133
+ f"file={path.name} block={block_index + 1}/{len(blocks)} "
134
+ f"progress={progress_path}"
135
+ )
136
+ raise
137
+ for item in items:
72
138
  source_order += 1
73
139
  item.source_order = source_order
74
140
  item.source_pages = sorted(set(block.pages))
75
141
  item.source_trace = _source_trace(block)
76
- write_item(item, output_dir)
142
+ write_item(
143
+ item,
144
+ output_dir,
145
+ source_title=Path(block.source_doc).stem,
146
+ timestamp=run_timestamp,
147
+ trace_id=run_trace_id,
148
+ )
77
149
  total_items += 1
150
+ _save_progress(
151
+ progress_path,
152
+ input_path=input_path,
153
+ output_dir=output_dir,
154
+ files=files,
155
+ run_timestamp=run_timestamp,
156
+ run_trace_id=run_trace_id,
157
+ source_order=source_order,
158
+ total_items=total_items,
159
+ file_index=file_index,
160
+ block_index=block_index + 1,
161
+ status="running",
162
+ )
78
163
  print(f"drafted: {path} blocks={len(blocks)}")
164
+ start_block_index = 0
165
+ if progress_path.exists():
166
+ progress_path.unlink()
79
167
  print(f"done. files={len(files)} draft_items={total_items} output={output_dir}")
80
168
  return 0
81
169
 
@@ -90,16 +178,81 @@ def _list_effective_files(path: Path) -> list[Path]:
90
178
  )
91
179
 
92
180
 
93
- def _confirm_overwrite(
94
- output_dir: Path,
95
- existing: list[Path],
96
- ) -> bool:
97
- """询问用户是否覆盖已有生成文件。"""
181
+ def _choose_existing_result_action(output_dir: Path, existing: list[Path]) -> str:
182
+ """询问用户如何处理已有生成结果。"""
98
183
  print(f"found {len(existing)} existing file(s) in {output_dir}.")
99
- print("Continuing will delete existing generated files under:")
100
- print(f"- {output_dir}")
101
- answer = input("Overwrite and continue? [y/N]: ").strip().lower()
102
- return answer in {"y", "yes"}
184
+ print("Choose how to continue:")
185
+ print("1. delete and rebuild")
186
+ print("2. resume from checkpoint")
187
+ print("3. exit")
188
+ answer = input("Select [1/2/3]: ").strip().lower().translate(
189
+ str.maketrans({"1": "1", "2": "2", "3": "3"})
190
+ )
191
+ if answer.startswith("1") or answer in {"d", "delete", "rebuild", "r"}:
192
+ return "rebuild"
193
+ if answer.startswith("2") or answer in {"resume", "continue", "c"}:
194
+ return "resume"
195
+ return "exit"
196
+
197
+
198
+ def _load_progress(path: Path) -> dict | None:
199
+ """读取断点续传状态。"""
200
+ if not path.exists():
201
+ return None
202
+ try:
203
+ data = json.loads(path.read_text(encoding="utf-8"))
204
+ except (OSError, json.JSONDecodeError) as exc:
205
+ print(f"WARNING: failed to read checkpoint: {exc}")
206
+ return None
207
+ if not isinstance(data, dict):
208
+ return None
209
+ return data
210
+
211
+
212
+ def _save_progress(
213
+ path: Path,
214
+ *,
215
+ input_path: Path,
216
+ output_dir: Path,
217
+ files: list[Path],
218
+ run_timestamp: str,
219
+ run_trace_id: str,
220
+ source_order: int,
221
+ total_items: int,
222
+ file_index: int,
223
+ block_index: int,
224
+ status: str,
225
+ error: str = "",
226
+ ) -> None:
227
+ """保存 draft 断点续传状态。"""
228
+ path.parent.mkdir(parents=True, exist_ok=True)
229
+ current_file = files[file_index] if 0 <= file_index < len(files) else None
230
+ payload = {
231
+ "version": 1,
232
+ "status": status,
233
+ "error": error,
234
+ "input_path": str(input_path),
235
+ "output_dir": str(output_dir),
236
+ "run_timestamp": run_timestamp,
237
+ "run_trace_id": run_trace_id,
238
+ "source_order": source_order,
239
+ "total_items": total_items,
240
+ "file_index": file_index,
241
+ "block_index": block_index,
242
+ "current_file": str(current_file) if current_file else "",
243
+ "current_file_name": current_file.name if current_file else "",
244
+ "files": [str(path) for path in files],
245
+ "updated_at": _make_timestamp(),
246
+ }
247
+ path.write_text(
248
+ json.dumps(payload, ensure_ascii=False, indent=2) + "\n",
249
+ encoding="utf-8",
250
+ )
251
+
252
+
253
+ def _make_timestamp() -> str:
254
+ """生成用于文件名和断点记录的本地时间戳。"""
255
+ return datetime.now().strftime("%Y%m%d%H%M%S")
103
256
 
104
257
 
105
258
  def _clear_generated_files(*dirs: Path) -> None:
package/normalizer.py CHANGED
@@ -9,6 +9,8 @@ from functools import lru_cache
9
9
  from pathlib import Path
10
10
  from typing import Any, Dict, List
11
11
 
12
+ import yaml
13
+
12
14
  from app_config import get_llm_config
13
15
  from schemas import DOC_TYPES, KnowledgeItem, ParsedBlock
14
16
 
@@ -17,7 +19,7 @@ DEFAULT_DOMAIN = "网联清算业务"
17
19
  DEFAULT_OWNER = "网联清算业务知识库"
18
20
  CURRENT_DIR = Path(__file__).resolve().parent
19
21
  KB_SPEC_PATH = CURRENT_DIR / "prompts" / "知识库建立规范.md"
20
- TOOLS_PATH = CURRENT_DIR / "input" / "function" / "tools.json"
22
+ TOOLS_PATH = CURRENT_DIR / "input" / "function" / "tools.yaml"
21
23
  LLM_MAX_RETRIES = 10
22
24
  COVERAGE_MAX_RETRIES = 3
23
25
 
@@ -743,9 +745,11 @@ def _read_kb_spec() -> str:
743
745
  def _read_tools() -> List[Dict[str, Any]]:
744
746
  """读取本地工具维护文件。"""
745
747
  try:
746
- raw = json.loads(TOOLS_PATH.read_text(encoding="utf-8"))
747
- except (FileNotFoundError, json.JSONDecodeError):
748
+ raw = yaml.safe_load(TOOLS_PATH.read_text(encoding="utf-8"))
749
+ except (FileNotFoundError, yaml.YAMLError):
748
750
  return []
751
+ if isinstance(raw, dict):
752
+ raw = raw.get("tools")
749
753
  if not isinstance(raw, list):
750
754
  return []
751
755
  return [item for item in raw if isinstance(item, dict)]
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "union_kb_ingest",
3
- "version": "1.0.8",
3
+ "version": "1.0.10",
4
4
  "description": "Offline knowledge-base ingest helper for PDF, Word, Markdown and TXT documents.",
5
5
  "bin": {
6
6
  "union_kb_ingest": "bin/union_kb_ingest"
package/parser.py CHANGED
@@ -21,7 +21,7 @@ def iter_input_files(input_path: Path) -> List[Path]:
21
21
  p for p in input_path.rglob("*")
22
22
  if p.is_file() and p.suffix.lower() in SUPPORTED_EXTENSIONS
23
23
  ]
24
- return sorted(files)
24
+ return sorted(files, key=lambda path: (path.name.lower(), str(path).lower()))
25
25
 
26
26
 
27
27
  def parse_document(path: Path) -> ParsedDocument:
package/writer.py CHANGED
@@ -8,10 +8,25 @@ import yaml
8
8
  from schemas import KnowledgeItem
9
9
 
10
10
 
11
- def write_item(item: KnowledgeItem, output_dir: Path) -> Path:
11
+ def write_item(
12
+ item: KnowledgeItem,
13
+ output_dir: Path,
14
+ *,
15
+ source_title: str | None = None,
16
+ timestamp: str | None = None,
17
+ trace_id: str | None = None,
18
+ ) -> Path:
12
19
  """把知识条目渲染并写入输出目录。"""
13
20
  output_dir.mkdir(parents=True, exist_ok=True)
14
- path = output_dir / f"{_order_prefix(item.source_order)}-{_safe_filename(item.kb_id)}.md"
21
+ parts = [
22
+ source_title,
23
+ timestamp,
24
+ trace_id,
25
+ _order_prefix(item.source_order),
26
+ item.kb_id,
27
+ ]
28
+ filename = "-".join(_safe_filename(part) for part in parts if part)
29
+ path = output_dir / f"{filename}.md"
15
30
  path.write_text(render_markdown(item), encoding="utf-8")
16
31
  return path
17
32
 
@@ -30,7 +45,7 @@ def render_markdown(item: KnowledgeItem) -> str:
30
45
  def _safe_filename(value: str) -> str:
31
46
  """把标识符转换为安全文件名。"""
32
47
  value = value.lower().strip()
33
- value = re.sub(r"[^a-z0-9._-]+", "-", value)
48
+ value = re.sub(r"[^\w._-]+", "-", value)
34
49
  value = re.sub(r"-+", "-", value).strip("-")
35
50
  return value or "kb-item"
36
51