union_kb_ingest 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +100 -0
- package/app_config.py +84 -0
- package/approved/.gitkeep +1 -0
- package/bin/union_kb_ingest +5 -0
- package/config/config.yaml +8 -0
- package/drafts/.gitkeep +1 -0
- package/ingest.py +157 -0
- package/input/.gitkeep +1 -0
- package/input/pdf/.gitkeep +1 -0
- package/input/word/.gitkeep +1 -0
- package/normalizer.py +413 -0
- package/package.json +27 -0
- package/parsed/.gitkeep +1 -0
- package/parser.py +287 -0
- package/prompts/generate_kb_items.md +27 -0
- package/prompts//350/201/224/345/220/210/350/277/220/347/273/264/347/237/245/350/257/206/345/272/223/345/273/272/347/253/213/350/247/204/350/214/203.md +272 -0
- package/requirements.txt +9 -0
- package/schemas.py +85 -0
- package/splitter.py +127 -0
- package/validator.py +72 -0
- package/writer.py +33 -0
package/README.md
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
# 离线知识库文件生成工具
|
|
2
|
+
|
|
3
|
+
这个目录是离线辅助工具,不参与线上应用运行。
|
|
4
|
+
|
|
5
|
+
目标:
|
|
6
|
+
|
|
7
|
+
1. 批量读取 PDF、Word、Markdown、TXT 文档。
|
|
8
|
+
2. 通过 Docling slim 的离线文本解析能力生成统一 Markdown 中间格式。
|
|
9
|
+
3. 按章节、场景、规则、指标等粒度切割。
|
|
10
|
+
4. 可选调用大模型,把内容整理为项目知识库规范要求的 Markdown 文件。
|
|
11
|
+
5. 默认生成 `status: draft` 草稿,不进入现有 RAG 检索。
|
|
12
|
+
|
|
13
|
+
启用大模型时,工具会把 `prompts/联合运维知识库建立规范.md` 作为格式和质量约束放入提示词,要求模型依据原文语义判断业务场景、模块、角色、标签和风险等级。代码中的启发式生成只作为未启用大模型或调用失败时的兜底,不使用预设业务关键词去指导大模型输出。
|
|
14
|
+
|
|
15
|
+
## 安装可选依赖
|
|
16
|
+
|
|
17
|
+
不建议把这些依赖加入项目根 `requirements.txt`。离线机器单独安装即可:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
python -m pip install -r requirements.txt
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
解析层不使用 OCR,不加载本地视觉/版面模型,也不访问远程模型服务:
|
|
24
|
+
|
|
25
|
+
- PDF:使用 `docling-parse` 抽取 PDF 内嵌文本和文本行顺序;扫描件或图片型 PDF 不会识别。
|
|
26
|
+
- DOCX:使用 Docling 的 Word 后端转为 Markdown。
|
|
27
|
+
- 旧版 `.doc`:通过 LibreOffice `soffice` 转为 `.docx` 后再解析;不使用 OCR。
|
|
28
|
+
- Markdown / TXT:作为已文本化材料直接读取。
|
|
29
|
+
|
|
30
|
+
不要安装 `docling` 或 `docling-slim[standard]`,它们会引入 OCR、版面/表格模型、Torch/ONNXRuntime 等重依赖,并可能在运行时下载模型。内网机器建议为离线工具单独准备 Python 3.10+ 环境。
|
|
31
|
+
|
|
32
|
+
## 基本用法
|
|
33
|
+
|
|
34
|
+
把文件放入:
|
|
35
|
+
|
|
36
|
+
```text
|
|
37
|
+
input/
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
生成草稿:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
python ingest.py draft
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
如果 `drafts/` 中已有草稿文件,命令会先询问是否覆盖。选择 `y` 后会清空 `drafts/`、`approved/` 和 `result/` 中已有生成文件,再重新生成;选择其他内容会直接退出,避免多次生成结果相互影响。
|
|
47
|
+
|
|
48
|
+
只解析为中间 Markdown:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
python ingest.py parse
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
校验草稿:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
python ingest.py validate
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
审核后复制到知识库:
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
python ingest.py promote
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
默认目录为 `input/`、`parsed/`、`drafts/`、`approved/` 和 `result/`。只有需要处理其他目录时,才使用 `--input`、`--output`、`--approved-dir` 或 `--result-dir` 覆盖。
|
|
67
|
+
|
|
68
|
+
## 大模型配置
|
|
69
|
+
|
|
70
|
+
默认不强制调用大模型,会使用启发式模板生成 `draft` 文件。
|
|
71
|
+
|
|
72
|
+
如果要启用大模型整理,修改 `config/config.yaml`:
|
|
73
|
+
|
|
74
|
+
```yaml
|
|
75
|
+
llm:
|
|
76
|
+
enabled: true
|
|
77
|
+
base_url: "https://your-model-endpoint"
|
|
78
|
+
api_key: "your-api-key"
|
|
79
|
+
model: "your-model"
|
|
80
|
+
timeout_seconds: 120
|
|
81
|
+
max_tokens: 4096
|
|
82
|
+
temperature: 0.1
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
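也可以使用环境变量覆盖配置文件中的对应配置(另外还支持 `KB_LLM_TIMEOUT_SECONDS`、`KB_LLM_MAX_TOKENS`、`KB_LLM_TEMPERATURE`,以及用 `KB_INGEST_CONFIG` 指定配置文件路径):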
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
export KB_LLM_ENABLED=true
|
|
89
|
+
export KB_LLM_BASE_URL="https://your-model-endpoint"
|
|
90
|
+
export KB_LLM_API_KEY="your-api-key"
|
|
91
|
+
export KB_LLM_MODEL="your-model"
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
`base_url` 必须填写完整的大模型调用地址,工具不会自动拼接任何路径。工具不 import 项目 `src` 代码。
|
|
95
|
+
|
|
96
|
+
## 与线上项目的关系
|
|
97
|
+
|
|
98
|
+
这个工具只产出符合规范的 `*.md` 文件。确认无误后,人工把 `status: draft` 改为 `active` 并放入 `approved/`,再运行 `python ingest.py promote` 复制到 `result/`,后续由线上知识库加载流程处理。
|
|
99
|
+
|
|
100
|
+
建议线上打包时排除整个 `tools/kb_ingest` 目录。
|
package/app_config.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from functools import lru_cache
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Dict
|
|
8
|
+
|
|
9
|
+
import yaml
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Directory that contains this module.
CURRENT_DIR = Path(__file__).resolve().parent
# Config file used when KB_INGEST_CONFIG does not point elsewhere.
DEFAULT_CONFIG_PATH = CURRENT_DIR.joinpath("config", "config.yaml")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass(frozen=True)
class LlmConfig:
    """Immutable settings for the optional large-language-model call.

    The defaults below are what an instance holds when it is created
    without arguments.
    """

    # Whether the LLM step is used at all; off by default.
    enabled: bool = False
    # Endpoint address of the model service.
    base_url: str = ""
    # Credential sent to the model service.
    api_key: str = ""
    # Model identifier.
    model: str = ""
    # Request timeout, in seconds.
    timeout_seconds: int = 120
    # Token limit passed to the model.
    max_tokens: int = 4096
    # Sampling temperature.
    temperature: float = 0.1
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@lru_cache(maxsize=1)
def get_llm_config() -> LlmConfig:
    """Build the LLM settings from the config file and the environment.

    For every field an environment variable (``KB_LLM_*``) takes precedence
    over the ``llm`` section of the config file. The result is memoized by
    ``lru_cache``, so the file and environment are read on the first call
    only.

    Returns:
        The resolved, immutable ``LlmConfig``.
    """
    section = _read_config().get("llm", {})
    # A non-mapping "llm" entry is treated the same as a missing one.
    file_values = section if isinstance(section, dict) else {}

    def text_setting(env_name: str, key: str) -> str:
        # Environment wins; a missing/empty file value becomes "".
        return os.environ.get(env_name, str(file_values.get(key) or ""))

    return LlmConfig(
        enabled=_env_bool("KB_LLM_ENABLED", _as_bool(file_values.get("enabled"), False)),
        base_url=text_setting("KB_LLM_BASE_URL", "base_url"),
        api_key=text_setting("KB_LLM_API_KEY", "api_key"),
        model=text_setting("KB_LLM_MODEL", "model"),
        timeout_seconds=_env_int("KB_LLM_TIMEOUT_SECONDS", file_values.get("timeout_seconds"), 120),
        max_tokens=_env_int("KB_LLM_MAX_TOKENS", file_values.get("max_tokens"), 4096),
        temperature=_env_float("KB_LLM_TEMPERATURE", file_values.get("temperature"), 0.1),
    )
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _read_config() -> Dict[str, Any]:
|
|
45
|
+
path = Path(os.environ.get("KB_INGEST_CONFIG", DEFAULT_CONFIG_PATH))
|
|
46
|
+
if not path.exists():
|
|
47
|
+
return {}
|
|
48
|
+
data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
|
|
49
|
+
return data if isinstance(data, dict) else {}
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _env_bool(name: str, default: bool) -> bool:
|
|
53
|
+
value = os.environ.get(name)
|
|
54
|
+
if value is None:
|
|
55
|
+
return default
|
|
56
|
+
return _as_bool(value, default)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _env_int(name: str, value: Any, default: int) -> int:
|
|
60
|
+
raw = os.environ.get(name, value)
|
|
61
|
+
try:
|
|
62
|
+
return int(raw)
|
|
63
|
+
except (TypeError, ValueError):
|
|
64
|
+
return default
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _env_float(name: str, value: Any, default: float) -> float:
|
|
68
|
+
raw = os.environ.get(name, value)
|
|
69
|
+
try:
|
|
70
|
+
return float(raw)
|
|
71
|
+
except (TypeError, ValueError):
|
|
72
|
+
return default
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _as_bool(value: Any, default: bool) -> bool:
|
|
76
|
+
if isinstance(value, bool):
|
|
77
|
+
return value
|
|
78
|
+
if isinstance(value, str):
|
|
79
|
+
normalized = value.strip().lower()
|
|
80
|
+
if normalized in {"1", "true", "yes", "y", "on"}:
|
|
81
|
+
return True
|
|
82
|
+
if normalized in {"0", "false", "no", "n", "off"}:
|
|
83
|
+
return False
|
|
84
|
+
return default
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
package/drafts/.gitkeep
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
package/ingest.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import shutil
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
# Directory that contains this script.
CURRENT_DIR = Path(__file__).resolve().parent
# Put that directory first on sys.path (once) so the imports that follow
# can find modules located next to this file.
if not any(entry == str(CURRENT_DIR) for entry in sys.path):
    sys.path.insert(0, str(CURRENT_DIR))
|
|
12
|
+
|
|
13
|
+
from normalizer import normalize_block
|
|
14
|
+
from parser import iter_input_files, parse_document
|
|
15
|
+
from splitter import split_blocks
|
|
16
|
+
from validator import validate_dir
|
|
17
|
+
from writer import write_item
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# File names that do not count as existing output (and are never deleted).
IGNORED_EXISTING_FILES = {
    ".DS_Store",
    ".gitkeep",
}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def cmd_parse(args) -> int:
    """Parse every input document into an intermediate Markdown file.

    Each file returned by ``iter_input_files(args.input)`` is parsed with
    ``parse_document`` and its ``markdown`` is written into ``args.output``
    (created if missing) as ``<stem>.parsed.md``. When two inputs would map
    to the same output name (for example ``a.pdf`` and ``a.docx``), later
    ones get a disambiguated name instead of overwriting the earlier file.

    Returns:
        Always 0.
    """
    input_path = Path(args.input)
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    files = iter_input_files(input_path)
    taken: set[str] = set()  # case-folded output names used in this run
    for path in files:
        parsed = parse_document(path)
        out = output_dir / _unique_parsed_name(path, taken)
        out.write_text(parsed.markdown, encoding="utf-8")
        print(f"parsed: {path} -> {out}")
    print(f"done. files={len(files)}")
    return 0


def _unique_parsed_name(path: Path, taken: set) -> str:
    """Return an output name for *path* that is not in *taken*, and record it.

    Preference order: ``<stem>.parsed.md``, then ``<stem>.<ext>.parsed.md``,
    then ``<stem>.<ext>.<n>.parsed.md`` with n = 2, 3, ...
    Names are compared case-folded so they also stay distinct on
    case-insensitive file systems.
    """
    stem = path.stem
    ext = path.suffix.lstrip(".").lower()
    base = f"{stem}.{ext}" if ext else stem

    candidates = [f"{stem}.parsed.md"]
    if ext:
        candidates.append(f"{base}.parsed.md")
    counter = 2
    while True:
        for name in candidates:
            key = name.casefold()
            if key not in taken:
                taken.add(key)
                return name
        candidates = [f"{base}.{counter}.parsed.md"]
        counter += 1
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def cmd_draft(args) -> int:
    """Generate draft knowledge files from the input documents.

    If the output directory already holds files (ignoring placeholder
    names), the user is asked to confirm; on refusal nothing is changed,
    on confirmation the output, approved and result directories are
    cleared first. Each input file is then parsed, split into blocks with
    ``args.max_chars``, normalized with ``status="draft"`` and every
    resulting item is written to the output directory.

    Returns:
        Always 0, including when the user declines to overwrite.
    """
    source = Path(args.input)
    drafts_dir = Path(args.output)
    approved_dir = Path(args.approved_dir)
    result_dir = Path(args.result_dir)

    leftovers = _list_effective_files(drafts_dir)
    if leftovers:
        if not _confirm_overwrite(drafts_dir, approved_dir, result_dir, leftovers):
            print("aborted. existing files were kept.")
            return 0
        _clear_generated_files(drafts_dir, approved_dir, result_dir)

    drafts_dir.mkdir(parents=True, exist_ok=True)

    written = 0
    documents = iter_input_files(source)
    for document in documents:
        parsed = parse_document(document)
        blocks = split_blocks(parsed.blocks, max_chars=args.max_chars)
        for block in blocks:
            for item in normalize_block(block, status="draft"):
                write_item(item, drafts_dir)
                written += 1
        print(f"drafted: {document} blocks={len(blocks)}")
    print(f"done. files={len(documents)} draft_items={written} output={drafts_dir}")
    return 0
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _list_effective_files(path: Path) -> list[Path]:
|
|
69
|
+
if not path.exists():
|
|
70
|
+
return []
|
|
71
|
+
return sorted(
|
|
72
|
+
p for p in path.rglob("*")
|
|
73
|
+
if p.is_file() and p.name not in IGNORED_EXISTING_FILES
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _confirm_overwrite(
|
|
78
|
+
output_dir: Path,
|
|
79
|
+
approved_dir: Path,
|
|
80
|
+
result_dir: Path,
|
|
81
|
+
existing: list[Path],
|
|
82
|
+
) -> bool:
|
|
83
|
+
print(f"found {len(existing)} existing file(s) in {output_dir}.")
|
|
84
|
+
print("Continuing will delete existing generated files under:")
|
|
85
|
+
print(f"- {output_dir}")
|
|
86
|
+
print(f"- {approved_dir}")
|
|
87
|
+
print(f"- {result_dir}")
|
|
88
|
+
answer = input("Overwrite and continue? [y/N]: ").strip().lower()
|
|
89
|
+
return answer in {"y", "yes"}
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _clear_generated_files(*dirs: Path) -> None:
|
|
93
|
+
for directory in dirs:
|
|
94
|
+
for path in _list_effective_files(directory):
|
|
95
|
+
path.unlink()
|
|
96
|
+
print(f"deleted: {path}")
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def cmd_validate(args) -> int:
    """Validate the Markdown files under ``args.input`` and print the findings.

    Every issue returned by ``validate_dir`` is printed as
    ``level: path: message``, followed by a total count.

    Returns:
        1 if at least one issue has level "error", otherwise 0.
    """
    found = validate_dir(Path(args.input))
    for entry in found:
        print(f"{entry.level}: {entry.path}: {entry.message}")
    print(f"done. issues={len(found)}")

    for entry in found:
        if entry.level == "error":
            return 1
    return 0
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def cmd_promote(args) -> int:
    """Copy reviewed Markdown files into the result directory.

    Every regular ``*.md`` file found recursively under ``args.input`` is
    copied with ``shutil.copy2`` to ``args.result_dir`` (created if missing)
    under its bare file name, so the result directory is flat.

    Because the result is flat, two source files with the same name in
    different subfolders would overwrite each other. The first one (in
    sorted path order) is copied; later ones are skipped and reported.

    Returns:
        0 when every file was copied, 1 when at least one file was skipped
        because of a name collision.
    """
    input_dir = Path(args.input)
    result_dir = Path(args.result_dir)
    result_dir.mkdir(parents=True, exist_ok=True)

    promoted: dict[str, Path] = {}  # target file name -> source it came from
    skipped = 0
    for path in sorted(input_dir.rglob("*.md")):
        # The glob also matches directories whose name ends in ".md".
        if not path.is_file():
            continue
        earlier = promoted.get(path.name)
        if earlier is not None:
            skipped += 1
            print(f"skipped: {path} (name already promoted from {earlier})")
            continue
        target = result_dir / path.name
        shutil.copy2(path, target)
        promoted[path.name] = path
        print(f"promoted: {path} -> {target}")

    summary = f"done. promoted={len(promoted)}"
    if skipped:
        summary += f" skipped={skipped}"
    print(summary)
    return 1 if skipped else 0
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser.

    Subcommands: ``parse``, ``draft``, ``validate`` and ``promote``. Each
    stores its handler in ``func`` and defaults its directory options to
    folders located next to this script.
    """

    def beside_script(folder: str) -> str:
        # Default location of a working folder, as a string for argparse.
        return str(CURRENT_DIR / folder)

    root = argparse.ArgumentParser(description="Offline document-to-knowledge Markdown generator.")
    commands = root.add_subparsers(dest="command", required=True)

    parse_cmd = commands.add_parser("parse", help="Parse input documents to intermediate Markdown.")
    parse_cmd.add_argument("--input", default=beside_script("input"))
    parse_cmd.add_argument("--output", default=beside_script("parsed"))
    parse_cmd.set_defaults(func=cmd_parse)

    draft_cmd = commands.add_parser("draft", help="Generate draft knowledge files.")
    draft_cmd.add_argument("--input", default=beside_script("input"))
    draft_cmd.add_argument("--output", default=beside_script("drafts"))
    draft_cmd.add_argument("--approved-dir", default=beside_script("approved"))
    draft_cmd.add_argument("--result-dir", default=beside_script("result"))
    draft_cmd.add_argument("--max-chars", type=int, default=8000)
    draft_cmd.set_defaults(func=cmd_draft)

    validate_cmd = commands.add_parser("validate", help="Validate generated Markdown files.")
    validate_cmd.add_argument("--input", default=beside_script("drafts"))
    validate_cmd.set_defaults(func=cmd_validate)

    promote_cmd = commands.add_parser("promote", help="Copy reviewed files to result.")
    promote_cmd.add_argument("--input", default=beside_script("approved"))
    promote_cmd.add_argument("--result-dir", default=beside_script("result"))
    promote_cmd.set_defaults(func=cmd_promote)

    return root
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse the command line and run the selected subcommand.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the behaviour of a plain
            ``main()`` call; passing a list allows programmatic invocation.

    Returns:
        The exit code returned by the subcommand handler.
    """
    parser = build_parser()
    args = parser.parse_args(argv)
    return args.func(args)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
# Script entry point: exit the process with the code returned by main().
if __name__ == "__main__":
    sys.exit(main())
|
package/input/.gitkeep
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|