union_kb_ingest 1.0.4 → 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/app_config.py +9 -0
- package/config/config.yaml +2 -2
- package/ingest.py +15 -16
- package/normalizer.py +191 -100
- package/package.json +1 -1
- package/parser.py +47 -34
- package/schemas.py +6 -0
- package/splitter.py +13 -29
- package/validator.py +10 -0
- package/writer.py +4 -0
package/app_config.py
CHANGED
|
@@ -15,6 +15,7 @@ DEFAULT_CONFIG_PATH = CURRENT_DIR / "config" / "config.yaml"
|
|
|
15
15
|
|
|
16
16
|
@dataclass(frozen=True)
|
|
17
17
|
class LlmConfig:
|
|
18
|
+
"""LLM 调用配置,支持配置文件和环境变量覆盖。"""
|
|
18
19
|
enabled: bool = False
|
|
19
20
|
base_url: str = ""
|
|
20
21
|
api_key: str = ""
|
|
@@ -26,6 +27,7 @@ class LlmConfig:
|
|
|
26
27
|
|
|
27
28
|
@dataclass(frozen=True)
|
|
28
29
|
class DraftConfig:
|
|
30
|
+
"""草稿生成阶段的切分和上下文配置。"""
|
|
29
31
|
max_chars: int = 3600
|
|
30
32
|
context_chars: int = 800
|
|
31
33
|
outline_max_sections: int = 40
|
|
@@ -33,6 +35,7 @@ class DraftConfig:
|
|
|
33
35
|
|
|
34
36
|
@lru_cache(maxsize=1)
|
|
35
37
|
def get_llm_config() -> LlmConfig:
|
|
38
|
+
"""读取并合并 LLM 配置。"""
|
|
36
39
|
raw = _read_config().get("llm", {})
|
|
37
40
|
if not isinstance(raw, dict):
|
|
38
41
|
raw = {}
|
|
@@ -50,6 +53,7 @@ def get_llm_config() -> LlmConfig:
|
|
|
50
53
|
|
|
51
54
|
@lru_cache(maxsize=1)
|
|
52
55
|
def get_draft_config() -> DraftConfig:
|
|
56
|
+
"""读取并合并草稿生成配置。"""
|
|
53
57
|
raw = _read_config().get("draft", {})
|
|
54
58
|
if not isinstance(raw, dict):
|
|
55
59
|
raw = {}
|
|
@@ -62,6 +66,7 @@ def get_draft_config() -> DraftConfig:
|
|
|
62
66
|
|
|
63
67
|
|
|
64
68
|
def _read_config() -> Dict[str, Any]:
|
|
69
|
+
"""读取 YAML 配置文件并返回字典。"""
|
|
65
70
|
path = Path(os.environ.get("KB_INGEST_CONFIG", DEFAULT_CONFIG_PATH))
|
|
66
71
|
if not path.exists():
|
|
67
72
|
return {}
|
|
@@ -70,6 +75,7 @@ def _read_config() -> Dict[str, Any]:
|
|
|
70
75
|
|
|
71
76
|
|
|
72
77
|
def _env_bool(name: str, default: bool) -> bool:
|
|
78
|
+
"""读取布尔环境变量并回退到默认值。"""
|
|
73
79
|
value = os.environ.get(name)
|
|
74
80
|
if value is None:
|
|
75
81
|
return default
|
|
@@ -77,6 +83,7 @@ def _env_bool(name: str, default: bool) -> bool:
|
|
|
77
83
|
|
|
78
84
|
|
|
79
85
|
def _env_int(name: str, value: Any, default: int) -> int:
|
|
86
|
+
"""读取整数环境变量并回退到默认值。"""
|
|
80
87
|
raw = os.environ.get(name, value)
|
|
81
88
|
try:
|
|
82
89
|
return int(raw)
|
|
@@ -85,6 +92,7 @@ def _env_int(name: str, value: Any, default: int) -> int:
|
|
|
85
92
|
|
|
86
93
|
|
|
87
94
|
def _env_float(name: str, value: Any, default: float) -> float:
|
|
95
|
+
"""读取浮点环境变量并回退到默认值。"""
|
|
88
96
|
raw = os.environ.get(name, value)
|
|
89
97
|
try:
|
|
90
98
|
return float(raw)
|
|
@@ -93,6 +101,7 @@ def _env_float(name: str, value: Any, default: float) -> float:
|
|
|
93
101
|
|
|
94
102
|
|
|
95
103
|
def _as_bool(value: Any, default: bool) -> bool:
|
|
104
|
+
"""把配置值转换为布尔值。"""
|
|
96
105
|
if isinstance(value, bool):
|
|
97
106
|
return value
|
|
98
107
|
if isinstance(value, str):
|
package/config/config.yaml
CHANGED
package/ingest.py
CHANGED
|
@@ -3,6 +3,7 @@ from __future__ import annotations
|
|
|
3
3
|
|
|
4
4
|
import argparse
|
|
5
5
|
import sys
|
|
6
|
+
from dataclasses import replace
|
|
6
7
|
from pathlib import Path
|
|
7
8
|
from typing import List
|
|
8
9
|
|
|
@@ -23,6 +24,7 @@ IGNORED_EXISTING_FILES = {".gitkeep", ".DS_Store"}
|
|
|
23
24
|
|
|
24
25
|
|
|
25
26
|
def cmd_parse(args) -> int:
|
|
27
|
+
"""执行解析子命令。"""
|
|
26
28
|
input_path = Path(args.input)
|
|
27
29
|
output_dir = Path(args.output)
|
|
28
30
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
@@ -38,6 +40,7 @@ def cmd_parse(args) -> int:
|
|
|
38
40
|
|
|
39
41
|
|
|
40
42
|
def cmd_draft(args) -> int:
|
|
43
|
+
"""执行草稿生成子命令。"""
|
|
41
44
|
input_path = Path(args.input)
|
|
42
45
|
output_dir = Path(args.output)
|
|
43
46
|
|
|
@@ -78,6 +81,7 @@ def cmd_draft(args) -> int:
|
|
|
78
81
|
|
|
79
82
|
|
|
80
83
|
def _list_effective_files(path: Path) -> list[Path]:
|
|
84
|
+
"""列出目录下需要考虑的已有文件。"""
|
|
81
85
|
if not path.exists():
|
|
82
86
|
return []
|
|
83
87
|
return sorted(
|
|
@@ -90,6 +94,7 @@ def _confirm_overwrite(
|
|
|
90
94
|
output_dir: Path,
|
|
91
95
|
existing: list[Path],
|
|
92
96
|
) -> bool:
|
|
97
|
+
"""询问用户是否覆盖已有生成文件。"""
|
|
93
98
|
print(f"found {len(existing)} existing file(s) in {output_dir}.")
|
|
94
99
|
print("Continuing will delete existing generated files under:")
|
|
95
100
|
print(f"- {output_dir}")
|
|
@@ -98,6 +103,7 @@ def _confirm_overwrite(
|
|
|
98
103
|
|
|
99
104
|
|
|
100
105
|
def _clear_generated_files(*dirs: Path) -> None:
|
|
106
|
+
"""删除指定目录下的已有生成文件。"""
|
|
101
107
|
for directory in dirs:
|
|
102
108
|
for path in _list_effective_files(directory):
|
|
103
109
|
path.unlink()
|
|
@@ -109,6 +115,7 @@ def _attach_block_context(
|
|
|
109
115
|
context_chars: int,
|
|
110
116
|
outline_max_sections: int,
|
|
111
117
|
) -> List[ParsedBlock]:
|
|
118
|
+
"""为片段附加目录和邻近片段上下文。"""
|
|
112
119
|
if context_chars <= 0:
|
|
113
120
|
return blocks
|
|
114
121
|
|
|
@@ -137,28 +144,15 @@ def _attach_block_context(
|
|
|
137
144
|
f"章节:{blocks[idx + 1].source_section}\n"
|
|
138
145
|
f"{_compact_context_text(blocks[idx + 1].content, context_chars // 2)}"
|
|
139
146
|
)
|
|
140
|
-
output.append(
|
|
141
|
-
ParsedBlock(
|
|
142
|
-
source_section=block.source_section,
|
|
143
|
-
content=block.content,
|
|
144
|
-
pages=block.pages,
|
|
145
|
-
order=block.order,
|
|
147
|
+
output.append(replace(
|
|
148
|
+
block,
|
|
146
149
|
context="\n\n".join(parts),
|
|
147
|
-
category=block.category,
|
|
148
|
-
category_description=block.category_description,
|
|
149
|
-
category_keywords=block.category_keywords,
|
|
150
|
-
source_doc_description=block.source_doc_description,
|
|
151
|
-
subcategory=block.subcategory,
|
|
152
|
-
subcategory_description=block.subcategory_description,
|
|
153
|
-
category_path=block.category_path,
|
|
154
|
-
related_categories=block.related_categories,
|
|
155
|
-
relation_notes=block.relation_notes,
|
|
156
|
-
related_items=block.related_items,
|
|
157
150
|
))
|
|
158
151
|
return output
|
|
159
152
|
|
|
160
153
|
|
|
161
154
|
def _document_outline(blocks: List[ParsedBlock], max_sections: int) -> str:
|
|
155
|
+
"""生成文档片段目录摘要。"""
|
|
162
156
|
sections = []
|
|
163
157
|
seen = set()
|
|
164
158
|
for block in blocks:
|
|
@@ -176,6 +170,7 @@ def _document_outline(blocks: List[ParsedBlock], max_sections: int) -> str:
|
|
|
176
170
|
|
|
177
171
|
|
|
178
172
|
def _compact_context_text(text: str, limit: int) -> str:
|
|
173
|
+
"""压缩上下文文本到指定长度。"""
|
|
179
174
|
compact = " ".join(text.split())
|
|
180
175
|
if limit <= 0 or len(compact) <= limit:
|
|
181
176
|
return compact
|
|
@@ -183,6 +178,7 @@ def _compact_context_text(text: str, limit: int) -> str:
|
|
|
183
178
|
|
|
184
179
|
|
|
185
180
|
def _source_trace(block: ParsedBlock) -> str:
|
|
181
|
+
"""生成来源章节和页码追踪信息。"""
|
|
186
182
|
parts = [f"section={block.source_section}"]
|
|
187
183
|
if block.pages:
|
|
188
184
|
parts.append(f"pages={','.join(map(str, sorted(set(block.pages))))}")
|
|
@@ -190,6 +186,7 @@ def _source_trace(block: ParsedBlock) -> str:
|
|
|
190
186
|
|
|
191
187
|
|
|
192
188
|
def cmd_validate(args) -> int:
|
|
189
|
+
"""执行校验子命令。"""
|
|
193
190
|
issues = validate_dir(Path(args.input))
|
|
194
191
|
for issue in issues:
|
|
195
192
|
print(f"{issue.level}: {issue.path}: {issue.message}")
|
|
@@ -198,6 +195,7 @@ def cmd_validate(args) -> int:
|
|
|
198
195
|
|
|
199
196
|
|
|
200
197
|
def build_parser() -> argparse.ArgumentParser:
|
|
198
|
+
"""构建命令行参数解析器。"""
|
|
201
199
|
parser = argparse.ArgumentParser(description="Offline document-to-knowledge Markdown generator.")
|
|
202
200
|
sub = parser.add_subparsers(dest="command", required=True)
|
|
203
201
|
|
|
@@ -221,6 +219,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
221
219
|
|
|
222
220
|
|
|
223
221
|
def main() -> int:
|
|
222
|
+
"""命令行入口。"""
|
|
224
223
|
parser = build_parser()
|
|
225
224
|
args = parser.parse_args()
|
|
226
225
|
return args.func(args)
|