union_kb_ingest 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +100 -0
- package/app_config.py +84 -0
- package/approved/.gitkeep +1 -0
- package/bin/union_kb_ingest +5 -0
- package/config/config.yaml +8 -0
- package/drafts/.gitkeep +1 -0
- package/ingest.py +157 -0
- package/input/.gitkeep +1 -0
- package/input/pdf/.gitkeep +1 -0
- package/input/word/.gitkeep +1 -0
- package/normalizer.py +413 -0
- package/package.json +27 -0
- package/parsed/.gitkeep +1 -0
- package/parser.py +287 -0
- package/prompts/generate_kb_items.md +27 -0
- package/prompts/联合运维知识库建立规范.md +272 -0
- package/requirements.txt +9 -0
- package/schemas.py +85 -0
- package/splitter.py +127 -0
- package/validator.py +72 -0
- package/writer.py +33 -0
package/splitter.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import List
|
|
5
|
+
|
|
6
|
+
from schemas import ParsedBlock
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def split_blocks(blocks: List[ParsedBlock], max_chars: int = 8000) -> List[ParsedBlock]:
    """Merge, window, and filter parsed blocks.

    The input is first passed through ``_merge_parent_child_blocks``; every
    resulting block is then handed to ``_split_one`` with ``max_chars`` as its
    size limit. Pieces whose stripped content is shorter than 40 characters
    are left out of the returned list.
    """
    pieces: List[ParsedBlock] = [
        piece
        for merged_block in _merge_parent_child_blocks(blocks)
        for piece in _split_one(merged_block, max_chars=max_chars)
    ]
    # Filtering happens only after every block has been split, as a second pass.
    return [piece for piece in pieces if len(piece.content.strip()) >= 40]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _merge_parent_child_blocks(blocks: List[ParsedBlock]) -> List[ParsedBlock]:
|
|
17
|
+
merged: List[ParsedBlock] = []
|
|
18
|
+
current: ParsedBlock | None = None
|
|
19
|
+
current_prefix = ""
|
|
20
|
+
|
|
21
|
+
for block in blocks:
|
|
22
|
+
prefix = _section_prefix(block.source_section)
|
|
23
|
+
if prefix and _is_parent_section(prefix, block.content):
|
|
24
|
+
if current:
|
|
25
|
+
merged.append(current)
|
|
26
|
+
current = block
|
|
27
|
+
current_prefix = prefix
|
|
28
|
+
continue
|
|
29
|
+
|
|
30
|
+
if current and current_prefix and _is_child_section(current_prefix, block.source_section):
|
|
31
|
+
current = ParsedBlock(
|
|
32
|
+
source_doc=current.source_doc,
|
|
33
|
+
source_section=current.source_section,
|
|
34
|
+
content=f"{current.content.rstrip()}\n\n{block.content.strip()}",
|
|
35
|
+
pages=sorted(set(current.pages + block.pages)),
|
|
36
|
+
order=current.order,
|
|
37
|
+
)
|
|
38
|
+
continue
|
|
39
|
+
|
|
40
|
+
if current:
|
|
41
|
+
merged.append(current)
|
|
42
|
+
current = None
|
|
43
|
+
current_prefix = ""
|
|
44
|
+
merged.append(block)
|
|
45
|
+
|
|
46
|
+
if current:
|
|
47
|
+
merged.append(current)
|
|
48
|
+
return merged
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _section_prefix(title: str) -> str:
|
|
52
|
+
match = re.match(r"^(\d+\.\d+)(?!\.)", title.strip())
|
|
53
|
+
return match.group(1) if match else ""
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _is_parent_section(prefix: str, content: str) -> bool:
|
|
57
|
+
return bool(prefix) and len(content.strip()) < 120
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _is_child_section(parent_prefix: str, title: str) -> bool:
|
|
61
|
+
return title.strip().startswith(f"{parent_prefix}.")
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _split_one(block: ParsedBlock, max_chars: int) -> List[ParsedBlock]:
|
|
65
|
+
content = block.content.strip()
|
|
66
|
+
if len(content) <= max_chars:
|
|
67
|
+
return [block]
|
|
68
|
+
|
|
69
|
+
candidates = _split_by_heading_window(content, max_chars)
|
|
70
|
+
|
|
71
|
+
output: List[ParsedBlock] = []
|
|
72
|
+
for idx, text in enumerate(candidates, start=1):
|
|
73
|
+
text = text.strip()
|
|
74
|
+
if not text:
|
|
75
|
+
continue
|
|
76
|
+
output.append(ParsedBlock(
|
|
77
|
+
source_doc=block.source_doc,
|
|
78
|
+
source_section=f"{block.source_section} / 片段 {idx}",
|
|
79
|
+
content=text,
|
|
80
|
+
pages=block.pages,
|
|
81
|
+
order=block.order * 100 + idx,
|
|
82
|
+
))
|
|
83
|
+
return output
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _split_by_heading_window(text: str, max_chars: int) -> List[str]:
|
|
87
|
+
sections = [p.strip() for p in re.split(r"\n(?=#{2,4}\s+)", text) if p.strip()]
|
|
88
|
+
if len(sections) <= 1:
|
|
89
|
+
return _split_by_paragraph_window(text, max_chars)
|
|
90
|
+
|
|
91
|
+
chunks: List[str] = []
|
|
92
|
+
current: List[str] = []
|
|
93
|
+
current_len = 0
|
|
94
|
+
for section in sections:
|
|
95
|
+
if current and current_len + len(section) > max_chars:
|
|
96
|
+
chunks.append("\n\n".join(current))
|
|
97
|
+
current = []
|
|
98
|
+
current_len = 0
|
|
99
|
+
if len(section) > max_chars:
|
|
100
|
+
if current:
|
|
101
|
+
chunks.append("\n\n".join(current))
|
|
102
|
+
current = []
|
|
103
|
+
current_len = 0
|
|
104
|
+
chunks.extend(_split_by_paragraph_window(section, max_chars))
|
|
105
|
+
continue
|
|
106
|
+
current.append(section)
|
|
107
|
+
current_len += len(section)
|
|
108
|
+
if current:
|
|
109
|
+
chunks.append("\n\n".join(current))
|
|
110
|
+
return chunks or [text]
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _split_by_paragraph_window(text: str, max_chars: int) -> List[str]:
|
|
114
|
+
paragraphs = [p.strip() for p in re.split(r"\n\s*\n", text) if p.strip()]
|
|
115
|
+
chunks: List[str] = []
|
|
116
|
+
current: List[str] = []
|
|
117
|
+
current_len = 0
|
|
118
|
+
for paragraph in paragraphs:
|
|
119
|
+
if current and current_len + len(paragraph) > max_chars:
|
|
120
|
+
chunks.append("\n\n".join(current))
|
|
121
|
+
current = []
|
|
122
|
+
current_len = 0
|
|
123
|
+
current.append(paragraph)
|
|
124
|
+
current_len += len(paragraph)
|
|
125
|
+
if current:
|
|
126
|
+
chunks.append("\n\n".join(current))
|
|
127
|
+
return chunks or [text]
|
package/validator.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Dict, List, Tuple
|
|
6
|
+
|
|
7
|
+
import yaml
|
|
8
|
+
|
|
9
|
+
from schemas import DOC_TYPES, ValidationIssue
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Front-matter keys that validate_file reports as an error when they are absent.
REQUIRED_FIELDS = [
    "kb_id", "title", "doc_type", "domain", "business_modules",
    "source_doc", "source_version", "source_section",
    "owner", "confidentiality", "risk_level", "applicable_roles",
    "tags", "status",
]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def validate_dir(input_dir: Path) -> List[ValidationIssue]:
    """Validate every ``*.md`` file beneath ``input_dir``.

    Files are found recursively and visited in sorted path order; the issues
    returned by ``validate_file`` for each one are concatenated in that order.
    """
    markdown_files = sorted(input_dir.rglob("*.md"))
    return [
        issue
        for markdown_path in markdown_files
        for issue in validate_file(markdown_path)
    ]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def validate_file(path: Path) -> List[ValidationIssue]:
    """Check a single Markdown knowledge item and list what is wrong with it.

    If the file cannot be read or parsed by ``_read_markdown``, a single
    "error" issue carrying the exception text is returned. Otherwise the
    checks run in this order:

    * error for each key of ``REQUIRED_FIELDS`` missing from the metadata;
    * error when ``doc_type`` is not in ``DOC_TYPES``;
    * warning when ``status`` is ``"active"`` but ``review_status`` is not
      ``"approved"``;
    * warning when the body is longer than 3000 or shorter than 200 characters;
    * warning when the body has fewer than three ``## `` headings.
    """
    try:
        metadata, body = _read_markdown(path)
    except Exception as exc:
        return [ValidationIssue(path, "error", f"无法读取 Markdown Front Matter: {exc}")]

    issues: List[ValidationIssue] = [
        ValidationIssue(path, "error", f"缺少必填字段: {field}")
        for field in REQUIRED_FIELDS
        if field not in metadata
    ]

    doc_type = metadata.get("doc_type")
    if doc_type not in DOC_TYPES:
        issues.append(ValidationIssue(path, "error", f"doc_type 不合法: {doc_type}"))

    is_active = metadata.get("status") == "active"
    if is_active and metadata.get("review_status") != "approved":
        issues.append(ValidationIssue(path, "warning", "active 条目建议先设置 review_status: approved"))

    body_length = len(body)
    if body_length > 3000:
        issues.append(ValidationIssue(path, "warning", f"正文较长，建议继续拆分: {body_length} 字符"))
    if body_length < 200:
        issues.append(ValidationIssue(path, "warning", f"正文较短，可能信息不足: {body_length} 字符"))

    # Only level-two headings count: "##" must be followed by whitespace.
    heading_count = len(re.findall(r"^##\s+", body, re.M))
    if heading_count < 3:
        issues.append(ValidationIssue(path, "warning", "二级标题过少，可能未遵照标准结构"))

    return issues
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _read_markdown(path: Path) -> Tuple[Dict, str]:
|
|
67
|
+
text = path.read_text(encoding="utf-8")
|
|
68
|
+
match = re.match(r"^---\n(.*?)\n---\n(.*)$", text, re.S)
|
|
69
|
+
if not match:
|
|
70
|
+
raise ValueError("缺少 YAML Front Matter")
|
|
71
|
+
return yaml.safe_load(match.group(1)) or {}, match.group(2).strip()
|
|
72
|
+
|
package/writer.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import yaml
|
|
7
|
+
|
|
8
|
+
from schemas import KnowledgeItem
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def write_item(item: KnowledgeItem, output_dir: Path) -> Path:
    """Write ``item`` as a Markdown file inside ``output_dir``.

    The directory is created if needed (including parents). The file name is
    the item's ``kb_id`` passed through ``_safe_filename`` plus ``.md``; its
    content is ``render_markdown(item)`` encoded as UTF-8.

    Returns:
        The path of the file that was written.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    file_name = _safe_filename(item.kb_id) + ".md"
    target = output_dir / file_name
    target.write_text(render_markdown(item), encoding="utf-8")
    return target
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def render_markdown(item: KnowledgeItem) -> str:
    """Render ``item`` as Markdown text with a YAML front-matter header.

    ``item.metadata()`` is dumped as block-style YAML, keeping key order and
    non-ASCII characters, and placed between ``---`` lines. The stripped
    ``item.body`` follows after one blank line, and the text ends with a
    single newline.
    """
    dump_options = {
        "allow_unicode": True,
        "sort_keys": False,
        "default_flow_style": False,
    }
    front_matter = yaml.safe_dump(item.metadata(), **dump_options).strip()
    body = item.body.strip()
    return f"---\n{front_matter}\n---\n\n{body}\n"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _safe_filename(value: str) -> str:
|
|
29
|
+
value = value.lower().strip()
|
|
30
|
+
value = re.sub(r"[^a-z0-9._-]+", "-", value)
|
|
31
|
+
value = re.sub(r"-+", "-", value).strip("-")
|
|
32
|
+
return value or "kb-item"
|
|
33
|
+
|