npm - union_kb_ingest - Versions diffs - 1.0.0 - Mend

union_kb_ingest 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

package/README.md +100 -0
package/app_config.py +84 -0
package/approved/.gitkeep +1 -0
package/bin/union_kb_ingest +5 -0
package/config/config.yaml +8 -0
package/drafts/.gitkeep +1 -0
package/ingest.py +157 -0
package/input/.gitkeep +1 -0
package/input/pdf/.gitkeep +1 -0
package/input/word/.gitkeep +1 -0
package/normalizer.py +413 -0
package/package.json +27 -0
package/parsed/.gitkeep +1 -0
package/parser.py +287 -0
package/prompts/generate_kb_items.md +27 -0
package/prompts/联合运维知识库建立规范.md +272 -0
package/requirements.txt +9 -0
package/schemas.py +85 -0
package/splitter.py +127 -0
package/validator.py +72 -0
package/writer.py +33 -0

package/parser.py ADDED Viewed

@@ -0,0 +1,287 @@
+from __future__ import annotations
+import re
+import shutil
+import subprocess
+import tempfile
+from pathlib import Path
+from typing import List
+from schemas import ParsedBlock, ParsedDocument
+# Lower-case file suffixes (with the leading dot) accepted as ingest input.
+SUPPORTED_EXTENSIONS = {".pdf", ".docx", ".doc", ".md", ".markdown", ".txt"}
+def iter_input_files(input_path: Path) -> List[Path]:
+    if input_path.is_file():
+        return [input_path] if input_path.suffix.lower() in SUPPORTED_EXTENSIONS else []
+    files = [
+        p for p in input_path.rglob("*")
+        if p.is_file() and p.suffix.lower() in SUPPORTED_EXTENSIONS
+    ]
+    return sorted(files)
+def parse_document(path: Path) -> ParsedDocument:
+    suffix = path.suffix.lower()
+    if suffix == ".pdf":
+        markdown = _parse_pdf_text(path)
+    elif suffix == ".docx":
+        markdown = _parse_docx(path)
+    elif suffix == ".doc":
+        markdown = _parse_legacy_doc(path)
+    elif suffix in {".md", ".markdown", ".txt"}:
+        markdown = path.read_text(encoding="utf-8")
+    else:
+        raise ValueError(f"Unsupported file type: {path}")
+    blocks = _markdown_to_blocks(path.name, markdown)
+    return ParsedDocument(
+        source_path=path,
+        source_doc=path.name,
+        markdown=markdown,
+        blocks=blocks,
+    )
+def _parse_pdf_text(path: Path) -> str:
+    """Extract embedded PDF text without OCR, layout models, or remote artifacts."""
+    from docling_parse.pdf_parser import DoclingPdfParser
+    from docling_parse.pdf_parsers import DecodePageConfig
+    parser = DoclingPdfParser(loglevel="fatal")
+    document = parser.load(path_or_stream=path)
+    if document is None:
+        raise RuntimeError(f"docling-parse could not load PDF: {path}")
+    try:
+        config = DecodePageConfig()
+        config.keep_char_cells = True
+        config.keep_shapes = False
+        config.keep_bitmaps = False
+        config.create_word_cells = False
+        config.create_line_cells = True
+        config.enforce_same_font = True
+        lines: List[str] = []
+        for page_no in range(1, document.number_of_pages() + 1):
+            page = document.get_page(page_no, config=config)
+            lines.extend(cell.text.strip() for cell in page.textline_cells if cell.text.strip())
+        return _compact_pdf_lines(lines)
+    finally:
+        document.unload()
+def _compact_pdf_lines(lines: List[str]) -> str:
+    cleaned = [_normalize_pdf_line(line) for line in lines]
+    cleaned = [line for line in cleaned if line and not _is_noise_line(line)]
+    output: List[str] = []
+    paragraph = ""
+    for line in cleaned:
+        heading = _heading_markdown(line)
+        if heading:
+            if paragraph:
+                output.append(paragraph)
+                paragraph = ""
+            output.append(heading)
+            continue
+        if _should_keep_as_own_line(line):
+            if paragraph:
+                output.append(paragraph)
+                paragraph = ""
+            output.append(line)
+            continue
+        paragraph = _join_text(paragraph, line)
+    if paragraph:
+        output.append(paragraph)
+    return _separate_embedded_headings("\n\n".join(output).strip())
+def _normalize_pdf_line(line: str) -> str:
+    line = line.replace("\u00a0", " ")
+    line = line.replace("网联清算有限公司", "")
+    line = re.sub(r"[ \t]+", " ", line)
+    return line.strip()
+def _is_noise_line(line: str) -> bool:
+    if re.fullmatch(r"[-—–_·•\s]+", line):
+        return True
+    if re.fullmatch(r"\d{1,4}", line):
+        return True
+    if re.fullmatch(r"第\s*\d{1,4}\s*页(?:\s*/\s*共\s*\d{1,4}\s*页)?", line):
+        return True
+    if re.fullmatch(r"Page\s+\d{1,4}(?:\s+of\s+\d{1,4})?", line, re.I):
+        return True
+    if "目 录" in line or "目录" in line and "." * 3 in line:
+        return True
+    if line.count(".") >= 20:
+        return True
+    if re.search(r"[\x00-\x08\x0b-\x1f\x7f]", line):
+        return True
+    return False
+def _heading_markdown(line: str) -> str:
+    if line.startswith("#"):
+        return line
+    match = re.match(r"^(\d+(?:\.\d+){1,4})[、.．\s]*(.{2,100})$", line)
+    if match:
+        title = match.group(2).strip()
+        if not re.match(r"^[\u4e00-\u9fffA-Za-z]", title):
+            return ""
+        depth = 2 + min(match.group(1).count("."), 2)
+        return f"{'#' * depth} {line}"
+    match = re.match(r"^(第[一二三四五六七八九十百千万]+[章节条])[、.．\s]*(.{2,100})$", line)
+    if match:
+        return f"## {line}"
+    if re.match(r"^附\s*录\s*[A-ZＡ-Ｚ]\s*.{2,100}$", line):
+        return f"## {line}"
+    return ""
+def _separate_embedded_headings(text: str) -> str:
+    preface = text.find("前 言本指引")
+    if preface > 0:
+        text = text[preface:]
+    def split_numeric(match: re.Match) -> str:
+        start = match.start()
+        prev = text[max(0, start - 8):start]
+        if prev.endswith(("参见", "详见", "参考")) or re.search(r"[Vv]$", prev):
+            return match.group(1)
+        if not prev.endswith(("指引", "说明", "文件", "策略", "评价", "内容", "范围")):
+            return match.group(1)
+        return f"\n\n{match.group(1)}"
+    text = re.sub(
+        r"(?<![\d.])(\d+\.\d+(?:\.\d+){0,3})(?=[\u4e00-\u9fffA-Za-z])",
+        split_numeric,
+        text,
+    )
+    text = re.sub(
+        r"(?<!\n)(附\s*录\s*[A-ZＡ-Ｚ])(?=[\u4e00-\u9fffA-Za-z])",
+        r"\n\n\1",
+        text,
+    )
+    text = re.sub(r"\n{3,}", "\n\n", text)
+    return text.strip()
+def _should_keep_as_own_line(line: str) -> bool:
+    return bool(
+        re.match(r"^([（(]?[一二三四五六七八九十]+[）)]?|[A-Za-z]|\d+)[、.．)]\s+", line)
+        or re.search(r"\s{2,}", line)
+        or "\t" in line
+    )
+def _join_text(current: str, line: str) -> str:
+    if not current:
+        return line
+    if current.endswith(("。", "；", "：", ":", "？", "！", ".", ";", "?", "!")):
+        return f"{current}\n{line}"
+    if re.search(r"[A-Za-z0-9]$", current) and re.match(r"^[A-Za-z0-9]", line):
+        return f"{current} {line}"
+    return f"{current}{line}"
+def _parse_docx(path: Path) -> str:
+    from docling.datamodel.base_models import InputFormat
+    from docling.document_converter import DocumentConverter, WordFormatOption
+    converter = DocumentConverter(
+        allowed_formats=[InputFormat.DOCX],
+        format_options={InputFormat.DOCX: WordFormatOption()},
+    )
+    result = converter.convert(str(path))
+    return result.document.export_to_markdown()
+def _parse_legacy_doc(path: Path) -> str:
+    soffice = shutil.which("soffice") or shutil.which("libreoffice")
+    if not soffice:
+        raise RuntimeError(
+            "Legacy .doc parsing needs LibreOffice headless conversion. "
+            "Install LibreOffice and expose soffice in PATH, or convert the file to .docx first."
+        )
+    with tempfile.TemporaryDirectory(prefix="kb_ingest_doc_") as tmp:
+        tmp_dir = Path(tmp)
+        subprocess.run(
+            [
+                soffice,
+                "--headless",
+                "--convert-to",
+                "docx",
+                "--outdir",
+                str(tmp_dir),
+                str(path),
+            ],
+            check=True,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+        )
+        converted = tmp_dir / f"{path.stem}.docx"
+        if not converted.exists():
+            candidates = list(tmp_dir.glob("*.docx"))
+            if not candidates:
+                raise RuntimeError(f"LibreOffice did not produce docx for {path}")
+            converted = candidates[0]
+        return _parse_docx(converted)
+def _markdown_to_blocks(source_doc: str, markdown: str) -> List[ParsedBlock]:
+    normalized = _recover_headings(markdown)
+    pieces = re.split(r"\n(?=#{1,4}\s+)", normalized)
+    blocks: List[ParsedBlock] = []
+    for idx, piece in enumerate(p.strip() for p in pieces if p.strip()):
+        title = _first_heading(piece) or f"文档片段 {idx + 1}"
+        pages = [int(n) for n in re.findall(r"第\s*(\d+)\s*页", title)]
+        blocks.append(ParsedBlock(
+            source_doc=source_doc,
+            source_section=title,
+            content=piece,
+            pages=pages,
+            order=idx,
+        ))
+    if not blocks and normalized.strip():
+        blocks.append(ParsedBlock(source_doc=source_doc, source_section="全文", content=normalized.strip()))
+    return blocks
+def _recover_headings(markdown: str) -> str:
+    lines = []
+    heading_pattern = re.compile(r"^(\d+(?:\.\d+){1,4}|第[一二三四五六七八九十百]+[章节条]|附\s*录\s*[A-ZＡ-Ｚ])[\s、.．]*(.{2,80})$")
+    for raw in markdown.splitlines():
+        line = raw.strip()
+        match = heading_pattern.match(line)
+        if match and not line.startswith("#"):
+            title = match.group(2).strip()
+            if not re.match(r"^[\u4e00-\u9fffA-Za-z]", title):
+                lines.append(raw)
+                continue
+            depth = 2 + min(match.group(1).count("."), 2)
+            lines.append(f"{'#' * depth} {line}")
+        else:
+            lines.append(raw)
+    return "\n".join(lines)
+def _first_heading(text: str) -> str:
+    for line in text.splitlines():
+        match = re.match(r"^#{1,4}\s+(.+)$", line.strip())
+        if match:
+            return match.group(1).strip()
+    return ""

package/prompts/generate_kb_items.md ADDED Viewed

@@ -0,0 +1,27 @@
+你是联合运维知识库整理助手。
+请基于输入原文生成标准知识库条目，并严格参照 `prompts/联合运维知识库建立规范.md`。必须遵守：
+1. 只依据原文，不编造阈值、角色、日期、版本。
+2. 如果一个片段包含多个独立场景、规则、指标、处置策略，拆成多个 items。
+3. 每个 item 需要可独立检索、独立回答。
+4. 保留表格、阈值、比较符、单位、持续时间和适用对象。
+5. 输出严格 JSON，不要 Markdown 代码围栏。
+6. 不要依据预设业务关键词套写业务模块、角色、标签或风险等级，应根据原文语义判断；原文缺失时使用空数组或规范允许的默认值。
+输出格式：
+{
+  "items": [
+    {
+      "title": "",
+      "doc_type": "scenario",
+      "business_modules": [],
+      "source_version": "",
+      "risk_level": "low",
+      "applicable_roles": [],
+      "tags": [],
+      "body": "# 标题\n\n## 1. 适用范围\n\n...\n\n## 7. 来源依据\n\n..."
+    }
+  ]
+}

package/prompts/联合运维知识库建立规范.md ADDED Viewed

@@ -0,0 +1,272 @@
+# 联合运维知识库建立规范
+## 1. 建设目标
+联合运维知识库用于支撑运维智能助手在一期阶段完成“只读问答 + 查询类 Function Call”的能力建设。知识库应帮助系统准确识别用户问题所属的业务模块、运行场景、风险等级、处置策略、可用查询能力和引用依据。
+一期知识库的核心目标如下：
+1. 支持 RAG 检索识别业务场景和业务模块。
+2. 支持大模型基于权威资料回答运维制度、指标阈值、处置策略、故障定级等问题。
+3. 支持大模型理解可用的查询类 Function Call，但不直接执行变更类动作。
+4. 支持检索结果可追溯到原始文档、章节和版本。
+5. 支持后续扩展到诊断类、操作类和审批类能力。
+## 2. 文档分类
+知识库文档按内容类型分为以下几类：
+| 类型 | 编码前缀 | 说明 | 示例 |
+| --- | --- | --- | --- |
+| 业务模块 | `biz` | 描述业务范围、业务链路、参与方、上下游依赖 | 快捷支付、退款、付款 |
+| 运行场景 | `scenario` | 描述生产运行风险、业务异常、关闭渠道、恢复等场景 | 成功率下降、耗时升高 |
+| 处置策略 | `sop` | 描述不同角色在不同场景下应采取的动作 | 异常方排查、影响方降流 |
+| 指标规则 | `metric` | 描述指标定义、阈值、计算口径 | 系统成功率、平均耗时 |
+| 故障定级 | `severity` | 描述故障等级、触发条件、善后要求 | 轻微、一般、严重 |
+| 变更管理 | `change` | 描述生产变更评估、通知、异常处置、质量评价 | 变更影响评估 |
+| 函数说明 | `function` | 描述可调用函数的用途、参数、权限、返回值 | 查询告警、查询指标 |
+| 评价规则 | `evaluation` | 描述周期评价对象、周期、指标、权重和定级 | 周/月/年评价 |
+## 3. 单篇知识文档格式
+每个知识条目建议使用 Markdown 文件保存，并使用 YAML Front Matter 描述元数据。正文结构应稳定，便于切分、检索和展示。
+```markdown
+---
+kb_id: "scenario-netunion-runtime-risk-v1"
+title: "网络支付清算平台生产运行风险场景"
+doc_type: "scenario"
+domain: "联合运维"
+business_modules: ["签约", "解约", "快捷支付", "付款", "退款", "代收", "网关支付"]
+source_doc: "网络支付清算平台联合运维运行实践指引V1.0.md"
+source_version: "V1.0"
+source_section: "5.2.1 存在运行风险"
+effective_date: ""
+owner: "联合运维知识库"
+confidentiality: "内部"
+risk_level: "low"
+applicable_roles: ["网联平台", "异常方", "影响方", "成员单位"]
+tags: ["生产运行", "风险", "系统成功率", "平均耗时", "带宽"]
+status: "active"
+---
+# 标题
+## 1. 适用范围
+## 2. 触发条件
+## 3. 处置策略
+## 4. 关联指标
+## 5. 关联函数
+## 6. 检索提示
+## 7. 来源依据
+```
+## 4. 元数据规范
+| 字段 | 必填 | 说明 | 示例 |
+| --- | --- | --- | --- |
+| `kb_id` | 是 | 全局唯一 ID，建议使用英文、数字和短横线 | `severity-netunion-runtime-v1` |
+| `title` | 是 | 知识条目标题 | `运行故障定级规则` |
+| `doc_type` | 是 | 文档分类 | `severity` |
+| `domain` | 是 | 所属领域 | `联合运维` |
+| `business_modules` | 是 | 适用业务模块，可为空数组但字段必须存在 | `["快捷支付", "退款"]` |
+| `source_doc` | 是 | 来源文档文件名 | `网络支付清算平台联合运维运行实践指引V1.0.md` |
+| `source_version` | 是 | 来源版本 | `V1.0` |
+| `source_section` | 是 | 来源章节 | `5.3 运行故障定级` |
+| `effective_date` | 否 | 生效日期，未知可留空 | `2026-01-01` |
+| `owner` | 是 | 维护责任方 | `联合运维知识库` |
+| `confidentiality` | 是 | 密级 | `内部` |
+| `risk_level` | 是 | 知识对应的操作风险 | `low` |
+| `applicable_roles` | 是 | 适用角色 | `["异常方", "影响方"]` |
+| `tags` | 是 | 检索标签 | `["故障", "定级"]` |
+| `status` | 是 | 状态 | `active` |
+`risk_level` 建议枚举：
+| 值 | 含义 | 一期处理策略 |
+| --- | --- | --- |
+| `low` | 查询、解释、制度说明 | 可直接回答或调用只读函数 |
+| `medium` | 诊断建议、影响判断 | 可回答，可建议查询，不自动变更 |
+| `high` | 涉及降流、关停、重启、扩容、回滚 | 一期只给建议，不执行 |
+| `critical` | 涉及生产高危或不可逆动作 | 必须转人工审批 |
+## 5. 正文内容规范
+### 5.1 适用范围
+说明该知识条目适用于哪些业务、系统、角色和场景。范围要明确，避免出现“所有情况均适用”这类模糊表述。
+### 5.2 触发条件
+凡是涉及阈值、时间、比例、数量的内容，必须结构化表达。建议使用表格。
+示例：
+| 指标 | 大型单位 | 中型单位 | 小型单位 | 持续时间 | 是否决策指标 |
+| --- | --- | --- | --- | --- | --- |
+| 平均系统成功率 | `<99.99% 且系统失败笔数 >100` | `<99.99% 且系统失败笔数 >50` | `<99.99% 且系统失败笔数 >10` | `5min` | 是 |
+### 5.3 处置策略
+处置策略必须按角色拆分，至少区分：
+1. 平台方。
+2. 异常方。
+3. 影响方。
+4. 变更发起方。
+5. 变更影响方。
+动作应使用明确动词，例如“通知”“排查”“同步”“降流”“暂停发送”“灰度恢复”。涉及高风险动作时，必须标记“需人工确认”。
+### 5.4 关联指标
+指标必须引用统一指标口径。指标定义应尽量沉淀为独立 `metric` 条目，业务条目中只引用。
+常见指标包括：
+1. 系统请求数量。
+2. 系统失败数量。
+3. 业务失败数量。
+4. 系统成功率。
+5. 业务成功率。
+6. 平均耗时。
+7. 正常服务时间。
+8. 异常持续时间。
+### 5.5 关联函数
+一期只登记查询类函数。函数说明可以进入知识库，但执行权限由 Function Call 层控制。
+函数说明建议格式：
+```yaml
+function_name: "query_runtime_metrics"
+display_name: "查询运行指标"
+function_type: "read"
+description: "查询指定业务、机构、时间窗口内的系统成功率、业务成功率、平均耗时和交易量。"
+risk_level: "low"
+requires_confirmation: false
+required_permissions: ["metrics:read"]
+input_schema:
+  business_module: "业务模块，例如快捷支付、退款"
+  org_type: "单位分类，例如大型单位、中型单位、小型单位"
+  time_range: "查询时间范围"
+output_schema:
+  system_success_rate: "系统成功率"
+  business_success_rate: "业务成功率"
+  avg_latency_ms: "平均耗时"
+  request_count: "系统请求数量"
+  failed_count: "系统失败数量"
+```
+### 5.6 检索提示
+为提升 RAG 命中率，每篇知识条目应维护自然语言检索提示，包括用户可能的问法。
+示例：
+1. “系统成功率低于多少算运行风险？”
+2. “平均耗时超过 2 秒要怎么处理？”
+3. “大型单位异常交易多少笔需要升级？”
+### 5.7 来源依据
+必须记录来源文档、版本和章节。若为人工总结，应标明“基于来源章节归纳”，避免将推理内容伪装成原文。
+## 6. 内容切分规范
+为了兼顾召回准确性和上下文完整性，建议按“一个知识点一条文档”的方式切分。
+切分原则：
+1. 一个场景一条，例如“存在运行风险”“出现业务异常-联合处置”“出现业务异常-关闭渠道”。
+2. 一个规则一条，例如“运行故障定级”“故障善后处置”。
+3. 一个函数一条，例如“查询运行指标函数说明”。
+4. 单条正文建议控制在 800 到 1500 个中文字符，复杂表格类条目可适当放宽。
+5. 不要把整份制度原文作为一个向量文档直接入库。
+6. 表格必须保留表头，避免阈值脱离适用对象。
+7. 涉及“或”“且”“大于等于”“小于”等逻辑关系时，必须保留原始逻辑。
+## 7. 命名规范
+文件名建议格式：
+```text
+{doc_type}-{domain_or_system}-{topic}-v{version}.md
+```
+示例：
+```text
+scenario-netunion-runtime-risk-v1.md
+severity-netunion-runtime-classification-v1.md
+function-netunion-query-runtime-metrics-v1.md
+```
+中文标题可以放在 `title` 字段中，文件名建议使用英文，便于程序处理和跨平台同步。
+## 8. RAG 入库字段建议
+入库时建议至少写入以下字段：
+| 字段 | 来源 | 用途 |
+| --- | --- | --- |
+| `content` | Markdown 正文 | 向量检索 |
+| `title` | Front Matter | 展示和关键词检索 |
+| `doc_type` | Front Matter | 检索过滤 |
+| `business_modules` | Front Matter | 业务模块过滤 |
+| `tags` | Front Matter | 关键词召回 |
+| `source_doc` | Front Matter | 溯源 |
+| `source_section` | Front Matter | 溯源 |
+| `risk_level` | Front Matter | 工具调用安全控制 |
+| `applicable_roles` | Front Matter | 角色匹配 |
+| `status` | Front Matter | 过滤废弃条目 |
+## 9. 质量校验规范
+每条知识入库前应检查：
+1. 是否有唯一 `kb_id`。
+2. 是否能追溯到来源文档和章节。
+3. 是否包含适用范围。
+4. 阈值是否保留单位、比较符和持续时间。
+5. 处置策略是否按角色拆分。
+6. 是否标注风险等级。
+7. 是否存在过期或冲突内容。
+8. 是否包含检索提示。
+9. 是否避免把高风险操作描述成可自动执行。
+10. 是否与已有知识条目重复或冲突。
+## 10. 一期推荐知识库目录
+```text
+trainingDocs/
+  knowledgeBase/
+    联合运维知识库建立规范.md
+    samples/
+      scenario-netunion-runtime-risk-v1.md
+      scenario-netunion-business-exception-joint-handling-v1.md
+      scenario-netunion-business-exception-channel-close-v1.md
+      severity-netunion-runtime-classification-v1.md
+      metric-netunion-runtime-indicators-v1.md
+      function-netunion-query-runtime-metrics-v1.md
+```
+一期建议优先沉淀以下条目：
+1. 业务范围与单位分类。
+2. 生产运行风险场景。
+3. 业务异常联合处置场景。
+4. 业务异常关闭渠道场景。
+5. 处置升级策略。
+6. 恢复策略。
+7. 运行故障定级。
+8. 故障善后处置。
+9. 运行指标定义。
+10. 查询类函数说明。

package/requirements.txt ADDED Viewed

@@ -0,0 +1,9 @@
+# Offline-only optional dependencies. Do not add these to the app runtime unless needed.
+pyyaml>=6.0.1
+zhipuai>=2.1.0
+sniffio>=1.3.0
+# Docling slim plus only file-format backends used by this offline tool.
+# Do not install `docling` or `docling-slim[standard]` here: those pull OCR,
+# layout/table ML models, torch/onnxruntime, and may try to download artifacts.
+docling-slim[format-pdf-docling,format-docx,format-markdown]>=2.70.0; python_version >= "3.10"

package/schemas.py ADDED Viewed

@@ -0,0 +1,85 @@
+from __future__ import annotations
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Dict, List
+# Known values for a knowledge item's `doc_type` field.
+DOC_TYPES = {
+    "biz",
+    "scenario",
+    "sop",
+    "metric",
+    "severity",
+    "change",
+    "function",
+    "evaluation",
+}
+@dataclass
+class ParsedBlock:
+    """One section of a parsed source document."""
+    # Name of the source document the block came from.
+    source_doc: str
+    # Section title used to trace the block back to its source.
+    source_section: str
+    # Text of the section.
+    content: str
+    # Page numbers associated with the block; empty when none are known.
+    pages: List[int] = field(default_factory=list)
+    # Position of the block within its document; defaults to 0.
+    order: int = 0
+@dataclass
+class ParsedDocument:
+    """Result of parsing one input file."""
+    # Path of the file that was parsed.
+    source_path: Path
+    # Name of the source document.
+    source_doc: str
+    # Full Markdown text of the document.
+    markdown: str
+    # Blocks the document was split into.
+    blocks: List[ParsedBlock]
+@dataclass
+class KnowledgeItem:
+    kb_id: str
+    title: str
+    doc_type: str
+    domain: str
+    business_modules: List[str]
+    source_doc: str
+    source_version: str
+    source_section: str
+    effective_date: str
+    owner: str
+    confidentiality: str
+    risk_level: str
+    applicable_roles: List[str]
+    tags: List[str]
+    status: str
+    body: str
+    review_status: str = "pending"
+    source_trace: str = ""
+    def metadata(self) -> Dict:
+        return {
+            "kb_id": self.kb_id,
+            "title": self.title,
+            "doc_type": self.doc_type,
+            "domain": self.domain,
+            "business_modules": self.business_modules,
+            "source_doc": self.source_doc,
+            "source_version": self.source_version,
+            "source_section": self.source_section,
+            "effective_date": self.effective_date,
+            "owner": self.owner,
+            "confidentiality": self.confidentiality,
+            "risk_level": self.risk_level,
+            "applicable_roles": self.applicable_roles,
+            "tags": self.tags,
+            "status": self.status,
+            "review_status": self.review_status,
+            "source_trace": self.source_trace,
+        }
+@dataclass
+class ValidationIssue:
+    """A single finding reported for a file."""
+    # File the finding refers to.
+    path: Path
+    # Severity label -- NOTE(review): allowed values are not visible here; confirm against the validator.
+    level: str
+    # Human-readable description of the finding.
+    message: str