npm - union_kb_ingest - Versions diffs - 1.0.3 → 1.0.5 - Mend

union_kb_ingest 1.0.3 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

package/README.md +1 -1
package/app_config.py +9 -0
package/config/config.yaml +2 -2
package/ingest.py +19 -13
package/normalizer.py +591 -104
package/package.json +1 -1
package/parser.py +311 -13
package/prompts/generate_kb_items.md +2 -2
package/prompts/{联合运维知识库建立规范.md → 知识库建立规范.md} +9 -3
package/schemas.py +20 -2
package/splitter.py +13 -15
package/validator.py +70 -2
package/writer.py +4 -0

package/README.md CHANGED Viewed

@@ -61,7 +61,7 @@ python ingest.py validate
 `draft` 默认按 `config/config.yaml` 的 `draft.max_chars` 控制单次送入模型的原文长度，并额外提供文档目录和相邻片段摘要作为辅助上下文。这样可以降低私有模型单轮负载，同时尽量保留前后章节关系。命令行仍可用 `--max-chars` 临时覆盖。
-每条知识库文件会写入分类画像元数据：`category`、`category_description` 和 `category_keywords`。这些字段优先来自源文件一级标题、首页标题、章节目录和文件名，用于标识一个批次/业务场景的大类。后续 RAG 入库和检索时，应把这些字段写入向量库 metadata，并用于分类过滤、查询路由或重排加权，降低不同场景之间因为相似词命中而串场的概率。
+每条知识库文件会写入分类画像元数据：`category`、`subcategory`、`category_keywords` 和 `related_items`。这些字段优先来自源文件一级标题、首页标题、章节目录、文件名、当前小类正文和关联小类语义，用于标识知识大类、小类、关键词和条目间关系。后续 RAG 入库和检索时，应把这些字段写入向量库 metadata，并用于分类过滤、查询路由或重排加权，降低不同场景之间因为相似词命中而串场的概率。
 生成结果会按原始输入遍历顺序写入 `source_order`，并用 `000001-...md` 这样的文件名前缀保持目录排序与原文从上到下的顺序一致。页码只写入 Front Matter 的 `source_pages`/`source_trace` 和正文 `## 5. 来源依据`，不会进入正文 `## 1. 核心内容` 到 `## 4. 关联能力`。

package/app_config.py CHANGED Viewed

@@ -15,6 +15,7 @@ DEFAULT_CONFIG_PATH = CURRENT_DIR / "config" / "config.yaml"
 @dataclass(frozen=True)
 class LlmConfig:
+    """LLM 调用配置，支持配置文件和环境变量覆盖。"""
     enabled: bool = False
     base_url: str = ""
     api_key: str = ""
@@ -26,6 +27,7 @@ class LlmConfig:
 @dataclass(frozen=True)
 class DraftConfig:
+    """草稿生成阶段的切分和上下文配置。"""
     max_chars: int = 3600
     context_chars: int = 800
     outline_max_sections: int = 40
@@ -33,6 +35,7 @@ class DraftConfig:
 @lru_cache(maxsize=1)
 def get_llm_config() -> LlmConfig:
+    """读取并合并 LLM 配置。"""
     raw = _read_config().get("llm", {})
     if not isinstance(raw, dict):
         raw = {}
@@ -50,6 +53,7 @@ def get_llm_config() -> LlmConfig:
 @lru_cache(maxsize=1)
 def get_draft_config() -> DraftConfig:
+    """读取并合并草稿生成配置。"""
     raw = _read_config().get("draft", {})
     if not isinstance(raw, dict):
         raw = {}
@@ -62,6 +66,7 @@ def get_draft_config() -> DraftConfig:
 def _read_config() -> Dict[str, Any]:
+    """读取 YAML 配置文件并返回字典。"""
     path = Path(os.environ.get("KB_INGEST_CONFIG", DEFAULT_CONFIG_PATH))
     if not path.exists():
         return {}
@@ -70,6 +75,7 @@ def _read_config() -> Dict[str, Any]:
 def _env_bool(name: str, default: bool) -> bool:
+    """读取布尔环境变量并回退到默认值。"""
     value = os.environ.get(name)
     if value is None:
         return default
@@ -77,6 +83,7 @@ def _env_bool(name: str, default: bool) -> bool:
 def _env_int(name: str, value: Any, default: int) -> int:
+    """读取整数环境变量并回退到默认值。"""
     raw = os.environ.get(name, value)
     try:
         return int(raw)
@@ -85,6 +92,7 @@ def _env_int(name: str, value: Any, default: int) -> int:
 def _env_float(name: str, value: Any, default: float) -> float:
+    """读取浮点环境变量并回退到默认值。"""
     raw = os.environ.get(name, value)
     try:
         return float(raw)
@@ -93,6 +101,7 @@ def _env_float(name: str, value: Any, default: float) -> float:
 def _as_bool(value: Any, default: bool) -> bool:
+    """把配置值转换为布尔值。"""
     if isinstance(value, bool):
         return value
     if isinstance(value, str):

package/config/config.yaml CHANGED Viewed

@@ -1,9 +1,9 @@
 llm:
-  enabled: false
+  enabled: true
   timeout_seconds: 120
   max_tokens: 8192
   temperature: 0.1
-  api_key: "your-zhipu-api-key"
+  api_key: "15f066c4509845038027ea5746524af5.w4CLSC6ODiKVC1wK"
   model: "GLM-4.7-Flash"
   base_url: "https://open.bigmodel.cn/api/paas/v4/"

package/ingest.py CHANGED Viewed

@@ -3,6 +3,7 @@ from __future__ import annotations
 import argparse
 import sys
+from dataclasses import replace
 from pathlib import Path
 from typing import List
@@ -23,6 +24,7 @@ IGNORED_EXISTING_FILES = {".gitkeep", ".DS_Store"}
 def cmd_parse(args) -> int:
+    """执行解析子命令。"""
     input_path = Path(args.input)
     output_dir = Path(args.output)
     output_dir.mkdir(parents=True, exist_ok=True)
@@ -38,6 +40,7 @@ def cmd_parse(args) -> int:
 def cmd_draft(args) -> int:
+    """执行草稿生成子命令。"""
     input_path = Path(args.input)
     output_dir = Path(args.output)
@@ -78,6 +81,7 @@ def cmd_draft(args) -> int:
 def _list_effective_files(path: Path) -> list[Path]:
+    """列出目录下需要考虑的已有文件。"""
     if not path.exists():
         return []
     return sorted(
@@ -90,6 +94,7 @@ def _confirm_overwrite(
     output_dir: Path,
     existing: list[Path],
 ) -> bool:
+    """询问用户是否覆盖已有生成文件。"""
     print(f"found {len(existing)} existing file(s) in {output_dir}.")
     print("Continuing will delete existing generated files under:")
     print(f"- {output_dir}")
@@ -98,6 +103,7 @@ def _confirm_overwrite(
 def _clear_generated_files(*dirs: Path) -> None:
+    """删除指定目录下的已有生成文件。"""
     for directory in dirs:
         for path in _list_effective_files(directory):
             path.unlink()
@@ -109,6 +115,7 @@ def _attach_block_context(
     context_chars: int,
     outline_max_sections: int,
 ) -> List[ParsedBlock]:
+    """为片段附加目录和邻近片段上下文。"""
     if context_chars <= 0:
         return blocks
@@ -118,11 +125,11 @@ def _attach_block_context(
         parts = []
         if outline:
             parts.append(f"文档章节目录：\n{outline}")
-        if block.category_description:
+        if block.category or block.subcategory or block.category_keywords:
             parts.append(
-                "知识大类说明：\n"
-                f"大类：{block.category}\n"
-                f"说明：{block.category_description}\n"
+                "知识分类：\n"
+                f"大类标题：{block.category}\n"
+                f"小类标题：{block.subcategory}\n"
                 f"关键词：{', '.join(block.category_keywords)}"
             )
         if idx > 0:
@@ -137,21 +144,15 @@ def _attach_block_context(
                 f"章节：{blocks[idx + 1].source_section}\n"
                 f"{_compact_context_text(blocks[idx + 1].content, context_chars // 2)}"
             )
-        output.append(ParsedBlock(
-            source_doc=block.source_doc,
-            source_section=block.source_section,
-            content=block.content,
-            pages=block.pages,
-            order=block.order,
+        output.append(replace(
+            block,
             context="\n\n".join(parts),
-            category=block.category,
-            category_description=block.category_description,
-            category_keywords=block.category_keywords,
         ))
     return output
 def _document_outline(blocks: List[ParsedBlock], max_sections: int) -> str:
+    """生成文档片段目录摘要。"""
     sections = []
     seen = set()
     for block in blocks:
@@ -169,6 +170,7 @@ def _document_outline(blocks: List[ParsedBlock], max_sections: int) -> str:
 def _compact_context_text(text: str, limit: int) -> str:
+    """压缩上下文文本到指定长度。"""
     compact = " ".join(text.split())
     if limit <= 0 or len(compact) <= limit:
         return compact
@@ -176,6 +178,7 @@ def _compact_context_text(text: str, limit: int) -> str:
 def _source_trace(block: ParsedBlock) -> str:
+    """生成来源章节和页码追踪信息。"""
     parts = [f"section={block.source_section}"]
     if block.pages:
         parts.append(f"pages={','.join(map(str, sorted(set(block.pages))))}")
@@ -183,6 +186,7 @@ def _source_trace(block: ParsedBlock) -> str:
 def cmd_validate(args) -> int:
+    """执行校验子命令。"""
     issues = validate_dir(Path(args.input))
     for issue in issues:
         print(f"{issue.level}: {issue.path}: {issue.message}")
@@ -191,6 +195,7 @@ def cmd_validate(args) -> int:
 def build_parser() -> argparse.ArgumentParser:
+    """构建命令行参数解析器。"""
     parser = argparse.ArgumentParser(description="Offline document-to-knowledge Markdown generator.")
     sub = parser.add_subparsers(dest="command", required=True)
@@ -214,6 +219,7 @@ def build_parser() -> argparse.ArgumentParser:
 def main() -> int:
+    """命令行入口。"""
     parser = build_parser()
     args = parser.parse_args()
     return args.func(args)