union_kb_ingest 1.0.3 → 1.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/config/config.yaml +1 -1
- package/ingest.py +11 -4
- package/normalizer.py +502 -49
- package/package.json +1 -1
- package/parser.py +291 -6
- package/prompts/generate_kb_items.md +2 -2
- package/prompts/{联合运维知识库建立规范.md → 知识库建立规范.md} +9 -3
- package/schemas.py +15 -2
- package/splitter.py +14 -0
- package/validator.py +60 -2
package/README.md
CHANGED
|
@@ -61,7 +61,7 @@ python ingest.py validate
|
|
|
61
61
|
|
|
62
62
|
`draft` 默认按 `config/config.yaml` 的 `draft.max_chars` 控制单次送入模型的原文长度,并额外提供文档目录和相邻片段摘要作为辅助上下文。这样可以降低私有模型单轮负载,同时尽量保留前后章节关系。命令行仍可用 `--max-chars` 临时覆盖。
|
|
63
63
|
|
|
64
|
-
每条知识库文件会写入分类画像元数据:`category`、`
|
|
64
|
+
每条知识库文件会写入分类画像元数据:`category`、`subcategory`、`category_keywords` 和 `related_items`。这些字段优先来自源文件一级标题、首页标题、章节目录、文件名、当前小类正文和关联小类语义,用于标识知识大类、小类、关键词和条目间关系。后续 RAG 入库和检索时,应把这些字段写入向量库 metadata,并用于分类过滤、查询路由或重排加权,降低不同场景之间因为相似词命中而串场的概率。
|
|
65
65
|
|
|
66
66
|
生成结果会按原始输入遍历顺序写入 `source_order`,并用 `000001-...md` 这样的文件名前缀保持目录排序与原文从上到下的顺序一致。页码只写入 Front Matter 的 `source_pages`/`source_trace` 和正文 `## 5. 来源依据`,不会进入正文 `## 1. 核心内容` 到 `## 4. 关联能力`。
|
|
67
67
|
|
package/config/config.yaml
CHANGED
package/ingest.py
CHANGED
|
@@ -118,11 +118,11 @@ def _attach_block_context(
|
|
|
118
118
|
parts = []
|
|
119
119
|
if outline:
|
|
120
120
|
parts.append(f"文档章节目录:\n{outline}")
|
|
121
|
-
if block.
|
|
121
|
+
if block.category or block.subcategory or block.category_keywords:
|
|
122
122
|
parts.append(
|
|
123
|
-
"
|
|
124
|
-
f"
|
|
125
|
-
f"
|
|
123
|
+
"知识分类:\n"
|
|
124
|
+
f"大类标题:{block.category}\n"
|
|
125
|
+
f"小类标题:{block.subcategory}\n"
|
|
126
126
|
f"关键词:{', '.join(block.category_keywords)}"
|
|
127
127
|
)
|
|
128
128
|
if idx > 0:
|
|
@@ -147,6 +147,13 @@ def _attach_block_context(
|
|
|
147
147
|
category=block.category,
|
|
148
148
|
category_description=block.category_description,
|
|
149
149
|
category_keywords=block.category_keywords,
|
|
150
|
+
source_doc_description=block.source_doc_description,
|
|
151
|
+
subcategory=block.subcategory,
|
|
152
|
+
subcategory_description=block.subcategory_description,
|
|
153
|
+
category_path=block.category_path,
|
|
154
|
+
related_categories=block.related_categories,
|
|
155
|
+
relation_notes=block.relation_notes,
|
|
156
|
+
related_items=block.related_items,
|
|
150
157
|
))
|
|
151
158
|
return output
|
|
152
159
|
|