union_kb_ingest 1.0.5 → 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/normalizer.py +61 -4
- package/package.json +1 -1
- package/schemas.py +1 -0
package/normalizer.py
CHANGED
|
@@ -156,8 +156,12 @@ def _normalize_with_llm(block: ParsedBlock, status: str) -> List[KnowledgeItem]:
|
|
|
156
156
|
f"preview={_preview(';'.join(coverage_issues[:3]))}"
|
|
157
157
|
)
|
|
158
158
|
if attempt >= LLM_MAX_RETRIES:
|
|
159
|
-
|
|
160
|
-
|
|
159
|
+
print(
|
|
160
|
+
"WARNING: source fact coverage failed after "
|
|
161
|
+
f"{LLM_MAX_RETRIES} attempts; releasing draft for manual review"
|
|
162
|
+
)
|
|
163
|
+
return _items_with_coverage_warning(items, block, coverage_issues)
|
|
164
|
+
coverage_retry_feedback = _coverage_retry_prompt(block, coverage_issues, items)
|
|
161
165
|
time.sleep(min(2 ** (attempt - 1), 30))
|
|
162
166
|
continue
|
|
163
167
|
return items
|
|
@@ -195,14 +199,31 @@ def _compact_retry_prompt(base_prompt: str) -> str:
|
|
|
195
199
|
)
|
|
196
200
|
|
|
197
201
|
|
|
198
|
-
def _coverage_retry_prompt(
|
|
202
|
+
def _coverage_retry_prompt(
    block: ParsedBlock,
    missing_facts: List[str],
    items: List[KnowledgeItem],
) -> str:
    """Build the retry prompt used when source-fact coverage is insufficient.

    The prompt lists at most the first 8 missing facts, then the source
    document and section, an excerpt of the block content (capped at 4000
    characters), an excerpt of the block context (capped at 2000 characters)
    and the core sections of the previously generated items (capped at 3000
    characters).  Any excerpt that ends up empty is replaced by "无".

    Args:
        block: Parsed block being normalized; ``content``, ``context``,
            ``source_doc`` and ``source_section`` are read.
        missing_facts: Source facts the previous output did not cover.
        items: Items produced by the previous attempt; only ``body`` is read.

    Returns:
        The complete retry prompt text.
    """
    fact_bullets = "\n".join([f"- {fact}" for fact in missing_facts[:8]])

    source_text = _preview(block.content)[:4000]
    if not source_text:
        source_text = "无"

    context_text = _preview(block.context)[:2000]
    if not context_text:
        context_text = "无"

    core_sections = [_core_sections_for_coverage(item.body) for item in items]
    previous_core = _preview("\n\n".join(core_sections))[:3000]
    if not previous_core:
        previous_core = "无"

    segments = [
        "重试补充要求:上一次输出遗漏了以下来源正文事实。",
        "请重新生成 JSON,并把这些事实写入 ## 1. 核心内容、## 2. 适用边界 或 ## 3. 使用要求。",
        "短定义句、简称句、阈值句和规则句应优先保留原句或等价完整表述,不要只概括关键词。\n",
        f"{fact_bullets}\n\n",
        f"来源文档:{block.source_doc}\n",
        f"来源章节:{block.source_section or '全文'}\n\n",
        "当前来源原文片段如下,请结合上下文判断缺失事实应补入哪个正文小节:\n",
        f"{source_text}\n\n",
        "辅助上下文如下。辅助上下文只用于理解位置和术语,不要把其中独有事实写入正文:\n",
        f"{context_text}\n\n",
        "上一轮生成的核心正文如下,请在此基础上补齐遗漏事实并保持 JSON 根结构不变:\n",
        f"{previous_core}",
    ]
    return "".join(segments)
|
|
207
228
|
|
|
208
229
|
|
|
@@ -719,6 +740,42 @@ def _normalize_heuristically(block: ParsedBlock, status: str) -> KnowledgeItem:
|
|
|
719
740
|
)
|
|
720
741
|
|
|
721
742
|
|
|
743
|
+
def _items_with_coverage_warning(
    items: List[KnowledgeItem],
    block: ParsedBlock,
    missing_facts: List[str],
) -> List[KnowledgeItem]:
    """Downgrade under-covered LLM results to drafts that need manual review.

    Every item is modified in place: its status becomes "draft", its review
    status becomes "coverage_warning", the "需人工复核" tag is added when
    absent, and its body is rewritten by ``_body_with_coverage_warning``.

    Args:
        items: Items produced by the LLM; mutated in place.
        block: Parsed source block the items were generated from.
        missing_facts: Source facts that failed the coverage check.

    Returns:
        The same ``items`` list that was passed in.
    """
    review_tag = "需人工复核"
    for entry in items:
        entry.status = "draft"
        entry.review_status = "coverage_warning"
        already_tagged = review_tag in entry.tags
        if not already_tagged:
            entry.tags.append(review_tag)
        entry.body = _body_with_coverage_warning(entry.body, block, missing_facts)
    return items
|
|
756
|
+
|
|
757
|
+
|
|
758
|
+
def _body_with_coverage_warning(
    body: str,
    block: ParsedBlock,
    missing_facts: List[str],
) -> str:
    """Annotate an item body with the source facts that failed coverage.

    The warning is a Markdown blockquote naming the source document, the
    source section ("全文" when the block has no section) and at most the
    first 20 entries of ``missing_facts``.  It is inserted directly after the
    first ``## 1. 核心内容`` heading; when the body has no such heading it is
    appended under a new ``## 人工复核提示`` heading instead.

    Args:
        body: Markdown body of the knowledge item.
        block: Parsed source block; ``source_doc`` and ``source_section``
            are read.
        missing_facts: Source facts that still need to be added or confirmed.

    Returns:
        The body with the warning note inserted.
    """
    fact_lines = "\n".join(f"> - {fact}" for fact in missing_facts[:20])
    note = (
        "\n\n"
        "> WARNING: LLM 多次重试后仍未通过来源事实覆盖校验,本条目已降为草稿,需人工复核。\n"
        f"> 来源文档:{block.source_doc}\n"
        f"> 来源章节:{block.source_section or '全文'}\n"
        "> 待人工补齐或确认的来源事实:\n"
        f"{fact_lines}"
    )
    pattern = r"(^##\s+1\.\s+核心内容\s*$)"

    def _append_note(match):
        # ``note`` carries raw source text.  Passing it as a string template
        # would make ``re`` interpret its backslashes (``\d`` raises
        # ``re.error``, ``\1``/``\n`` are silently rewritten); a callable
        # replacement inserts the text verbatim.
        return match.group(1) + note

    annotated, replaced = re.subn(pattern, _append_note, body, count=1, flags=re.M)
    if replaced:
        return annotated
    return f"{body.rstrip()}\n\n## 人工复核提示{note}"
|
|
777
|
+
|
|
778
|
+
|
|
722
779
|
def _source_fact_coverage_issues(block: ParsedBlock, items: List[KnowledgeItem]) -> List[str]:
|
|
723
780
|
"""检查来源事实是否进入核心正文。"""
|
|
724
781
|
facts = _source_fact_sentences(block.content)
|
package/package.json
CHANGED