union_kb_ingest 1.0.5 → 1.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/normalizer.py CHANGED
@@ -156,8 +156,12 @@ def _normalize_with_llm(block: ParsedBlock, status: str) -> List[KnowledgeItem]:
156
156
  f"preview={_preview(';'.join(coverage_issues[:3]))}"
157
157
  )
158
158
  if attempt >= LLM_MAX_RETRIES:
159
- _abort_llm("source fact coverage failed after 10 attempts", block)
160
- coverage_retry_feedback = _coverage_retry_prompt(coverage_issues)
159
+ print(
160
+ "WARNING: source fact coverage failed after "
161
+ f"{LLM_MAX_RETRIES} attempts; releasing draft for manual review"
162
+ )
163
+ return _items_with_coverage_warning(items, block, coverage_issues)
164
+ coverage_retry_feedback = _coverage_retry_prompt(block, coverage_issues, items)
161
165
  time.sleep(min(2 ** (attempt - 1), 30))
162
166
  continue
163
167
  return items
@@ -195,14 +199,31 @@ def _compact_retry_prompt(base_prompt: str) -> str:
195
199
  )
196
200
 
197
201
 
198
- def _coverage_retry_prompt(missing_facts: List[str]) -> str:
202
def _coverage_retry_prompt(
    block: ParsedBlock,
    missing_facts: List[str],
    items: List[KnowledgeItem],
) -> str:
    """Build the retry prompt used when source-fact coverage is insufficient.

    The prompt lists up to eight missing facts, then appends the source
    document/section labels, an excerpt of the source content, an excerpt of
    the auxiliary context, and the core sections produced by the previous
    attempt, so the next attempt can patch the gaps.

    Args:
        block: Parsed source block; ``content``, ``context``, ``source_doc``
            and ``source_section`` are read.
        missing_facts: Source facts missing from the previous output; only
            the first eight are included.
        items: Knowledge items produced by the previous attempt.

    Returns:
        The retry instruction text to add to the next LLM request.
    """
    fact_bullets = "\n".join(f"- {fact}" for fact in missing_facts[:8])

    # NOTE(review): _preview is also used for log previews; if it already
    # truncates its input, the slice limits below never take effect - confirm.
    source_excerpt = _preview(block.content)[:4000] or "无"
    context_excerpt = _preview(block.context)[:2000] or "无"

    previous_bodies = [_core_sections_for_coverage(item.body) for item in items]
    current_core = _preview("\n\n".join(previous_bodies))[:3000] or "无"

    # An empty section label means the block covers the whole document.
    section_label = block.source_section or "全文"

    prompt_parts = [
        "重试补充要求:上一次输出遗漏了以下来源正文事实。",
        "请重新生成 JSON,并把这些事实写入 ## 1. 核心内容、## 2. 适用边界 或 ## 3. 使用要求。",
        "短定义句、简称句、阈值句和规则句应优先保留原句或等价完整表述,不要只概括关键词。\n",
        f"{fact_bullets}\n\n",
        f"来源文档:{block.source_doc}\n",
        f"来源章节:{section_label}\n\n",
        "当前来源原文片段如下,请结合上下文判断缺失事实应补入哪个正文小节:\n",
        f"{source_excerpt}\n\n",
        "辅助上下文如下。辅助上下文只用于理解位置和术语,不要把其中独有事实写入正文:\n",
        f"{context_excerpt}\n\n",
        "上一轮生成的核心正文如下,请在此基础上补齐遗漏事实并保持 JSON 根结构不变:\n",
        f"{current_core}",
    ]
    return "".join(prompt_parts)
207
228
 
208
229
 
@@ -719,6 +740,42 @@ def _normalize_heuristically(block: ParsedBlock, status: str) -> KnowledgeItem:
719
740
  )
720
741
 
721
742
 
743
def _items_with_coverage_warning(
    items: List[KnowledgeItem],
    block: ParsedBlock,
    missing_facts: List[str],
) -> List[KnowledgeItem]:
    """Downgrade under-covered LLM results to drafts that need manual review.

    Every item is mutated in place: ``status`` becomes ``"draft"``,
    ``review_status`` becomes ``"coverage_warning"``, the manual-review tag is
    added when absent, and the body is rewritten by
    ``_body_with_coverage_warning``.

    Args:
        items: Knowledge items produced by the LLM.
        block: Parsed source block the items were generated from.
        missing_facts: Source facts that failed the coverage check.

    Returns:
        The same ``items`` list, after mutation.
    """
    review_tag = "需人工复核"
    for entry in items:
        entry.status, entry.review_status = "draft", "coverage_warning"
        # Keep the tag list free of duplicates.
        if review_tag not in entry.tags:
            entry.tags.append(review_tag)
        entry.body = _body_with_coverage_warning(entry.body, block, missing_facts)
    return items
756
+
757
+
758
+ def _body_with_coverage_warning(
759
+ body: str,
760
+ block: ParsedBlock,
761
+ missing_facts: List[str],
762
+ ) -> str:
763
+ """在正文核心章节标注覆盖不足的来源事实。"""
764
+ fact_lines = "\n".join(f"> - {fact}" for fact in missing_facts[:20])
765
+ note = (
766
+ "\n\n"
767
+ "> WARNING: LLM 多次重试后仍未通过来源事实覆盖校验,本条目已降为草稿,需人工复核。\n"
768
+ f"> 来源文档:{block.source_doc}\n"
769
+ f"> 来源章节:{block.source_section or '全文'}\n"
770
+ "> 待人工补齐或确认的来源事实:\n"
771
+ f"{fact_lines}"
772
+ )
773
+ pattern = r"(^##\s+1\.\s+核心内容\s*$)"
774
+ if re.search(pattern, body, flags=re.M):
775
+ return re.sub(pattern, r"\1" + note, body, count=1, flags=re.M)
776
+ return f"{body.rstrip()}\n\n## 人工复核提示{note}"
777
+
778
+
722
779
  def _source_fact_coverage_issues(block: ParsedBlock, items: List[KnowledgeItem]) -> List[str]:
723
780
  """检查来源事实是否进入核心正文。"""
724
781
  facts = _source_fact_sentences(block.content)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "union_kb_ingest",
3
- "version": "1.0.5",
3
+ "version": "1.0.6",
4
4
  "description": "Offline knowledge-base ingest helper for PDF, Word, Markdown and TXT documents.",
5
5
  "bin": {
6
6
  "union_kb_ingest": "bin/union_kb_ingest"
package/schemas.py CHANGED
@@ -104,6 +104,7 @@ class KnowledgeItem:
104
104
  "applicable_roles": self.applicable_roles,
105
105
  "tags": self.tags,
106
106
  "status": self.status,
107
+ "review_status": self.review_status,
107
108
  }
108
109
 
109
110