npm - union_kb_ingest - Versions diffs - 1.0.0 → 1.0.1 - Mend

union_kb_ingest 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/config/config.yaml CHANGED Viewed

@@ -1,8 +1,8 @@
 llm:
-  enabled: false
+  enabled: true
   timeout_seconds: 120
   max_tokens: 4096
   temperature: 0.1
-  api_key: "your-api-key"          # API Key
-  model: "your-model"              # 模型名称
-  base_url: "https://your-model-endpoint"  # 完整模型调用地址
+  api_key: "15f066c4509845038027ea5746524af5.w4CLSC6ODiKVC1wK"          # GLM API Key
+  model: "glm-4.7-flash"                 # 模型名称，可选: glm-4, glm-4-plus, glm-4-flash 等
+  base_url: "https://open.bigmodel.cn/api/paas/v4/"  # 模型服务地址

package/normalizer.py CHANGED Viewed

@@ -53,13 +53,22 @@ def _normalize_with_llm(block: ParsedBlock, status: str) -> List[KnowledgeItem]:
             response = client.chat.completions.create(
                 model=config.model,
                 messages=[
-                    {"role": "system", "content": "你是严谨的运维知识库整理助手，只能依据输入原文生成结构化知识条目。"},
+                    {
+                        "role": "system",
+                        "content": (
+                            "你是严谨的运维知识库整理助手，只能依据输入原文生成结构化知识条目。"
+                            "你必须只返回一个 JSON object，根节点必须只有 items 字段，"
+                            "且 items 必须是数组。不要返回 Markdown、解释文字或其他根字段。"
+                        ),
+                    },
                     {"role": "user", "content": prompt},
                 ],
+                stream=False,
                 max_tokens=config.max_tokens,
                 temperature=config.temperature,
+                do_sample=False,
                 response_format={"type": "json_object"},
-                thinking={"type": "disabled"},
+                thinking={"type": "disabled", "clear_thinking": True},
             )
         except Exception as exc:
             elapsed = time.monotonic() - started_at
@@ -94,7 +103,10 @@ def _normalize_with_llm(block: ParsedBlock, status: str) -> List[KnowledgeItem]:
         raw_items = parsed.get("items") if isinstance(parsed, dict) else parsed
         if not isinstance(raw_items, list):
-            print("llm parse failed: JSON does not contain an items list")
+            print(
+                "llm parse failed: JSON does not contain an items list "
+                f"top_level={_json_shape(parsed)} preview={_preview(content)}"
+            )
             if attempt >= LLM_MAX_RETRIES:
                 _abort_llm("JSON does not contain an items list after 10 attempts", block)
             time.sleep(min(2 ** (attempt - 1), 30))
@@ -116,6 +128,13 @@ def _normalize_with_llm(block: ParsedBlock, status: str) -> List[KnowledgeItem]:
 def _get_zhipu_client_class():
+    try:
+        from zai import ZaiClient
+        return ZaiClient
+    except (ImportError, AttributeError):
+        pass
     try:
         from zai import ZhipuAiClient
@@ -177,10 +196,15 @@ def _first_choice(response):
 def _response_debug(response) -> str:
-    text = repr(response)
-    if len(text) > 300:
-        text = text[:300] + "..."
-    return _preview(text, limit=300)
+    return _preview(repr(response))
+def _json_shape(value) -> str:
+    if isinstance(value, dict):
+        return f"object keys={list(value.keys())[:10]}"
+    if isinstance(value, list):
+        return f"array len={len(value)}"
+    return type(value).__name__
 def _abort_llm(message: str, block: ParsedBlock) -> None:
@@ -205,6 +229,7 @@ def _build_prompt(block: ParsedBlock, status: str) -> str:
 6. 涉及表格、阈值、比较符、单位、持续时间、笔数、适用对象时必须保留原始逻辑。
 7. 输出严格 JSON 对象，不要 Markdown 代码围栏，不要解释文字，不要在 JSON 前后添加任何内容。
 8. status 固定为 "{status}"。
+9. JSON 根节点必须严格为一个对象：{{"items": [...]}}。禁止返回单个 item 对象、禁止返回纯数组、禁止返回 result/data/records/knowledge_items 等其他根字段。
 doc_type 只能取：
 {", ".join(sorted(DOC_TYPES))}
@@ -281,8 +306,8 @@ def _json_candidates(text: str) -> List[str]:
     return candidates
-def _preview(text: str, limit: int = 500) -> str:
-    return re.sub(r"\s+", " ", text).strip()[:limit]
+def _preview(text: str) -> str:
+    return re.sub(r"\s+", " ", text).strip()
 def _normalize_heuristically(block: ParsedBlock, status: str) -> KnowledgeItem:

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "union_kb_ingest",
-  "version": "1.0.0",
+  "version": "1.0.1",
   "description": "Offline knowledge-base ingest helper for PDF, Word, Markdown and TXT documents.",
   "bin": {
     "union_kb_ingest": "bin/union_kb_ingest"

package/requirements.txt CHANGED Viewed

@@ -3,6 +3,9 @@ pyyaml>=6.0.1
 zhipuai>=2.1.0
 sniffio>=1.3.0
+# Pin numpy to avoid incompatible x86-64-v2 wheels on older offline machines.
+numpy==1.26.4
 # Docling slim plus only file-format backends used by this offline tool.
 # Do not install `docling` or `docling-slim[standard]` here: those pull OCR,
 # layout/table ML models, torch/onnxruntime, and may try to download artifacts.