union_kb_ingest 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,8 @@
1
1
  llm:
2
- enabled: false
2
+ enabled: true
3
3
  timeout_seconds: 120
4
4
  max_tokens: 4096
5
5
  temperature: 0.1
6
- api_key: "your-api-key" # API Key
7
- model: "your-model" # 模型名称
8
- base_url: "https://your-model-endpoint" # 完整模型调用地址
6
+ api_key: "15f066c4509845038027ea5746524af5.w4CLSC6ODiKVC1wK" # GLM API Key
7
+ model: "glm-4.7-flash" # 模型名称,可选: glm-4, glm-4-plus, glm-4-flash 等
8
+ base_url: "https://open.bigmodel.cn/api/paas/v4/" # 模型服务地址
package/normalizer.py CHANGED
@@ -53,13 +53,22 @@ def _normalize_with_llm(block: ParsedBlock, status: str) -> List[KnowledgeItem]:
53
53
  response = client.chat.completions.create(
54
54
  model=config.model,
55
55
  messages=[
56
- {"role": "system", "content": "你是严谨的运维知识库整理助手,只能依据输入原文生成结构化知识条目。"},
56
+ {
57
+ "role": "system",
58
+ "content": (
59
+ "你是严谨的运维知识库整理助手,只能依据输入原文生成结构化知识条目。"
60
+ "你必须只返回一个 JSON object,根节点必须只有 items 字段,"
61
+ "且 items 必须是数组。不要返回 Markdown、解释文字或其他根字段。"
62
+ ),
63
+ },
57
64
  {"role": "user", "content": prompt},
58
65
  ],
66
+ stream=False,
59
67
  max_tokens=config.max_tokens,
60
68
  temperature=config.temperature,
69
+ do_sample=False,
61
70
  response_format={"type": "json_object"},
62
- thinking={"type": "disabled"},
71
+ thinking={"type": "disabled", "clear_thinking": True},
63
72
  )
64
73
  except Exception as exc:
65
74
  elapsed = time.monotonic() - started_at
@@ -94,7 +103,10 @@ def _normalize_with_llm(block: ParsedBlock, status: str) -> List[KnowledgeItem]:
94
103
 
95
104
  raw_items = parsed.get("items") if isinstance(parsed, dict) else parsed
96
105
  if not isinstance(raw_items, list):
97
- print("llm parse failed: JSON does not contain an items list")
106
+ print(
107
+ "llm parse failed: JSON does not contain an items list "
108
+ f"top_level={_json_shape(parsed)} preview={_preview(content)}"
109
+ )
98
110
  if attempt >= LLM_MAX_RETRIES:
99
111
  _abort_llm("JSON does not contain an items list after 10 attempts", block)
100
112
  time.sleep(min(2 ** (attempt - 1), 30))
@@ -116,6 +128,13 @@ def _normalize_with_llm(block: ParsedBlock, status: str) -> List[KnowledgeItem]:
116
128
 
117
129
 
118
130
  def _get_zhipu_client_class():
131
+ try:
132
+ from zai import ZaiClient
133
+
134
+ return ZaiClient
135
+ except (ImportError, AttributeError):
136
+ pass
137
+
119
138
  try:
120
139
  from zai import ZhipuAiClient
121
140
 
@@ -177,10 +196,15 @@ def _first_choice(response):
177
196
 
178
197
 
179
198
  def _response_debug(response) -> str:
180
- text = repr(response)
181
- if len(text) > 300:
182
- text = text[:300] + "..."
183
- return _preview(text, limit=300)
199
+ return _preview(repr(response))
200
+
201
+
202
+ def _json_shape(value) -> str:
203
+ if isinstance(value, dict):
204
+ return f"object keys={list(value.keys())[:10]}"
205
+ if isinstance(value, list):
206
+ return f"array len={len(value)}"
207
+ return type(value).__name__
184
208
 
185
209
 
186
210
  def _abort_llm(message: str, block: ParsedBlock) -> None:
@@ -205,6 +229,7 @@ def _build_prompt(block: ParsedBlock, status: str) -> str:
205
229
  6. 涉及表格、阈值、比较符、单位、持续时间、笔数、适用对象时必须保留原始逻辑。
206
230
  7. 输出严格 JSON 对象,不要 Markdown 代码围栏,不要解释文字,不要在 JSON 前后添加任何内容。
207
231
  8. status 固定为 "{status}"。
232
+ 9. JSON 根节点必须严格为一个对象:{{"items": [...]}}。禁止返回单个 item 对象、禁止返回纯数组、禁止返回 result/data/records/knowledge_items 等其他根字段。
208
233
 
209
234
  doc_type 只能取:
210
235
  {", ".join(sorted(DOC_TYPES))}
@@ -281,8 +306,8 @@ def _json_candidates(text: str) -> List[str]:
281
306
  return candidates
282
307
 
283
308
 
284
- def _preview(text: str, limit: int = 500) -> str:
285
- return re.sub(r"\s+", " ", text).strip()[:limit]
309
+ def _preview(text: str) -> str:
310
+ return re.sub(r"\s+", " ", text).strip()
286
311
 
287
312
 
288
313
  def _normalize_heuristically(block: ParsedBlock, status: str) -> KnowledgeItem:
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "union_kb_ingest",
3
- "version": "1.0.0",
3
+ "version": "1.0.1",
4
4
  "description": "Offline knowledge-base ingest helper for PDF, Word, Markdown and TXT documents.",
5
5
  "bin": {
6
6
  "union_kb_ingest": "bin/union_kb_ingest"
package/requirements.txt CHANGED
@@ -3,6 +3,9 @@ pyyaml>=6.0.1
3
3
  zhipuai>=2.1.0
4
4
  sniffio>=1.3.0
5
5
 
6
+ # Pin numpy to avoid incompatible x86-64-v2 wheels on older offline machines.
7
+ numpy==1.26.4
8
+
6
9
  # Docling slim plus only file-format backends used by this offline tool.
7
10
  # Do not install `docling` or `docling-slim[standard]` here: those pull OCR,
8
11
  # layout/table ML models, torch/onnxruntime, and may try to download artifacts.