union_kb_ingest 1.0.0 → 1.0.1
This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- package/config/config.yaml +4 -4
- package/normalizer.py +34 -9
- package/package.json +1 -1
- package/requirements.txt +3 -0
package/config/config.yaml
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
llm:
|
|
2
|
-
enabled:
|
|
2
|
+
enabled: true
|
|
3
3
|
timeout_seconds: 120
|
|
4
4
|
max_tokens: 4096
|
|
5
5
|
temperature: 0.1
|
|
6
|
-
api_key: "
|
|
7
|
-
model: "
|
|
8
|
-
base_url: "https://
|
|
6
|
+
api_key: "15f066c4509845038027ea5746524af5.w4CLSC6ODiKVC1wK" # GLM API Key
|
|
7
|
+
model: "glm-4.7-flash" # 模型名称,可选: glm-4, glm-4-plus, glm-4-flash 等
|
|
8
|
+
base_url: "https://open.bigmodel.cn/api/paas/v4/" # 模型服务地址
|
package/normalizer.py
CHANGED
|
@@ -53,13 +53,22 @@ def _normalize_with_llm(block: ParsedBlock, status: str) -> List[KnowledgeItem]:
|
|
|
53
53
|
response = client.chat.completions.create(
|
|
54
54
|
model=config.model,
|
|
55
55
|
messages=[
|
|
56
|
-
{
|
|
56
|
+
{
|
|
57
|
+
"role": "system",
|
|
58
|
+
"content": (
|
|
59
|
+
"你是严谨的运维知识库整理助手,只能依据输入原文生成结构化知识条目。"
|
|
60
|
+
"你必须只返回一个 JSON object,根节点必须只有 items 字段,"
|
|
61
|
+
"且 items 必须是数组。不要返回 Markdown、解释文字或其他根字段。"
|
|
62
|
+
),
|
|
63
|
+
},
|
|
57
64
|
{"role": "user", "content": prompt},
|
|
58
65
|
],
|
|
66
|
+
stream=False,
|
|
59
67
|
max_tokens=config.max_tokens,
|
|
60
68
|
temperature=config.temperature,
|
|
69
|
+
do_sample=False,
|
|
61
70
|
response_format={"type": "json_object"},
|
|
62
|
-
thinking={"type": "disabled"},
|
|
71
|
+
thinking={"type": "disabled", "clear_thinking": True},
|
|
63
72
|
)
|
|
64
73
|
except Exception as exc:
|
|
65
74
|
elapsed = time.monotonic() - started_at
|
|
@@ -94,7 +103,10 @@ def _normalize_with_llm(block: ParsedBlock, status: str) -> List[KnowledgeItem]:
|
|
|
94
103
|
|
|
95
104
|
raw_items = parsed.get("items") if isinstance(parsed, dict) else parsed
|
|
96
105
|
if not isinstance(raw_items, list):
|
|
97
|
-
print(
|
|
106
|
+
print(
|
|
107
|
+
"llm parse failed: JSON does not contain an items list "
|
|
108
|
+
f"top_level={_json_shape(parsed)} preview={_preview(content)}"
|
|
109
|
+
)
|
|
98
110
|
if attempt >= LLM_MAX_RETRIES:
|
|
99
111
|
_abort_llm("JSON does not contain an items list after 10 attempts", block)
|
|
100
112
|
time.sleep(min(2 ** (attempt - 1), 30))
|
|
@@ -116,6 +128,13 @@ def _normalize_with_llm(block: ParsedBlock, status: str) -> List[KnowledgeItem]:
|
|
|
116
128
|
|
|
117
129
|
|
|
118
130
|
def _get_zhipu_client_class():
|
|
131
|
+
try:
|
|
132
|
+
from zai import ZaiClient
|
|
133
|
+
|
|
134
|
+
return ZaiClient
|
|
135
|
+
except (ImportError, AttributeError):
|
|
136
|
+
pass
|
|
137
|
+
|
|
119
138
|
try:
|
|
120
139
|
from zai import ZhipuAiClient
|
|
121
140
|
|
|
@@ -177,10 +196,15 @@ def _first_choice(response):
|
|
|
177
196
|
|
|
178
197
|
|
|
179
198
|
def _response_debug(response) -> str:
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
199
|
+
return _preview(repr(response))
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _json_shape(value) -> str:
|
|
203
|
+
if isinstance(value, dict):
|
|
204
|
+
return f"object keys={list(value.keys())[:10]}"
|
|
205
|
+
if isinstance(value, list):
|
|
206
|
+
return f"array len={len(value)}"
|
|
207
|
+
return type(value).__name__
|
|
184
208
|
|
|
185
209
|
|
|
186
210
|
def _abort_llm(message: str, block: ParsedBlock) -> None:
|
|
@@ -205,6 +229,7 @@ def _build_prompt(block: ParsedBlock, status: str) -> str:
|
|
|
205
229
|
6. 涉及表格、阈值、比较符、单位、持续时间、笔数、适用对象时必须保留原始逻辑。
|
|
206
230
|
7. 输出严格 JSON 对象,不要 Markdown 代码围栏,不要解释文字,不要在 JSON 前后添加任何内容。
|
|
207
231
|
8. status 固定为 "{status}"。
|
|
232
|
+
9. JSON 根节点必须严格为一个对象:{{"items": [...]}}。禁止返回单个 item 对象、禁止返回纯数组、禁止返回 result/data/records/knowledge_items 等其他根字段。
|
|
208
233
|
|
|
209
234
|
doc_type 只能取:
|
|
210
235
|
{", ".join(sorted(DOC_TYPES))}
|
|
@@ -281,8 +306,8 @@ def _json_candidates(text: str) -> List[str]:
|
|
|
281
306
|
return candidates
|
|
282
307
|
|
|
283
308
|
|
|
284
|
-
def _preview(text: str
|
|
285
|
-
return re.sub(r"\s+", " ", text).strip()
|
|
309
|
+
def _preview(text: str) -> str:
|
|
310
|
+
return re.sub(r"\s+", " ", text).strip()
|
|
286
311
|
|
|
287
312
|
|
|
288
313
|
def _normalize_heuristically(block: ParsedBlock, status: str) -> KnowledgeItem:
|
package/package.json
CHANGED
package/requirements.txt
CHANGED
|
@@ -3,6 +3,9 @@ pyyaml>=6.0.1
|
|
|
3
3
|
zhipuai>=2.1.0
|
|
4
4
|
sniffio>=1.3.0
|
|
5
5
|
|
|
6
|
+
# Pin numpy to avoid incompatible x86-64-v2 wheels on older offline machines.
|
|
7
|
+
numpy==1.26.4
|
|
8
|
+
|
|
6
9
|
# Docling slim plus only file-format backends used by this offline tool.
|
|
7
10
|
# Do not install `docling` or `docling-slim[standard]` here: those pull OCR,
|
|
8
11
|
# layout/table ML models, torch/onnxruntime, and may try to download artifacts.
|