union_kb_ingest 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ArkKickidcService.java +578 -0
- package/README.md +15 -8
- package/app_config.py +20 -0
- package/config/config.yaml +9 -4
- package/ingest.py +73 -2
- package/normalizer.py +130 -29
- package/package.json +2 -1
- package/requirements.txt +0 -3
- package/schemas.py +1 -1
- package/splitter.py +2 -0
|
@@ -0,0 +1,578 @@
|
|
|
1
|
+
import com.alibaba.fastjson.JSON;
|
|
2
|
+
import com.alibaba.fastjson.JSONArray;
|
|
3
|
+
import com.alibaba.fastjson.JSONObject;
|
|
4
|
+
import com.alibaba.fastjson.TypeReference;
|
|
5
|
+
import lombok.Data;
|
|
6
|
+
import lombok.extern.slf4j.Slf4j;
|
|
7
|
+
import org.slf4j.Logger;
|
|
8
|
+
import org.slf4j.LoggerFactory;
|
|
9
|
+
import org.slf4j.MDC;
|
|
10
|
+
import org.springframework.beans.factory.annotation.Autowired;
|
|
11
|
+
import org.springframework.beans.factory.annotation.Value;
|
|
12
|
+
import org.springframework.stereotype.Service;
|
|
13
|
+
|
|
14
|
+
import java.util.ArrayList;
|
|
15
|
+
import java.util.Date;
|
|
16
|
+
import java.util.HashMap;
|
|
17
|
+
import java.util.List;
|
|
18
|
+
import java.util.Map;
|
|
19
|
+
|
|
20
|
+
/**
|
|
21
|
+
* Created by liuhonghui on 2019/10/7.
|
|
22
|
+
*/
|
|
23
|
+
@Service
|
|
24
|
+
@Slf4j
|
|
25
|
+
@Data
|
|
26
|
+
public class ArkKickidcService {
|
|
27
|
+
|
|
28
|
+
Logger securityLogger = LoggerFactory.getLogger(LogFileType.ARK_SECURITY.getLogFileType());
|
|
29
|
+
|
|
30
|
+
@Value("${db.name}")
|
|
31
|
+
private String dbName;
|
|
32
|
+
|
|
33
|
+
@Value("${db.key}")
|
|
34
|
+
private String dbKey;
|
|
35
|
+
|
|
36
|
+
@Value("${application.name}")
|
|
37
|
+
private String applicationName;
|
|
38
|
+
|
|
39
|
+
@Value("${exchange.db.url}")
|
|
40
|
+
private String dbUrl;
|
|
41
|
+
|
|
42
|
+
@Value("${exchange.db.type}")
|
|
43
|
+
private String dbType;
|
|
44
|
+
|
|
45
|
+
@Value("${exchange.db.golden.url}")
|
|
46
|
+
private String goldenDbUrl;
|
|
47
|
+
|
|
48
|
+
@Autowired
|
|
49
|
+
private TaskServiceClient taskServiceClient;
|
|
50
|
+
|
|
51
|
+
@Autowired
|
|
52
|
+
private ArkControlBaseFacade arkControlBaseFacade;
|
|
53
|
+
|
|
54
|
+
@Autowired
|
|
55
|
+
private DictCacheService dictCacheService;
|
|
56
|
+
|
|
57
|
+
private static final String UPDATEDBY = "kickidc";
|
|
58
|
+
private static final String GOLDEN_DB_TYPE = "goldenDb";
|
|
59
|
+
private static final String GOLDEN_DB_SUCCESS_CODE = "20000";
|
|
60
|
+
private static final String GOLDEN_DB_PARAM_ERROR_CODE = "10001";
|
|
61
|
+
private static final String GOLDEN_DB_DUPLICATE_REQUEST_CODE = "10002";
|
|
62
|
+
private static final String GOLDEN_DB_SWITCH_FAIL_CODE = "10003";
|
|
63
|
+
private static final String GOLDEN_DB_INTERNAL_ERROR_CODE = "10004";
|
|
64
|
+
private static final String GOLDEN_DB_PLAN_SWITCH = "planswitch";
|
|
65
|
+
|
|
66
|
+
/**
|
|
67
|
+
* 提机房步骤
|
|
68
|
+
*/
|
|
69
|
+
public List<String> arkKickidc(String idc, String requestTime) {
|
|
70
|
+
List<String> list = new ArrayList<>();
|
|
71
|
+
createMDCParam(idc, requestTime);
|
|
72
|
+
|
|
73
|
+
// 1. 确定在制定idc是否有部署服务
|
|
74
|
+
// 有部署服务
|
|
75
|
+
AnnounceIDCInfoEnum kickIdc = AnnounceIDCInfoEnum.explainIdc(idc);
|
|
76
|
+
|
|
77
|
+
// KickIdcParamEnum kickIdc = KickIdcParamEnum.explain(idc);
|
|
78
|
+
if (null == kickIdc) {
|
|
79
|
+
MDC.put("idc", idc);
|
|
80
|
+
MDC.put("requestType", "2");
|
|
81
|
+
MDC.put("ip", "-");
|
|
82
|
+
MDC.put("errCode", "联合运维团队-" + idc + "无业务部署");
|
|
83
|
+
log.error("arkKickidc ark-web idc error,idc={}", idc);
|
|
84
|
+
securityLogger.info("arkKickidc ark-web idc error,idc={}", idc);
|
|
85
|
+
list.add("联合运维团队-" + idc + "无业务部署");
|
|
86
|
+
return list;
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
String targetIdc = kickIdc.getTargetIdc();
|
|
90
|
+
try {
|
|
91
|
+
List<AnnounceDataDictionaryDTO> targetIdcList = dictCacheService.getDictModelList("targetIdc");
|
|
92
|
+
targetIdc = targetIdcList.stream()
|
|
93
|
+
.filter(c -> kickIdc.getKickIdc().equals(c.getDataName()))
|
|
94
|
+
.findFirst()
|
|
95
|
+
.get()
|
|
96
|
+
.getDataValue();
|
|
97
|
+
} catch (Exception e) {
|
|
98
|
+
log.error("获取切换目标IDC失败,使用默认配置");
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
return arkChangeIdc(targetIdc, idc);
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
public List<String> arkRecoveridc(String targetIdc, String requestTime) {
|
|
105
|
+
List<String> list = new ArrayList<>();
|
|
106
|
+
createMDCParam(targetIdc, requestTime);
|
|
107
|
+
|
|
108
|
+
// 1. 确定在目标idc是否有部署服务
|
|
109
|
+
AnnounceIDCInfoEnum targetIdcEnum = AnnounceIDCInfoEnum.explainIdc(targetIdc);
|
|
110
|
+
// KickIdcParamEnum targetIdcEnum = KickIdcParamEnum.explain(targetIdc);
|
|
111
|
+
if (null == targetIdcEnum) {
|
|
112
|
+
MDC.put("requestType", "2");
|
|
113
|
+
MDC.put("ip", "-");
|
|
114
|
+
MDC.put("errCode", "联合运维团队-" + targetIdc + "无业务部署,无法切换");
|
|
115
|
+
log.error("联合运维团队{}机房无业务部署", targetIdc);
|
|
116
|
+
list.add("联合运维团队-" + targetIdc + "无业务部署");
|
|
117
|
+
return list;
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
return arkChangeIdc(targetIdc, null);
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
/**
|
|
124
|
+
* 切机房步骤
|
|
125
|
+
*/
|
|
126
|
+
public List<String> arkChangeIdc(String targetIdc, String kickIdc) {
|
|
127
|
+
// 踢机房流程开始,需要区分mysql和goldenDb
|
|
128
|
+
// 如果能读取到开关就用开关控制,不能就配置文件控制
|
|
129
|
+
boolean useGoldenDb = GOLDEN_DB_TYPE.equals(dbType);
|
|
130
|
+
try {
|
|
131
|
+
useGoldenDb = dictCacheService.getDictModelListMap()
|
|
132
|
+
.getOrDefault("kickGoldenDb", new ArrayList<>())
|
|
133
|
+
.stream()
|
|
134
|
+
.anyMatch(c -> c.getDataValue().equals("1"));
|
|
135
|
+
} catch (Exception e) {
|
|
136
|
+
log.info("读取数据字典dbType配置失败,使用配置文件dbType, dbType={}", dbType);
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
if (useGoldenDb) {
|
|
140
|
+
return kickGoldenDb(targetIdc, kickIdc);
|
|
141
|
+
}
|
|
142
|
+
return kickMySql(targetIdc, kickIdc);
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
private List<String> kickGoldenDb(String targetIdc, String kickIdc) {
|
|
146
|
+
List<String> list = new ArrayList<>();
|
|
147
|
+
try {
|
|
148
|
+
String aliveIdc = getGoldenDbMasterIdcFlag();
|
|
149
|
+
MDC.put("idc", aliveIdc);
|
|
150
|
+
|
|
151
|
+
if (kickIdc != null && !aliveIdc.equals(kickIdc)) {
|
|
152
|
+
MDC.put("idc", kickIdc);
|
|
153
|
+
MDC.put("requestType", "2");
|
|
154
|
+
MDC.put("errCode", "联合运维团队-" + kickIdc + "不是主机房,无须切换");
|
|
155
|
+
log.error("arkKickidc ark-web goldenDb idc error,idc={}", kickIdc);
|
|
156
|
+
securityLogger.info("arkKickidc ark-web goldenDb idc error,idc={}", kickIdc);
|
|
157
|
+
list.add("联合运维团队-" + kickIdc + "不是主机房,无须切换");
|
|
158
|
+
return list;
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
if (aliveIdc.equals(targetIdc)) {
|
|
162
|
+
MDC.put("requestType", "2");
|
|
163
|
+
MDC.put("errCode", "联合运维团队-" + targetIdc + "已是主机房,无须切换");
|
|
164
|
+
log.error("arkChangeidc ark-web goldenDb idc error,idc={}", targetIdc);
|
|
165
|
+
securityLogger.info("arkChangeidc ark-web goldenDb idc error,idc={}", targetIdc);
|
|
166
|
+
list.add("联合运维团队-" + targetIdc + "已是主机房,无须切换");
|
|
167
|
+
return list;
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
MDC.put("ip", targetIdc);
|
|
171
|
+
MDC.put("errCode", "联合运维团队开始执行goldenDb切机房操作,当前机房为" + aliveIdc + ",目标机房为" + targetIdc);
|
|
172
|
+
log.error("arkChangeidc ark-web goldenDb idc start,idc={}", aliveIdc);
|
|
173
|
+
securityLogger.info("arkChangeidc ark-web goldenDb idc start,idc={}", aliveIdc);
|
|
174
|
+
list.add("联合运维团队开始执行goldenDb切机房操作,所属机房为" + aliveIdc + ",目标机房为" + targetIdc);
|
|
175
|
+
|
|
176
|
+
exchangeGoldenDb(aliveIdc, targetIdc);
|
|
177
|
+
MDC.put("errCode", "联合运维团队-执行goldenDb切换操作");
|
|
178
|
+
log.error("arkChangeidc ark-web goldenDb切换完成,idc={}", aliveIdc);
|
|
179
|
+
securityLogger.info("arkChangeidc ark-web goldenDb切换完成,idc={}", aliveIdc);
|
|
180
|
+
list.add("联合运维团队-执行goldenDb切换操作成功");
|
|
181
|
+
|
|
182
|
+
this.exchangeSchedule(aliveIdc, targetIdc);
|
|
183
|
+
MDC.put("errCode", "联合运维团队-执行定时任务起停操作");
|
|
184
|
+
log.error("arkChangeidc ark-web goldenDb切换应用完成,idc={}", aliveIdc);
|
|
185
|
+
securityLogger.info("arkChangeidc ark-web goldenDb切换应用完成,idc={}", aliveIdc);
|
|
186
|
+
list.add("联合运维团队-执行定时任务起停操作成功");
|
|
187
|
+
addChangeIdcSuccessResult(list, targetIdc);
|
|
188
|
+
return list;
|
|
189
|
+
} catch (BusinessException e) {
|
|
190
|
+
MDC.put("requestType", "0");
|
|
191
|
+
MDC.put("errCode", e.getMessage());
|
|
192
|
+
log.info("arkChangeidc ark-web goldenDb idc error,idc={}", targetIdc, e);
|
|
193
|
+
log.error("arkChangeidc ark-web goldenDb idc error,idc={}", targetIdc);
|
|
194
|
+
securityLogger.info("arkChangeidc ark-web goldenDb idc error,idc={}", targetIdc);
|
|
195
|
+
list.add("联合运维团队执行goldenDb踢机房失败:" + e.getMessage());
|
|
196
|
+
return list;
|
|
197
|
+
}
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
/**
|
|
201
|
+
* 隔离mysql主备,用老的流程
|
|
202
|
+
*/
|
|
203
|
+
private List<String> kickMySql(String targetIdc, String kickIdc) {
|
|
204
|
+
List<String> list = new ArrayList<>();
|
|
205
|
+
try {
|
|
206
|
+
String aliveIdc = getMasterIdcFlag();
|
|
207
|
+
MDC.put("idc", aliveIdc);
|
|
208
|
+
|
|
209
|
+
if (kickIdc != null && !aliveIdc.equals(kickIdc)) {
|
|
210
|
+
MDC.put("idc", kickIdc);
|
|
211
|
+
MDC.put("requestType", "2");
|
|
212
|
+
MDC.put("errCode", "联合运维团队-" + kickIdc + "不是主机房,无须切换");
|
|
213
|
+
log.error("arkKickidc ark-web idc error,idc={}", kickIdc);
|
|
214
|
+
securityLogger.info("arkKickidc ark-web idc error,idc={}", kickIdc);
|
|
215
|
+
list.add("联合运维团队-" + kickIdc + "不是主机房,无须切换");
|
|
216
|
+
return list;
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
// 当前机房既是目标机房
|
|
220
|
+
if (aliveIdc.equals(targetIdc)) {
|
|
221
|
+
MDC.put("requestType", "2");
|
|
222
|
+
MDC.put("errCode", "联合运维团队-" + targetIdc + "已是主机房,无须切换");
|
|
223
|
+
log.error("arkChangeidc ark-web idc error,idc={}", targetIdc);
|
|
224
|
+
securityLogger.info("arkChangeidc ark-web idc error,idc={}", targetIdc);
|
|
225
|
+
list.add("联合运维团队-" + targetIdc + "已是主机房,无须切换");
|
|
226
|
+
return list;
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
MDC.put("ip", targetIdc);
|
|
230
|
+
MDC.put("errCode", "联合运维团队开始执行切机房操作,当前机房为" + aliveIdc + ",目标机房为" + targetIdc);
|
|
231
|
+
log.error("arkChangeidc ark-web idc start,idc={}", aliveIdc);
|
|
232
|
+
securityLogger.info("arkKickidc ark-web idc start,idc={}", aliveIdc);
|
|
233
|
+
list.add("联合运维团队开始执行切机房操作,所属机房为" + aliveIdc + ",目标机房为" + targetIdc);
|
|
234
|
+
|
|
235
|
+
// 1) 调用db接口
|
|
236
|
+
String failcode = null;
|
|
237
|
+
failcode = this.exchangeDB(aliveIdc, targetIdc);
|
|
238
|
+
if ("11112".equals(failcode)) {
|
|
239
|
+
MDC.put("errCode", "联合运维团队-已在目标主库" + targetIdc + "无须切换");
|
|
240
|
+
log.error("arkChangeidc ark-web 切换DB完成,idc={}", aliveIdc);
|
|
241
|
+
securityLogger.info("arkChangeidc ark-web 切换DB完成,idc={}", aliveIdc);
|
|
242
|
+
MDC.put("requestType", "2");
|
|
243
|
+
MDC.put("errCode", "联合运维团队-所有切机房操作执行成功");
|
|
244
|
+
log.error("lhyw arkChangeidc success,idc={}", aliveIdc);
|
|
245
|
+
securityLogger.info("lhyw arkChangeidc success,idc={}", aliveIdc);
|
|
246
|
+
list.add("联合运维团队-已在目标主库" + aliveIdc + "无须切换");
|
|
247
|
+
list.add("联合运维团队-所有踢机房操作执行成功");
|
|
248
|
+
return list;
|
|
249
|
+
}
|
|
250
|
+
|
|
251
|
+
MDC.put("errCode", "联合运维团队-执行DB切换操作");
|
|
252
|
+
log.error("arkChangeidc ark-web 切换DB完成,idc={}", aliveIdc);
|
|
253
|
+
securityLogger.info("arkChangeidc ark-web 切换DB完成,idc={}", aliveIdc);
|
|
254
|
+
list.add("联合运维团队-执行DB切换操作切换成功");
|
|
255
|
+
|
|
256
|
+
// 2) 切换定时任务
|
|
257
|
+
this.exchangeSchedule(aliveIdc, targetIdc);
|
|
258
|
+
MDC.put("errCode", "联合运维团队-执行定时任务起停操作");
|
|
259
|
+
log.error("arkChangeidc ark-web 切换应用完成,idc={}", aliveIdc);
|
|
260
|
+
securityLogger.info("arkChangeidc ark-web 切换应用完成,idc={}", aliveIdc);
|
|
261
|
+
list.add("联合运维团队-执行定时任务起停操作成功");
|
|
262
|
+
addChangeIdcSuccessResult(list, targetIdc);
|
|
263
|
+
return list;
|
|
264
|
+
} catch (BusinessException e) {
|
|
265
|
+
MDC.put("requestType", "0");
|
|
266
|
+
MDC.put("errCode", e.getMessage());
|
|
267
|
+
log.info("arkChangeidc ark-web idc error,idc={}", targetIdc, e);
|
|
268
|
+
log.error("arkChangeidc ark-web idc error,idc={}", targetIdc);
|
|
269
|
+
securityLogger.info("arkChangeidc ark-web idc error,idc={}", targetIdc);
|
|
270
|
+
list.add("联合运维团队执行踢机房失败:" + e.getMessage());
|
|
271
|
+
return list;
|
|
272
|
+
}
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
private void addChangeIdcSuccessResult(List<String> list, String targetIdc) {
|
|
276
|
+
MDC.put("requestType", "2");
|
|
277
|
+
MDC.put("errCode", "联合运维团队-所有踢机房操作执行成功");
|
|
278
|
+
log.error("lhyw arkChangeidc success,idc={}", targetIdc);
|
|
279
|
+
securityLogger.info("lhyw arkChangeidc success,idc={}", targetIdc);
|
|
280
|
+
list.add("联合运维团队-所有踢机房操作执行成功");
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
/**
|
|
284
|
+
* 查询主机房
|
|
285
|
+
*
|
|
286
|
+
* @return
|
|
287
|
+
*/
|
|
288
|
+
private String getGoldenDbMasterIdcFlag() throws BusinessException {
|
|
289
|
+
log.info("开始查询goldenDb主机房");
|
|
290
|
+
JSONObject jsonObject = new JSONObject();
|
|
291
|
+
jsonObject.put("db_name", dbName);
|
|
292
|
+
|
|
293
|
+
String response = postGoldenDb(goldenDbUrl, jsonObject, "query goldenDb masterIdc flag");
|
|
294
|
+
log.info("query goldenDb masterIdc response={}", response);
|
|
295
|
+
JSONObject resJson = JSON.parseObject(response);
|
|
296
|
+
String errCode = resJson.getString("errorCode");
|
|
297
|
+
String errMsg = resJson.getString("errorMsg");
|
|
298
|
+
Boolean checkState = resJson.getBoolean("check_state");
|
|
299
|
+
String masterIdc = resJson.getString("idc_no");
|
|
300
|
+
|
|
301
|
+
if (Boolean.TRUE.equals(checkState) && !isBlank(masterIdc)
|
|
302
|
+
&& (isBlank(errCode) || "ES000000".equals(errCode))) {
|
|
303
|
+
return masterIdc;
|
|
304
|
+
}
|
|
305
|
+
|
|
306
|
+
log.error("query goldenDb masterIdc flag error,errorCode={},errorMsg={},checkState={},idcNo={}",
|
|
307
|
+
errCode, errMsg, checkState, masterIdc);
|
|
308
|
+
throw new BusinessException("查询goldenDb主机房失败:" + buildGoldenDbErrorMessage(errCode, errMsg));
|
|
309
|
+
}
|
|
310
|
+
|
|
311
|
+
private void exchangeGoldenDb(String fromIdc, String targetIdc) throws BusinessException {
|
|
312
|
+
JSONObject jsonObject = new JSONObject();
|
|
313
|
+
jsonObject.put("db_name", dbName);
|
|
314
|
+
jsonObject.put("key", dbKey);
|
|
315
|
+
jsonObject.put("to_idc", targetIdc);
|
|
316
|
+
jsonObject.put("switch_type", GOLDEN_DB_PLAN_SWITCH);
|
|
317
|
+
log.info("exchangeGoldenDb param jsonObject={}", jsonObject);
|
|
318
|
+
|
|
319
|
+
String response = postGoldenDb(goldenDbUrl, jsonObject, "exchange goldenDb");
|
|
320
|
+
log.info("exchangeGoldenDb response={}", response);
|
|
321
|
+
JSONObject resJson = JSON.parseObject(response);
|
|
322
|
+
String switchCode = resJson.getString("switch_code");
|
|
323
|
+
String errorMsg = resJson.getString("error_msg");
|
|
324
|
+
Boolean switchState = resJson.getBoolean("switch_state");
|
|
325
|
+
|
|
326
|
+
if (GOLDEN_DB_SUCCESS_CODE.equals(switchCode) && Boolean.TRUE.equals(switchState)) {
|
|
327
|
+
updateIDCStatus(fromIdc, targetIdc);
|
|
328
|
+
return;
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
String errorDetail = buildGoldenDbSwitchErrorMessage(switchCode, errorMsg, switchState);
|
|
332
|
+
MDC.put("errCode", errorDetail);
|
|
333
|
+
log.error("exchangeGoldenDb error idc={},targetIdc={},switchType={},switchCode={},errorMsg={},switchState={},errorDetail={}",
|
|
334
|
+
fromIdc, targetIdc, GOLDEN_DB_PLAN_SWITCH, switchCode, errorMsg, switchState, errorDetail);
|
|
335
|
+
throw new BusinessException(errorDetail);
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
private String postGoldenDb(String urlConfig, JSONObject jsonObject, String logPrefix) throws BusinessException {
|
|
339
|
+
if (isBlank(urlConfig)) {
|
|
340
|
+
log.error("{} url empty", logPrefix);
|
|
341
|
+
throw new BusinessException(logPrefix + "接口地址未配置");
|
|
342
|
+
}
|
|
343
|
+
|
|
344
|
+
String[] addresses = urlConfig.split(";");
|
|
345
|
+
Exception lastException = null;
|
|
346
|
+
for (int i = 0; i < addresses.length; i++) {
|
|
347
|
+
String address = addresses[i];
|
|
348
|
+
if (isBlank(address)) {
|
|
349
|
+
continue;
|
|
350
|
+
}
|
|
351
|
+
try {
|
|
352
|
+
return HttpUtils.httpPostReq(address.trim(), jsonObject, 180000);
|
|
353
|
+
} catch (Exception e) {
|
|
354
|
+
lastException = e;
|
|
355
|
+
log.error("{} address[{}] error,address={},e={}", logPrefix, i, address, e);
|
|
356
|
+
}
|
|
357
|
+
}
|
|
358
|
+
|
|
359
|
+
throw new BusinessException(logPrefix + "接口调用失败" +
|
|
360
|
+
(lastException == null ? "" : ":" + lastException.getMessage()));
|
|
361
|
+
}
|
|
362
|
+
|
|
363
|
+
private String buildGoldenDbErrorMessage(String code, String message) {
|
|
364
|
+
if (isBlank(code) && isBlank(message)) {
|
|
365
|
+
return "接口返回异常";
|
|
366
|
+
}
|
|
367
|
+
if (isBlank(message)) {
|
|
368
|
+
return code;
|
|
369
|
+
}
|
|
370
|
+
if (isBlank(code)) {
|
|
371
|
+
return message;
|
|
372
|
+
}
|
|
373
|
+
return code + "-" + message;
|
|
374
|
+
}
|
|
375
|
+
|
|
376
|
+
private String buildGoldenDbSwitchErrorMessage(String switchCode, String errorMsg, Boolean switchState) {
|
|
377
|
+
String reason;
|
|
378
|
+
if (GOLDEN_DB_PARAM_ERROR_CODE.equals(switchCode)) {
|
|
379
|
+
reason = "参数错误";
|
|
380
|
+
} else if (GOLDEN_DB_DUPLICATE_REQUEST_CODE.equals(switchCode)) {
|
|
381
|
+
reason = "重复请求";
|
|
382
|
+
} else if (GOLDEN_DB_SWITCH_FAIL_CODE.equals(switchCode)) {
|
|
383
|
+
reason = "切换失败";
|
|
384
|
+
} else if (GOLDEN_DB_INTERNAL_ERROR_CODE.equals(switchCode)) {
|
|
385
|
+
reason = "内部错误";
|
|
386
|
+
} else if (GOLDEN_DB_SUCCESS_CODE.equals(switchCode)) {
|
|
387
|
+
reason = "接口返回成功码但切换状态异常";
|
|
388
|
+
} else {
|
|
389
|
+
reason = "接口返回异常";
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
String detail = buildGoldenDbErrorMessage(switchCode, errorMsg);
|
|
393
|
+
return "联合运维goldenDb切换失败:" + reason + ",switchState=" + switchState + ",detail=" + detail;
|
|
394
|
+
}
|
|
395
|
+
|
|
396
|
+
private boolean isBlank(String value) {
|
|
397
|
+
return value == null || value.trim().isEmpty();
|
|
398
|
+
}
|
|
399
|
+
|
|
400
|
+
private String getMasterIdcFlag() throws BusinessException {
|
|
401
|
+
String masterIdc = null;
|
|
402
|
+
Request request = Request.builder()
|
|
403
|
+
.traceId(LoggerUtils.getTraceNo())
|
|
404
|
+
.bizType(BaseDataBizTypeConstants.ANNOUNCE + AnnounceBizTypeConstants.IDC_INFO)
|
|
405
|
+
.operateType(AnnounceOperateTypeConstants.IdcInfoConstants.GET_IDC_FLAG)
|
|
406
|
+
.systemId(SystemIdConstants.ARK_ID)
|
|
407
|
+
.requestTime(new Date())
|
|
408
|
+
.paramJson("getIdcFlag")
|
|
409
|
+
.build();
|
|
410
|
+
com.epcc.dubbo.result.Result<String> result = null;
|
|
411
|
+
try {
|
|
412
|
+
result = arkControlBaseFacade.processBiz(request);
|
|
413
|
+
masterIdc = JSON.parseObject(result.getResult(), new TypeReference<String>() {
|
|
414
|
+
});
|
|
415
|
+
log.info("current master idc={}", masterIdc);
|
|
416
|
+
} catch (Exception e1) {
|
|
417
|
+
log.error("kickidc getIDCFlag error,e1={}", e1);
|
|
418
|
+
try {
|
|
419
|
+
result = arkControlBaseFacade.processBiz(request);
|
|
420
|
+
masterIdc = JSON.parseObject(result.getResult(), new TypeReference<String>() {
|
|
421
|
+
});
|
|
422
|
+
} catch (Exception e2) {
|
|
423
|
+
log.error("kickidc updateIDCStatus error,e2={}", e2);
|
|
424
|
+
throw new BusinessException("远程获取数据库IDC有误");
|
|
425
|
+
}
|
|
426
|
+
}
|
|
427
|
+
|
|
428
|
+
return masterIdc;
|
|
429
|
+
}
|
|
430
|
+
|
|
431
|
+
private void createMDCParam(String idc, String requestTime) {
|
|
432
|
+
MDC.put("idc", idc);
|
|
433
|
+
MDC.put("requestTime", requestTime);
|
|
434
|
+
MDC.put("applicationName", applicationName);
|
|
435
|
+
MDC.put("updatedBy", UPDATEDBY);
|
|
436
|
+
}
|
|
437
|
+
|
|
438
|
+
protected void exchangeSchedule(String idc, String excIdc) throws BusinessException {
|
|
439
|
+
// 停止定时任务
|
|
440
|
+
com.epcc.dubbo.result.Result<JSONArray> resultArray = null;
|
|
441
|
+
JSONArray jsonArray = null;
|
|
442
|
+
JobProperties jobProperties = new JobProperties();
|
|
443
|
+
|
|
444
|
+
try {
|
|
445
|
+
resultArray = taskServiceClient.findJobs(idc);
|
|
446
|
+
jsonArray = resultArray.getResult();
|
|
447
|
+
for (int i = 0; i < jsonArray.size(); i++) {
|
|
448
|
+
JSONObject jsonObject = (JSONObject) jsonArray.get(i);
|
|
449
|
+
jobProperties.setName(jsonObject.getString("name"));
|
|
450
|
+
taskServiceClient.jobOperate(idc, JobOperateStatus.PAUSE, jobProperties);
|
|
451
|
+
}
|
|
452
|
+
} catch (Exception e1) {
|
|
453
|
+
log.info("exchangeSchedule PAUSE1 errorCode={}", e1);
|
|
454
|
+
try {
|
|
455
|
+
resultArray = taskServiceClient.findJobs(idc);
|
|
456
|
+
jsonArray = resultArray.getResult();
|
|
457
|
+
for (int i = 0; i < jsonArray.size(); i++) {
|
|
458
|
+
JSONObject jsonObject = (JSONObject) jsonArray.get(i);
|
|
459
|
+
jobProperties.setName(jsonObject.getString("name"));
|
|
460
|
+
taskServiceClient.jobOperate(idc, JobOperateStatus.PAUSE, jobProperties);
|
|
461
|
+
}
|
|
462
|
+
} catch (Exception e2) {
|
|
463
|
+
log.info("exchangeSchedule PAUSE2 errorCode={}", e2);
|
|
464
|
+
throw new BusinessException("lhyw 定时任务禁用失败");
|
|
465
|
+
}
|
|
466
|
+
}
|
|
467
|
+
|
|
468
|
+
// 启动定时任务
|
|
469
|
+
try {
|
|
470
|
+
resultArray = taskServiceClient.findJobs(excIdc);
|
|
471
|
+
jsonArray = resultArray.getResult();
|
|
472
|
+
for (int i = 0; i < jsonArray.size(); i++) {
|
|
473
|
+
JSONObject jsonObject = (JSONObject) jsonArray.get(i);
|
|
474
|
+
jobProperties.setName(jsonObject.getString("name"));
|
|
475
|
+
taskServiceClient.jobOperate(excIdc, JobOperateStatus.RESUME, jobProperties);
|
|
476
|
+
}
|
|
477
|
+
} catch (Exception e3) {
|
|
478
|
+
log.info("exchangeSchedule RESUME e3={}", e3);
|
|
479
|
+
try {
|
|
480
|
+
resultArray = taskServiceClient.findJobs(excIdc);
|
|
481
|
+
jsonArray = resultArray.getResult();
|
|
482
|
+
for (int i = 0; i < jsonArray.size(); i++) {
|
|
483
|
+
JSONObject jsonObject = (JSONObject) jsonArray.get(i);
|
|
484
|
+
jobProperties.setName(jsonObject.getString("name"));
|
|
485
|
+
taskServiceClient.jobOperate(excIdc, JobOperateStatus.RESUME, jobProperties);
|
|
486
|
+
}
|
|
487
|
+
} catch (Exception e4) {
|
|
488
|
+
log.info("exchangeSchedule RESUME2 e4={}", e4);
|
|
489
|
+
throw new BusinessException("lhyw 定时任务启用失败");
|
|
490
|
+
}
|
|
491
|
+
}
|
|
492
|
+
}
|
|
493
|
+
|
|
494
|
+
protected String exchangeDB(String kickIdc, String targetIdc) throws BusinessException {
|
|
495
|
+
JSONObject jsonObject = new JSONObject();
|
|
496
|
+
jsonObject.put("db_name", dbName);
|
|
497
|
+
jsonObject.put("key", dbKey);
|
|
498
|
+
jsonObject.put("ts", Long.parseLong(DateTimeUtil.format(new Date())));
|
|
499
|
+
jsonObject.put("to_idc", targetIdc);
|
|
500
|
+
jsonObject.put("allow_data_loss", 0);
|
|
501
|
+
log.info("exchangeDB parma jsonObject={},failover_code={}", jsonObject);
|
|
502
|
+
String response = null;
|
|
503
|
+
|
|
504
|
+
String[] address = dbUrl.split(";");
|
|
505
|
+
try {
|
|
506
|
+
response = HttpUtils.httpPostReq(address[0], jsonObject, 180000);
|
|
507
|
+
} catch (Exception e1) {
|
|
508
|
+
log.error("db exchange address[0] error,e={}", e1);
|
|
509
|
+
try {
|
|
510
|
+
response = HttpUtils.httpPostReq(address[1], jsonObject, 180000);
|
|
511
|
+
} catch (Exception e2) {
|
|
512
|
+
log.error("db exchange address[1] error,e={}", e2);
|
|
513
|
+
try {
|
|
514
|
+
response = HttpUtils.httpPostReq(address[2], jsonObject, 180000);
|
|
515
|
+
} catch (Exception e3) {
|
|
516
|
+
log.error("db exchange address[2] error,e={}", e3);
|
|
517
|
+
throw new BusinessException("联合运维DB接口调用失败");
|
|
518
|
+
}
|
|
519
|
+
}
|
|
520
|
+
}
|
|
521
|
+
|
|
522
|
+
log.info("exchangeDB response={}", response);
|
|
523
|
+
JSONObject resJson = JSON.parseObject(response);
|
|
524
|
+
String failCode = resJson.getString("failover_code");
|
|
525
|
+
|
|
526
|
+
if ("20000".equals(failCode)) {
|
|
527
|
+
// 修改数据库标识位
|
|
528
|
+
updateIDCStatus(kickIdc, targetIdc);
|
|
529
|
+
return failCode;
|
|
530
|
+
}
|
|
531
|
+
|
|
532
|
+
if ("11112".equals(failCode)) {
|
|
533
|
+
return failCode;
|
|
534
|
+
}
|
|
535
|
+
|
|
536
|
+
log.info("exchangeDB error idc={},failover_code={}", kickIdc, failCode);
|
|
537
|
+
throw new BusinessException("联合运维DB切换失败");
|
|
538
|
+
}
|
|
539
|
+
|
|
540
|
+
private void updateIDCStatus(String fromIdc, String targetIdc) throws BusinessException {
|
|
541
|
+
this.updateIDCStatus(fromIdc, targetIdc, UPDATEDBY);
|
|
542
|
+
}
|
|
543
|
+
|
|
544
|
+
public void updateIDCStatus(String fromIdc, String targetIdc, String updatedBy) throws BusinessException {
|
|
545
|
+
Map<String, Object> map = new HashMap<>();
|
|
546
|
+
map.put("fromIdc", fromIdc);
|
|
547
|
+
map.put("toIdc", targetIdc);
|
|
548
|
+
map.put("updatedBy", updatedBy);
|
|
549
|
+
map.put("status", 1);
|
|
550
|
+
|
|
551
|
+
Request request = Request.builder()
|
|
552
|
+
.traceId(LoggerUtils.getTraceNo())
|
|
553
|
+
.bizType(BaseDataBizTypeConstants.ANNOUNCE + AnnounceBizTypeConstants.IDC_INFO)
|
|
554
|
+
.operateType(AnnounceOperateTypeConstants.IdcInfoConstants.UPDATE_IDC_STATUS)
|
|
555
|
+
.systemId(SystemIdConstants.ARK_ID)
|
|
556
|
+
.requestTime(new Date())
|
|
557
|
+
.paramJson(JSON.toJSONString(map))
|
|
558
|
+
.build();
|
|
559
|
+
|
|
560
|
+
com.epcc.dubbo.result.Result<String> result = null;
|
|
561
|
+
try {
|
|
562
|
+
result = arkControlBaseFacade.processBiz(request);
|
|
563
|
+
} catch (Exception e1) {
|
|
564
|
+
log.error("kickidc updateIDCStatus error,e1={}", e1);
|
|
565
|
+
try {
|
|
566
|
+
result = arkControlBaseFacade.processBiz(request);
|
|
567
|
+
} catch (Exception e2) {
|
|
568
|
+
log.error("kickidc updateIDCStatus error,e2={}", e2);
|
|
569
|
+
throw new BusinessException("远程更新数据库更新IDC有误");
|
|
570
|
+
}
|
|
571
|
+
}
|
|
572
|
+
|
|
573
|
+
if (!result.isSuccess()) {
|
|
574
|
+
log.error("更新IDC信息表失败,errCode: [{}],errMsg: [{}]", result.getErrorCode(), result.getErrorMsg());
|
|
575
|
+
throw new BusinessException("数据库更新IDC出错");
|
|
576
|
+
}
|
|
577
|
+
}
|
|
578
|
+
}
|
package/README.md
CHANGED
|
@@ -65,6 +65,8 @@ python ingest.py promote
|
|
|
65
65
|
|
|
66
66
|
默认目录为 `input/`、`parsed/`、`drafts/`、`approved/` 和 `result/`。只有需要处理其他目录时,才使用 `--input`、`--output` 或 `--result-dir` 覆盖。
|
|
67
67
|
|
|
68
|
+
`draft` 默认按 `config/config.yaml` 的 `draft.max_chars` 控制单次送入模型的原文长度,并额外提供文档目录和相邻片段摘要作为辅助上下文。这样可以降低私有模型单轮负载,同时尽量保留前后章节关系。命令行仍可用 `--max-chars` 临时覆盖。
|
|
69
|
+
|
|
68
70
|
## 大模型配置
|
|
69
71
|
|
|
70
72
|
默认不强制调用大模型,会使用启发式模板生成 `draft` 文件。
|
|
@@ -74,24 +76,29 @@ python ingest.py promote
|
|
|
74
76
|
```yaml
|
|
75
77
|
llm:
|
|
76
78
|
enabled: true
|
|
77
|
-
base_url: "https://
|
|
78
|
-
api_key: "your-api-key"
|
|
79
|
-
model: "
|
|
79
|
+
base_url: "https://open.bigmodel.cn/api/paas/v4/"
|
|
80
|
+
api_key: "your-zhipu-api-key"
|
|
81
|
+
model: "glm-4.7"
|
|
80
82
|
timeout_seconds: 120
|
|
81
|
-
max_tokens:
|
|
83
|
+
max_tokens: 8192
|
|
82
84
|
temperature: 0.1
|
|
85
|
+
|
|
86
|
+
draft:
|
|
87
|
+
max_chars: 3600
|
|
88
|
+
context_chars: 800
|
|
89
|
+
outline_max_sections: 40
|
|
83
90
|
```
|
|
84
91
|
|
|
85
92
|
也可以继续使用环境变量覆盖配置文件:
|
|
86
93
|
|
|
87
94
|
```bash
|
|
88
95
|
export KB_LLM_ENABLED=true
|
|
89
|
-
export KB_LLM_BASE_URL="https://
|
|
90
|
-
export KB_LLM_API_KEY="your-api-key"
|
|
91
|
-
export KB_LLM_MODEL="
|
|
96
|
+
export KB_LLM_BASE_URL="https://open.bigmodel.cn/api/paas/v4/"
|
|
97
|
+
export KB_LLM_API_KEY="your-zhipu-api-key"
|
|
98
|
+
export KB_LLM_MODEL="glm-4.7"
|
|
92
99
|
```
|
|
93
100
|
|
|
94
|
-
|
|
101
|
+
工具通过 ZhipuAI SDK 调用 GLM,不再包含 OpenAI 调用路径。`base_url` 使用 ZhipuAI SDK 的服务根地址即可。工具不 import 项目 `src` 代码。
|
|
95
102
|
|
|
96
103
|
## 与线上项目的关系
|
|
97
104
|
|
package/app_config.py
CHANGED
|
@@ -24,6 +24,13 @@ class LlmConfig:
|
|
|
24
24
|
temperature: float = 0.1
|
|
25
25
|
|
|
26
26
|
|
|
27
|
+
@dataclass(frozen=True)
class DraftConfig:
    """Immutable settings for the ``draft`` command.

    Values come from the ``draft`` section of the config file and may be
    overridden by the ``KB_DRAFT_*`` environment variables.
    """

    # Maximum characters of source text per block passed to the splitter.
    max_chars: int = 3600
    # Budget for neighbouring-block context; values <= 0 disable the context.
    context_chars: int = 800
    # Maximum number of section titles listed in the document outline.
    outline_max_sections: int = 40
|
|
32
|
+
|
|
33
|
+
|
|
27
34
|
@lru_cache(maxsize=1)
|
|
28
35
|
def get_llm_config() -> LlmConfig:
|
|
29
36
|
raw = _read_config().get("llm", {})
|
|
@@ -41,6 +48,19 @@ def get_llm_config() -> LlmConfig:
|
|
|
41
48
|
)
|
|
42
49
|
|
|
43
50
|
|
|
51
|
+
@lru_cache(maxsize=1)
def get_draft_config() -> DraftConfig:
    """Return the cached draft settings.

    Reads the ``draft`` section of the config file; a section that is not a
    mapping is treated as empty. Each value is resolved through ``_env_int``
    with its ``KB_DRAFT_*`` environment variable name and a built-in default.
    """
    section = _read_config().get("draft", {})
    settings = section if isinstance(section, dict) else {}

    max_chars = _env_int("KB_DRAFT_MAX_CHARS", settings.get("max_chars"), 3600)
    context_chars = _env_int("KB_DRAFT_CONTEXT_CHARS", settings.get("context_chars"), 800)
    outline_max_sections = _env_int(
        "KB_DRAFT_OUTLINE_MAX_SECTIONS", settings.get("outline_max_sections"), 40
    )
    return DraftConfig(
        max_chars=max_chars,
        context_chars=context_chars,
        outline_max_sections=outline_max_sections,
    )
|
|
62
|
+
|
|
63
|
+
|
|
44
64
|
def _read_config() -> Dict[str, Any]:
|
|
45
65
|
path = Path(os.environ.get("KB_INGEST_CONFIG", DEFAULT_CONFIG_PATH))
|
|
46
66
|
if not path.exists():
|
package/config/config.yaml
CHANGED
|
@@ -1,8 +1,13 @@
|
|
|
1
1
|
llm:
|
|
2
2
|
enabled: true
|
|
3
3
|
timeout_seconds: 120
|
|
4
|
-
max_tokens:
|
|
4
|
+
max_tokens: 8192
|
|
5
5
|
temperature: 0.1
|
|
6
|
-
api_key: "your-zhipu-api-key"
|
|
7
|
-
model: "
|
|
8
|
-
base_url: "https://open.bigmodel.cn/api/paas/v4/"
|
|
6
|
+
api_key: "your-zhipu-api-key"  # placeholder: supply the real key via KB_LLM_API_KEY, never commit it
|
|
7
|
+
model: "GLM-4.7-Flash"
|
|
8
|
+
base_url: "https://open.bigmodel.cn/api/paas/v4/"
|
|
9
|
+
|
|
10
|
+
draft:
|
|
11
|
+
max_chars: 3600
|
|
12
|
+
context_chars: 800
|
|
13
|
+
outline_max_sections: 40
|
package/ingest.py
CHANGED
|
@@ -5,13 +5,16 @@ import argparse
|
|
|
5
5
|
import shutil
|
|
6
6
|
import sys
|
|
7
7
|
from pathlib import Path
|
|
8
|
+
from typing import List
|
|
8
9
|
|
|
9
10
|
CURRENT_DIR = Path(__file__).resolve().parent
|
|
10
11
|
if str(CURRENT_DIR) not in sys.path:
|
|
11
12
|
sys.path.insert(0, str(CURRENT_DIR))
|
|
12
13
|
|
|
14
|
+
from app_config import get_draft_config
|
|
13
15
|
from normalizer import normalize_block
|
|
14
16
|
from parser import iter_input_files, parse_document
|
|
17
|
+
from schemas import ParsedBlock
|
|
15
18
|
from splitter import split_blocks
|
|
16
19
|
from validator import validate_dir
|
|
17
20
|
from writer import write_item
|
|
@@ -52,10 +55,17 @@ def cmd_draft(args) -> int:
|
|
|
52
55
|
output_dir.mkdir(parents=True, exist_ok=True)
|
|
53
56
|
|
|
54
57
|
total_items = 0
|
|
58
|
+
draft_config = get_draft_config()
|
|
59
|
+
max_chars = args.max_chars or draft_config.max_chars
|
|
55
60
|
files = iter_input_files(input_path)
|
|
56
61
|
for path in files:
|
|
57
62
|
parsed = parse_document(path)
|
|
58
|
-
blocks = split_blocks(parsed.blocks, max_chars=
|
|
63
|
+
blocks = split_blocks(parsed.blocks, max_chars=max_chars)
|
|
64
|
+
blocks = _attach_block_context(
|
|
65
|
+
blocks,
|
|
66
|
+
context_chars=draft_config.context_chars,
|
|
67
|
+
outline_max_sections=draft_config.outline_max_sections,
|
|
68
|
+
)
|
|
59
69
|
for block in blocks:
|
|
60
70
|
for item in normalize_block(block, status="draft"):
|
|
61
71
|
write_item(item, output_dir)
|
|
@@ -96,6 +106,67 @@ def _clear_generated_files(*dirs: Path) -> None:
|
|
|
96
106
|
print(f"deleted: {path}")
|
|
97
107
|
|
|
98
108
|
|
|
109
|
+
def _attach_block_context(
    blocks: List[ParsedBlock],
    context_chars: int,
    outline_max_sections: int,
) -> List[ParsedBlock]:
    """Return copies of ``blocks`` with a ``context`` string describing their surroundings.

    The context of each block is made of up to three parts joined by a blank
    line: the document outline (when non-empty), a summary of the previous
    block (when there is one), and a summary of the next block (when there is
    one). ``source_doc``, ``source_section``, ``content``, ``pages`` and
    ``order`` are copied from the input block unchanged.

    Args:
        blocks: Blocks in document order.
        context_chars: Total character budget shared by the two neighbour
            summaries. A value ``<= 0`` disables context and returns
            ``blocks`` itself, untouched.
        outline_max_sections: Passed through to ``_document_outline``.

    Returns:
        A new list of ``ParsedBlock`` objects, or the original list when
        context is disabled.
    """
    if context_chars <= 0:
        return blocks

    outline = _document_outline(blocks, outline_max_sections)

    # Each neighbour gets half of the budget. The floor of 1 matters because
    # _compact_context_text treats a limit <= 0 as "no limit": without it,
    # context_chars == 1 would embed the neighbour's entire content.
    neighbour_limit = max(1, context_chars // 2)

    # A block serves as "previous" for one neighbour and "next" for another,
    # so summarise every block once and reuse the result.
    summaries = [
        f"章节:{block.source_section}\n"
        f"{_compact_context_text(block.content, neighbour_limit)}"
        for block in blocks
    ]

    output: List[ParsedBlock] = []
    for idx, block in enumerate(blocks):
        parts = []
        if outline:
            parts.append(f"文档章节目录:\n{outline}")
        if idx > 0:
            parts.append("上一片段摘要:\n" + summaries[idx - 1])
        if idx + 1 < len(blocks):
            parts.append("下一片段摘要:\n" + summaries[idx + 1])
        output.append(ParsedBlock(
            source_doc=block.source_doc,
            source_section=block.source_section,
            content=block.content,
            pages=block.pages,
            order=block.order,
            context="\n\n".join(parts),
        ))
    return output
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def _document_outline(blocks: List[ParsedBlock], max_sections: int) -> str:
|
|
147
|
+
sections = []
|
|
148
|
+
seen = set()
|
|
149
|
+
for block in blocks:
|
|
150
|
+
section = block.source_section.strip()
|
|
151
|
+
if not section or section in seen:
|
|
152
|
+
continue
|
|
153
|
+
seen.add(section)
|
|
154
|
+
sections.append(f"- {section}")
|
|
155
|
+
if len(sections) >= max_sections:
|
|
156
|
+
remaining = len(blocks) - len(sections)
|
|
157
|
+
if remaining > 0:
|
|
158
|
+
sections.append(f"- ... 其余 {remaining} 个片段")
|
|
159
|
+
break
|
|
160
|
+
return "\n".join(sections)
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _compact_context_text(text: str, limit: int) -> str:
|
|
164
|
+
compact = " ".join(text.split())
|
|
165
|
+
if limit <= 0 or len(compact) <= limit:
|
|
166
|
+
return compact
|
|
167
|
+
return compact[:limit].rstrip() + "..."
|
|
168
|
+
|
|
169
|
+
|
|
99
170
|
def cmd_validate(args) -> int:
|
|
100
171
|
issues = validate_dir(Path(args.input))
|
|
101
172
|
for issue in issues:
|
|
@@ -132,7 +203,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
132
203
|
draft_cmd.add_argument("--output", default=str(CURRENT_DIR / "drafts"))
|
|
133
204
|
draft_cmd.add_argument("--approved-dir", default=str(CURRENT_DIR / "approved"))
|
|
134
205
|
draft_cmd.add_argument("--result-dir", default=str(CURRENT_DIR / "result"))
|
|
135
|
-
draft_cmd.add_argument("--max-chars", type=int, default=
|
|
206
|
+
draft_cmd.add_argument("--max-chars", type=int, default=None)
|
|
136
207
|
draft_cmd.set_defaults(func=cmd_draft)
|
|
137
208
|
|
|
138
209
|
validate_cmd = sub.add_parser("validate", help="Validate generated Markdown files.")
|
package/normalizer.py
CHANGED
|
@@ -33,7 +33,7 @@ def _normalize_with_llm(block: ParsedBlock, status: str) -> List[KnowledgeItem]:
|
|
|
33
33
|
if not (config.base_url and config.api_key and config.model):
|
|
34
34
|
_abort_llm("missing base_url, api_key, or model", block)
|
|
35
35
|
|
|
36
|
-
|
|
36
|
+
base_prompt = _build_prompt(block, status)
|
|
37
37
|
started_at = time.monotonic()
|
|
38
38
|
print(
|
|
39
39
|
"llm start: "
|
|
@@ -45,34 +45,24 @@ def _normalize_with_llm(block: ParsedBlock, status: str) -> List[KnowledgeItem]:
|
|
|
45
45
|
except ImportError as exc:
|
|
46
46
|
print(f"llm error: cannot load ZhipuAI SDK ({exc})")
|
|
47
47
|
return []
|
|
48
|
-
|
|
49
48
|
client = client_cls(api_key=config.api_key, base_url=config.base_url)
|
|
49
|
+
|
|
50
|
+
compact_retry = False
|
|
50
51
|
for attempt in range(1, LLM_MAX_RETRIES + 1):
|
|
52
|
+
prompt = _compact_retry_prompt(base_prompt) if compact_retry else base_prompt
|
|
51
53
|
try:
|
|
52
|
-
print(
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
messages=[
|
|
56
|
-
{
|
|
57
|
-
"role": "system",
|
|
58
|
-
"content": (
|
|
59
|
-
"你是严谨的运维知识库整理助手,只能依据输入原文生成结构化知识条目。"
|
|
60
|
-
"你必须只返回一个 JSON object,根节点必须只有 items 字段,"
|
|
61
|
-
"且 items 必须是数组。不要返回 Markdown、解释文字或其他根字段。"
|
|
62
|
-
),
|
|
63
|
-
},
|
|
64
|
-
{"role": "user", "content": prompt},
|
|
65
|
-
],
|
|
66
|
-
stream=False,
|
|
67
|
-
max_tokens=config.max_tokens,
|
|
68
|
-
temperature=config.temperature,
|
|
69
|
-
do_sample=False,
|
|
70
|
-
response_format={"type": "json_object"},
|
|
71
|
-
thinking={"type": "disabled", "clear_thinking": True},
|
|
54
|
+
print(
|
|
55
|
+
"llm request: "
|
|
56
|
+
f"provider=zhipu base_url={config.base_url} attempt={attempt}/{LLM_MAX_RETRIES}"
|
|
72
57
|
)
|
|
58
|
+
response = _create_zhipu_completion(client, config, prompt)
|
|
73
59
|
except Exception as exc:
|
|
74
60
|
elapsed = time.monotonic() - started_at
|
|
75
|
-
print(
|
|
61
|
+
print(
|
|
62
|
+
"llm error: "
|
|
63
|
+
f"{type(exc).__name__} attempt={attempt}/{LLM_MAX_RETRIES} "
|
|
64
|
+
f"after {elapsed:.1f}s detail={exc}"
|
|
65
|
+
)
|
|
76
66
|
if attempt >= LLM_MAX_RETRIES:
|
|
77
67
|
_abort_llm(f"request failed after {LLM_MAX_RETRIES} attempts: {type(exc).__name__}", block)
|
|
78
68
|
time.sleep(min(2 ** (attempt - 1), 30))
|
|
@@ -80,12 +70,19 @@ def _normalize_with_llm(block: ParsedBlock, status: str) -> List[KnowledgeItem]:
|
|
|
80
70
|
|
|
81
71
|
content = _extract_response_content(response)
|
|
82
72
|
elapsed = time.monotonic() - started_at
|
|
83
|
-
|
|
73
|
+
finish_reason = _finish_reason(response)
|
|
74
|
+
print(
|
|
75
|
+
"llm response: "
|
|
76
|
+
f"{len(content)} chars in {elapsed:.1f}s attempt={attempt}/{LLM_MAX_RETRIES} "
|
|
77
|
+
f"finish_reason={finish_reason or 'unknown'}"
|
|
78
|
+
)
|
|
79
|
+
if content:
|
|
80
|
+
print(f"llm response content:\n{_preview(content)}")
|
|
84
81
|
if not content.strip():
|
|
85
82
|
reasoning = _extract_reasoning_content(response)
|
|
86
83
|
print(
|
|
87
84
|
"llm parse failed: empty response content "
|
|
88
|
-
f"finish_reason={
|
|
85
|
+
f"finish_reason={finish_reason} reasoning_chars={len(reasoning)} "
|
|
89
86
|
f"response={_response_debug(response)}"
|
|
90
87
|
)
|
|
91
88
|
if attempt >= LLM_MAX_RETRIES:
|
|
@@ -95,16 +92,25 @@ def _normalize_with_llm(block: ParsedBlock, status: str) -> List[KnowledgeItem]:
|
|
|
95
92
|
|
|
96
93
|
parsed = _extract_json(content)
|
|
97
94
|
if not parsed:
|
|
98
|
-
|
|
95
|
+
compact_retry = compact_retry or _looks_truncated(content, finish_reason)
|
|
96
|
+
print(
|
|
97
|
+
"llm parse failed: response is not valid JSON "
|
|
98
|
+
f"finish_reason={finish_reason or 'unknown'} "
|
|
99
|
+
f"truncated={_looks_truncated(content, finish_reason)} "
|
|
100
|
+
f"preview={_preview(content)}"
|
|
101
|
+
)
|
|
99
102
|
if attempt >= LLM_MAX_RETRIES:
|
|
100
103
|
_abort_llm("response is not valid JSON after 10 attempts", block)
|
|
101
104
|
time.sleep(min(2 ** (attempt - 1), 30))
|
|
102
105
|
continue
|
|
103
106
|
|
|
104
|
-
raw_items =
|
|
107
|
+
raw_items = _coerce_raw_items(parsed)
|
|
105
108
|
if not isinstance(raw_items, list):
|
|
109
|
+
compact_retry = compact_retry or _looks_truncated(content, finish_reason)
|
|
106
110
|
print(
|
|
107
111
|
"llm parse failed: JSON does not contain an items list "
|
|
112
|
+
f"finish_reason={finish_reason or 'unknown'} "
|
|
113
|
+
f"truncated={_looks_truncated(content, finish_reason)} "
|
|
108
114
|
f"top_level={_json_shape(parsed)} preview={_preview(content)}"
|
|
109
115
|
)
|
|
110
116
|
if attempt >= LLM_MAX_RETRIES:
|
|
@@ -127,6 +133,43 @@ def _normalize_with_llm(block: ParsedBlock, status: str) -> List[KnowledgeItem]:
|
|
|
127
133
|
_abort_llm("model call failed", block)
|
|
128
134
|
|
|
129
135
|
|
|
136
|
+
def _system_message() -> str:
|
|
137
|
+
return (
|
|
138
|
+
"你是严谨的运维知识库整理助手,只能依据输入原文生成结构化知识条目。"
|
|
139
|
+
"你必须只返回一个 JSON object,根节点必须只有 items 字段,"
|
|
140
|
+
"且 items 必须是数组。不要返回 Markdown、解释文字或其他根字段。"
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _messages(prompt: str) -> List[Dict[str, str]]:
    """Build the two-turn chat payload: the system prompt followed by ``prompt``.

    Args:
        prompt: Text placed in the ``user`` turn as-is.

    Returns:
        A list of two ``{"role": ..., "content": ...}`` dicts, system first.
    """
    system_turn = {"role": "system", "content": _system_message()}
    user_turn = {"role": "user", "content": prompt}
    return [system_turn, user_turn]
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _compact_retry_prompt(base_prompt: str) -> str:
|
|
152
|
+
return (
|
|
153
|
+
base_prompt
|
|
154
|
+
+ "\n\n重试补充要求:上一次输出疑似过长或结构不完整。"
|
|
155
|
+
"本次必须只生成 1 个 item,保留规范要求的正文 1-7 节,但每节只写当前原文中最必要、最确定的信息。"
|
|
156
|
+
"不要省略 JSON 外层 items,不要输出多个条目,不要输出解释文字。"
|
|
157
|
+
)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _create_zhipu_completion(client, config, prompt: str):
    """Issue one non-streaming chat completion request and return the raw response.

    Args:
        client: Object exposing ``chat.completions.create``.
        config: Object providing ``model``, ``max_tokens`` and ``temperature``.
        prompt: User-turn text; it is wrapped by ``_messages``.

    Returns:
        Whatever ``client.chat.completions.create`` returns. Exceptions raised
        by the client propagate to the caller.
    """
    # Resolve the bound method first so a malformed client fails before any
    # request arguments are evaluated.
    create = client.chat.completions.create
    request = {
        "model": config.model,
        "messages": _messages(prompt),
        "stream": False,
        "max_tokens": config.max_tokens,
        "temperature": config.temperature,
        "do_sample": False,
        # Ask for a JSON object as the response body.
        "response_format": {"type": "json_object"},
        # NOTE(review): presumably turns off the model's reasoning output;
        # confirm the exact semantics against the provider documentation.
        "thinking": {"type": "disabled", "clear_thinking": True},
    }
    return create(**request)
|
|
171
|
+
|
|
172
|
+
|
|
130
173
|
def _get_zhipu_client_class():
|
|
131
174
|
try:
|
|
132
175
|
from zai import ZaiClient
|
|
@@ -207,6 +250,59 @@ def _json_shape(value) -> str:
|
|
|
207
250
|
return type(value).__name__
|
|
208
251
|
|
|
209
252
|
|
|
253
|
+
def _coerce_raw_items(parsed):
|
|
254
|
+
if isinstance(parsed, dict):
|
|
255
|
+
items = parsed.get("items")
|
|
256
|
+
if isinstance(items, list):
|
|
257
|
+
return items
|
|
258
|
+
|
|
259
|
+
for key in ("knowledge_items", "records", "data", "result", "results"):
|
|
260
|
+
value = parsed.get(key)
|
|
261
|
+
if isinstance(value, list):
|
|
262
|
+
print(f"llm parse notice: using non-standard list field '{key}' as items")
|
|
263
|
+
return value
|
|
264
|
+
if isinstance(value, dict):
|
|
265
|
+
nested = _coerce_raw_items(value)
|
|
266
|
+
if isinstance(nested, list):
|
|
267
|
+
print(f"llm parse notice: using nested field '{key}' as items")
|
|
268
|
+
return nested
|
|
269
|
+
|
|
270
|
+
if _looks_like_single_item(parsed):
|
|
271
|
+
print("llm parse notice: wrapping single item object as items[0]")
|
|
272
|
+
return [parsed]
|
|
273
|
+
|
|
274
|
+
if isinstance(parsed, list):
|
|
275
|
+
print("llm parse notice: wrapping root array as items")
|
|
276
|
+
return parsed
|
|
277
|
+
|
|
278
|
+
return None
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def _looks_like_single_item(value: Dict) -> bool:
|
|
282
|
+
required_signal = {"title", "body"}
|
|
283
|
+
item_fields = {
|
|
284
|
+
"title",
|
|
285
|
+
"doc_type",
|
|
286
|
+
"business_modules",
|
|
287
|
+
"source_version",
|
|
288
|
+
"risk_level",
|
|
289
|
+
"applicable_roles",
|
|
290
|
+
"tags",
|
|
291
|
+
"body",
|
|
292
|
+
"split_reason",
|
|
293
|
+
}
|
|
294
|
+
return required_signal.issubset(value.keys()) and len(item_fields.intersection(value.keys())) >= 4
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def _looks_truncated(content: str, finish_reason: str) -> bool:
|
|
298
|
+
if finish_reason == "length":
|
|
299
|
+
return True
|
|
300
|
+
stripped = content.strip()
|
|
301
|
+
if not stripped:
|
|
302
|
+
return False
|
|
303
|
+
return stripped.count("{") > stripped.count("}") or stripped.count("[") > stripped.count("]")
|
|
304
|
+
|
|
305
|
+
|
|
210
306
|
def _abort_llm(message: str, block: ParsedBlock) -> None:
|
|
211
307
|
print(
|
|
212
308
|
"ALERT: llm draft failed; aborting. "
|
|
@@ -230,6 +326,8 @@ def _build_prompt(block: ParsedBlock, status: str) -> str:
|
|
|
230
326
|
7. 输出严格 JSON 对象,不要 Markdown 代码围栏,不要解释文字,不要在 JSON 前后添加任何内容。
|
|
231
327
|
8. status 固定为 "{status}"。
|
|
232
328
|
9. JSON 根节点必须严格为一个对象:{{"items": [...]}}。禁止返回单个 item 对象、禁止返回纯数组、禁止返回 result/data/records/knowledge_items 等其他根字段。
|
|
329
|
+
10. 为避免输出被截断,优先生成 1 个覆盖本片段核心内容的综合 item;只有原文明确包含多个相互独立主题时,才拆分为多个 items。
|
|
330
|
+
11. “辅助上下文”只用于理解当前片段在全文中的位置、术语和前后关系;不要把辅助上下文中独有而当前原文没有的事实写成正文依据。
|
|
233
331
|
|
|
234
332
|
doc_type 只能取:
|
|
235
333
|
{", ".join(sorted(DOC_TYPES))}
|
|
@@ -258,8 +356,11 @@ doc_type 只能取:
|
|
|
258
356
|
来源章节:{block.source_section}
|
|
259
357
|
来源页码:{",".join(map(str, block.pages)) if block.pages else ""}
|
|
260
358
|
|
|
359
|
+
辅助上下文:
|
|
360
|
+
{block.context or "无"}
|
|
361
|
+
|
|
261
362
|
原文:
|
|
262
|
-
{block.content
|
|
363
|
+
{block.content}
|
|
263
364
|
""".strip()
|
|
264
365
|
|
|
265
366
|
|
|
@@ -307,7 +408,7 @@ def _json_candidates(text: str) -> List[str]:
|
|
|
307
408
|
|
|
308
409
|
|
|
309
410
|
def _preview(text: str) -> str:
|
|
310
|
-
return
|
|
411
|
+
return text.strip()
|
|
311
412
|
|
|
312
413
|
|
|
313
414
|
def _normalize_heuristically(block: ParsedBlock, status: str) -> KnowledgeItem:
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "union_kb_ingest",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.2",
|
|
4
4
|
"description": "Offline knowledge-base ingest helper for PDF, Word, Markdown and TXT documents.",
|
|
5
5
|
"bin": {
|
|
6
6
|
"union_kb_ingest": "bin/union_kb_ingest"
|
|
@@ -9,6 +9,7 @@
|
|
|
9
9
|
"README.md",
|
|
10
10
|
"requirements.txt",
|
|
11
11
|
"bin/union_kb_ingest",
|
|
12
|
+
"ArkKickidcService.java",
|
|
12
13
|
"*.py",
|
|
13
14
|
"config/config.yaml",
|
|
14
15
|
"prompts/",
|
package/requirements.txt
CHANGED
|
@@ -3,9 +3,6 @@ pyyaml>=6.0.1
|
|
|
3
3
|
zhipuai>=2.1.0
|
|
4
4
|
sniffio>=1.3.0
|
|
5
5
|
|
|
6
|
-
# Pin numpy to avoid incompatible x86-64-v2 wheels on older offline machines.
|
|
7
|
-
numpy==1.26.4
|
|
8
|
-
|
|
9
6
|
# Docling slim plus only file-format backends used by this offline tool.
|
|
10
7
|
# Do not install `docling` or `docling-slim[standard]` here: those pull OCR,
|
|
11
8
|
# layout/table ML models, torch/onnxruntime, and may try to download artifacts.
|
package/schemas.py
CHANGED
package/splitter.py
CHANGED
|
@@ -34,6 +34,7 @@ def _merge_parent_child_blocks(blocks: List[ParsedBlock]) -> List[ParsedBlock]:
|
|
|
34
34
|
content=f"{current.content.rstrip()}\n\n{block.content.strip()}",
|
|
35
35
|
pages=sorted(set(current.pages + block.pages)),
|
|
36
36
|
order=current.order,
|
|
37
|
+
context=current.context,
|
|
37
38
|
)
|
|
38
39
|
continue
|
|
39
40
|
|
|
@@ -79,6 +80,7 @@ def _split_one(block: ParsedBlock, max_chars: int) -> List[ParsedBlock]:
|
|
|
79
80
|
content=text,
|
|
80
81
|
pages=block.pages,
|
|
81
82
|
order=block.order * 100 + idx,
|
|
83
|
+
context=block.context,
|
|
82
84
|
))
|
|
83
85
|
return output
|
|
84
86
|
|