openclaw-diag-cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,651 @@
1
+ #!/usr/bin/env python3
2
+ """模块 4:Gateway 状态(进程、端口、生命周期、WS 错误码统一视图)。"""
3
+
4
+ from __future__ import annotations
5
+
6
+ import json
7
+ import os
8
+ import re
9
+ import subprocess
10
+ import sys
11
+ from collections import defaultdict
12
+ from datetime import datetime
13
+ from pathlib import Path
14
+ from typing import List, Optional
15
+
16
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
17
+
18
+ from ocdiag import cli, output, recent_logs
19
+ from ocdiag.jsonlog import get_log_subsystem, parse_log_msg
20
+
21
+
22
def run(cmd, timeout=8):
    """Run *cmd* and return ``(returncode, stdout, stderr)``.

    Both output streams are captured as text. When the command times out or
    cannot be launched at all, ``(1, "", "")`` is returned instead of raising,
    so callers can treat every failure as "no output".
    """
    try:
        completed = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout,
            check=False,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError, OSError):
        return 1, "", ""
    else:
        return completed.returncode, completed.stdout, completed.stderr
29
+
30
+
31
+ # ── 4.1: process & port ──
32
+
33
def section_process_port(out: output.Output, args, port: int) -> None:
    """Report the gateway's systemd state, processes and port health.

    Emits the ``Active:`` / ``Main PID:`` lines of the user unit, a summary of
    up to five matching processes, whether *port* is listening, and the HTTP
    status code returned by a local request. ``args`` is accepted for
    signature parity with the other sections and is not read here.
    """
    _, unit_stdout, unit_stderr = run(["systemctl", "--user", "status", "openclaw-gateway"])
    status_text = (unit_stdout or "") + (unit_stderr or "")
    if "Active:" not in status_text:
        out.item("Systemd: 未以 systemd 管理或无法获取状态")
    else:
        for status_line in status_text.splitlines():
            if "Active:" in status_line:
                out.item(f"Systemd: {status_line.strip()}")
            if "Main PID:" in status_line:
                out.item(status_line.strip())

    _, pgrep_stdout, _ = run(["pgrep", "-f", "openclaw-gatewa"])
    # Only the first five PIDs are inspected and recorded.
    pid_list = pgrep_stdout.splitlines()[:5] if pgrep_stdout else []
    if pid_list:
        ps_cmd = [
            "ps", "-p", ",".join(pid_list),
            "-o", "pid,ppid,etime,%mem,rss,args", "--no-headers",
        ]
        ps_rc, ps_stdout, _ = run(ps_cmd)
        ps_rows = ps_stdout.strip()
        if ps_rc == 0 and ps_rows:
            out.item("进程: " + " | ".join(ps_rows.splitlines()))
        out.set_data("pids", pid_list)

    _, ss_stdout, _ = run(["ss", "-tlnp", f"sport = :{port}"])
    listening = re.search(rf":{port}\b", ss_stdout) is not None

    curl_cmd = [
        "curl", "-s", "-m5", "-o", "/dev/null", "-w", "%{http_code}",
        f"http://127.0.0.1:{port}/",
    ]
    _, curl_stdout, _ = run(curl_cmd)
    # An empty curl result is reported as "000".
    gw_http = curl_stdout.strip() or "000"

    listening_label = "是" if listening else "否"
    out.item(f"端口 {port} 监听: {listening_label} | HTTP 健康检查: {gw_http}")
    out.set_data("port", port)
    out.set_data("port_listening", listening)
    out.set_data("http_health_code", gw_http)
65
+
66
+
67
+ # ── 4.2: 24h restart events ──
68
+
69
def _journal_message_text(value) -> str:
    """Return a journal ``MESSAGE`` field as text.

    ``journalctl -o json`` emits a string for ordinary messages, ``null`` for
    oversized ones and an array of byte values when the payload is not valid
    UTF-8. All shapes are normalised to ``str`` so regex matching cannot fail.
    """
    if isinstance(value, str):
        return value
    if isinstance(value, list):
        try:
            return bytes(value).decode("utf-8", errors="replace")
        except (TypeError, ValueError):
            return ""
    return ""


def _journal_timestamp(value):
    """Return ``(microseconds, formatted text)`` for ``__REALTIME_TIMESTAMP``.

    Unparseable or missing values yield ``(0, "")``.
    """
    try:
        ts_us = int(value or 0)
    except (TypeError, ValueError):
        return 0, ""
    if not ts_us:
        return 0, ""
    try:
        stamp = datetime.fromtimestamp(ts_us / 1_000_000)
    except (OverflowError, OSError, ValueError):
        return ts_us, ""
    return ts_us, stamp.strftime("%m月 %d %H:%M:%S")


def _classify_lifecycle_message(msg: str):
    """Map a journal message to a lifecycle event label, or ``None``.

    The checks are ordered; the first matching pattern wins.
    """
    if re.search(r"Started", msg, re.I):
        return "启动"
    if re.search(r"Stopped|stop", msg, re.I):
        return "停止"
    if re.search(r"SIGTERM", msg, re.I):
        return "SIGTERM"
    if re.search(r"SIGKILL", msg, re.I):
        return "SIGKILL"
    if re.search(r"Main process exited", msg, re.I):
        code_m = re.search(r"code=(\w+)", msg)
        status_m = re.search(r"status=(\d+)", msg)
        code_info = f" code={code_m.group(1)}" if code_m else ""
        status_info = f" status={status_m.group(1)}" if status_m else ""
        return f"进程退出{code_info}{status_info}"
    if re.search(r"OOM", msg, re.I):
        return "OOM"
    return None


def section_restart_events(out: output.Output) -> None:
    """Report gateway start/stop events from the last 24 hours of the journal.

    The user journal of ``openclaw-gateway`` is read twice: once as plain text
    to count starts and decide whether any lifecycle line exists, and once as
    JSON to build a de-duplicated, time-ordered event list. Every exit path
    publishes both ``restart_count_24h`` and ``restart_events`` so consumers
    of the structured output always see the same keys.
    """
    journal_cmd = [
        "journalctl", "--user", "-u", "openclaw-gateway",
        "--since", "24 hours ago", "--no-pager",
    ]
    _, raw, _ = run(journal_cmd, timeout=15)
    if not raw:
        out.item("24h 启停事件: 无 — 近 24h 无重启记录")
        out.set_data("restart_count_24h", 0)
        out.set_data("restart_events", [])
        return

    text_lines = raw.splitlines()
    lifecycle_re = re.compile(
        r"Started openclaw|Stopped openclaw|Main process exited|"
        r"SIGTERM|SIGKILL|OOM Killer",
        re.I,
    )
    started_re = re.compile(r"Started openclaw", re.I)
    has_lifecycle = any(lifecycle_re.search(ln) for ln in text_lines)
    restart_count = sum(1 for ln in text_lines if started_re.search(ln))

    if not has_lifecycle:
        out.item(f"24h 启停事件: {restart_count} 次启动 — 近 24h 无重启/停止记录")
        out.set_data("restart_count_24h", restart_count)
        out.set_data("restart_events", [])
        return

    _, json_out, _ = run(journal_cmd + ["-o", "json"], timeout=15)

    seen = set()
    results = []
    for line in (json_out or "").splitlines():
        if not line.strip():
            continue
        try:
            obj = json.loads(line)
        except ValueError:
            continue
        if not isinstance(obj, dict):
            continue
        etype = _classify_lifecycle_message(_journal_message_text(obj.get("MESSAGE")))
        if etype is None:
            continue
        ts_us, ts_str = _journal_timestamp(obj.get("__REALTIME_TIMESTAMP"))
        # Identical event types within the same second are reported once.
        key = f"{ts_str}|{etype}"
        if key in seen:
            continue
        seen.add(key)
        pid = obj.get("_PID", "")
        is_systemd = obj.get("SYSLOG_IDENTIFIER", "") == "systemd"
        if is_systemd or not pid:
            results.append((ts_us, f"[{ts_str}] {etype}"))
        else:
            results.append((ts_us, f"[{ts_str}] PID={pid} {etype}"))

    results.sort()
    event_lines = [text for _, text in results]
    out.item(f"24h 启停事件: {restart_count} 次启动")
    if event_lines:
        out.evidence("journalctl", "\n".join(event_lines))
    out.set_data("restart_count_24h", restart_count)
    out.set_data("restart_events", event_lines)
143
+
144
+
145
+ # ── 4.3: model API connectivity ──
146
+
147
def section_model_api(out: output.Output, args) -> None:
    """Probe each configured model API base URL and report its HTTP status.

    Reads the JSON config at ``args.config`` and merges the ``models.configs``
    and ``models.providers`` mappings. Each distinct base URL (the part before
    ``/v1``) is requested once with curl. Results are emitted as items and
    stored under the ``model_api`` data key.

    Returns silently when the config file is missing or unreadable. A config
    whose root, ``models`` section or ``baseURL`` has an unexpected type is
    treated as having no usable endpoint rather than raising.
    """
    if not os.path.isfile(args.config):
        return
    try:
        # JSON documents are UTF-8; do not depend on the process locale.
        with open(args.config, encoding="utf-8") as f:
            cfg = json.load(f)
    except (OSError, ValueError):
        return

    models = cfg.get("models") if isinstance(cfg, dict) else None
    if not isinstance(models, dict):
        models = {}

    all_cfgs = {}
    for section_name in ("configs", "providers"):
        section = models.get(section_name)
        if isinstance(section, dict):
            all_cfgs.update(section)

    seen_urls = set()
    api_results = []
    for entry in all_cfgs.values():
        if not isinstance(entry, dict):
            continue
        base_url = entry.get("baseURL") or entry.get("baseUrl")
        if not base_url or not isinstance(base_url, str):
            continue
        # Providers sharing a host are probed once, at the root before /v1.
        url_key = base_url.split("/v1", 1)[0].rstrip("/")
        if url_key in seen_urls:
            continue
        seen_urls.add(url_key)
        _, stdout, _ = run([
            "curl", "-s", "-m5", "-o", "/dev/null", "-w", "%{http_code}", url_key,
        ])
        api_http = stdout.strip() or "000"
        out.item(f"模型 API [{url_key}]: HTTP {api_http}")
        api_results.append({"url": url_key, "http_code": api_http})
    out.set_data("model_api", api_results)
180
+
181
+
182
+ # ── 4.4: WS lifecycle analysis ──
183
+
184
# Log subsystems whose entries are considered by the WS lifecycle analysis
# (subsystems under "gateway/channels/" are accepted separately by prefix).
VALID_SUBSYSTEMS = (
    "feishu/core/lark-client",
    "feishu/channel/monitor",
    "gateway/health-monitor",
)
185
+
186
+
187
def parse_ws_ts(ts_str):
    """Parse an ISO-8601 timestamp string into a ``datetime``.

    A ``Z`` suffix is rewritten to ``+00:00`` before parsing. Returns ``None``
    for empty input or for anything that cannot be parsed.
    """
    if ts_str:
        try:
            iso_text = ts_str.replace("Z", "+00:00")
            return datetime.fromisoformat(iso_text)
        except Exception:
            return None
    return None
194
+
195
+
196
def extract_account(msg: str) -> str:
    """Extract the account identifier mentioned in a log message.

    Three notations are tried in order: ``feishu[<id>]``, ``accountId=<id>``
    and ``account <id>``. The first one found wins; an empty string is
    returned when none is present.
    """
    account_patterns = (
        r"feishu\[([^\]]+)\]",
        r"accountId=([A-Za-z0-9_.-]+)",
        r"account\s+([A-Za-z0-9_.-]+)",
    )
    for pattern in account_patterns:
        found = re.search(pattern, msg)
        if found:
            return found.group(1)
    return ""
207
+
208
+
209
def section_ws_lifecycle(out: output.Output, app_log: str) -> None:
    """Summarise the feishu WebSocket channel lifecycle recorded in *app_log*.

    The log is scanned line by line. JSON entries from the accepted
    subsystems are classified into lifecycle events (init, start, ready,
    stop, error, reconnect); plain-text lines are only inspected for
    "expired ... discard" notices. Events are grouped per account and split
    into segments wherever two consecutive events are more than 60 seconds
    apart. The summary is emitted as evidence text plus a ``ws_summary``
    data record.

    Nothing is emitted when *app_log* is empty or not a file. A single item
    is emitted when the file cannot be read or holds no relevant event.
    """
    if not app_log or not os.path.isfile(app_log):
        return
    # Cheap prefilter applied to the raw line before any JSON parsing. It
    # must admit every phrase the classifier below recognises, including the
    # "closed before connect" / "connection lost" error wordings.
    keyword_re = re.compile(
        r"ws ready|WS ready|websocket|ws error|ws close|ws reconnect|"
        r"health.monitor|channel.*connect|expired.*discard|starting.*WebSocket|"
        r"event-dispatch is ready|stopping feishu|stopped feishu|starting feishu|"
        r"disconnecting WebSocket|closed before connect|connection lost",
        re.IGNORECASE,
    )
    events = []      # (datetime | None, "HH:MM:SS", account, kind, message)
    expired = []     # (timestamp text, message excerpt)
    ws_errors = []   # ("HH:MM:SS", account, detail)
    health_count = 0
    try:
        with open(app_log, errors="replace") as f:
            for raw in f:
                if not keyword_re.search(raw):
                    continue
                raw = raw.strip()
                if not raw:
                    continue
                try:
                    obj = json.loads(raw)
                except (ValueError, RecursionError):
                    obj = None
                if not isinstance(obj, dict):
                    # Plain-text (or non-object JSON) line: only the
                    # expired/discard notice is of interest.
                    low = raw.lower()
                    if "expired" in low and "discard" in low:
                        m = re.match(r"\[?(\d{4}-\d{2}-\d{2}T?\d{2}:\d{2}:\d{2})", raw)
                        expired.append((m.group(1) if m else "", raw[:200]))
                    continue
                ts_raw = obj.get("time", "")
                if not isinstance(ts_raw, str):
                    ts_raw = ""
                ts_dt = parse_ws_ts(ts_raw)
                # Characters 11..18 of an ISO timestamp are HH:MM:SS.
                ts_str = ts_raw[11:19] if ts_raw else ""
                sub = get_log_subsystem(obj)
                msg = parse_log_msg(obj)
                low = msg.lower()
                is_valid_subsystem = (
                    sub in VALID_SUBSYSTEMS or sub.startswith("gateway/channels/")
                )
                is_event_dispatch_ready = (not sub) and "event-dispatch is ready" in low
                if not is_valid_subsystem and not is_event_dispatch_ready:
                    continue
                if "health-monitor" in sub or "health-monitor" in low:
                    health_count += 1
                    continue
                account = extract_account(msg)
                if "expired" in low and "discard" in low:
                    expired.append((ts_str, msg[:200]))
                    continue
                if "event-dispatch is ready" in low:
                    kind = "ready"
                elif re.search(r"starting feishu\[[^\]]+\]\s*\(mode:\s*websocket\)", msg, re.I):
                    kind = "init"
                elif re.search(r"feishu\[[^\]]+\]:\s*starting WebSocket connection", msg, re.I):
                    kind = "start_ws"
                elif re.search(r"feishu\[[^\]]+\]:\s*WebSocket client started", msg, re.I):
                    kind = "client_up"
                elif re.search(r"websocket started for account", msg, re.I):
                    kind = "monitor_up"
                elif (re.search(r"^stopping feishu\[[^\]]+\]\s*$", msg.strip(), re.I) or
                      re.search(r"\|\s*stopping feishu\[", msg, re.I)):
                    kind = "stopping"
                elif (re.search(r"^stopped feishu\[[^\]]+\]\s*$", msg.strip(), re.I) or
                      re.search(r"\|\s*stopped feishu\[", msg, re.I)):
                    kind = "stopped"
                elif "disconnecting websocket" in low:
                    kind = "disconnecting"
                elif any(x in low for x in ["ws close", "ws error", "closed before connect", "connection lost"]):
                    kind = "ws_error"
                    code_m = re.search(r"code[=: ]+(\d+)", msg)
                    reason_m = re.search(r"reason[=: ]+([^\s,)]+)", msg)
                    detail = []
                    if code_m:
                        detail.append(f"code={code_m.group(1)}")
                    if reason_m:
                        detail.append(f"reason={reason_m.group(1)}")
                    ws_errors.append((ts_str, account or "?", " ".join(detail) or msg[:120]))
                elif "reconnect" in low and "websocket" in low:
                    kind = "reconnect"
                else:
                    continue
                events.append((ts_dt, ts_str, account, kind, msg))
    except OSError:
        out.item("Channel WS: 读取应用日志失败")
        return

    if not events and not expired:
        out.item("Channel WS: 今日无 WS 相关事件记录")
        return

    # "ready" events carry no account; they are attributed further below.
    by_account = defaultdict(list)
    ready_events = []
    for e in events:
        if e[3] == "ready":
            ready_events.append(e)
        elif e[2]:
            by_account[e[2]].append(e)
        else:
            by_account["?"].append(e)

    # Events without a timestamp sort first. The leading boolean keeps None
    # from ever being compared with a datetime, and avoids substituting a
    # naive sentinel that cannot be compared with timezone-aware values.
    ready_events.sort(key=lambda ev: (ev[0] is not None, ev[0]))
    candidates = []  # [init time, account, already matched]
    for acc, evs in by_account.items():
        if acc == "?":
            continue
        for ev in evs:
            if ev[3] == "init" and ev[0] is not None:
                candidates.append([ev[0], acc, False])

    # Attribute each "ready" to the closest unmatched "init" that happened at
    # most 30 seconds earlier; otherwise file it under the unknown account.
    for r in ready_events:
        r_ts = r[0]
        if r_ts is None:
            by_account["?"].append(r)
            continue
        best = None
        best_dt = 999999
        for c in candidates:
            if c[2]:
                continue
            try:
                delta = (r_ts - c[0]).total_seconds()
            except (TypeError, OverflowError):
                continue
            if 0 <= delta <= 30 and delta < best_dt:
                best_dt = delta
                best = c
        if best is not None:
            best[2] = True
            by_account[best[1]].append(r)
        else:
            by_account["?"].append(r)

    for acc in by_account:
        by_account[acc].sort(key=lambda ev: (ev[0] is not None, ev[0], ev[1]))

    cycle_summaries = []  # (start "HH:MM:SS", account, label, ok flag)
    per_account_cycles = defaultdict(int)

    def flush(acc, cur_list):
        """Turn one segment of consecutive events into a summary row."""
        if not cur_list:
            return
        kinds = [k for _, _, _, k, _ in cur_list]
        t0 = cur_list[0][1]
        if "ready" in kinds:
            try:
                s_dt = next(e[0] for e in cur_list if e[3] in ("init", "start_ws"))
                r_dt = next(e[0] for e in cur_list if e[3] == "ready")
                dur = (r_dt - s_dt).total_seconds() if (s_dt and r_dt) else None
            except StopIteration:
                dur = None
            if any(k in kinds for k in ("stopping", "disconnecting")) and any(
                k in kinds for k in ("init", "start_ws")
            ):
                label = "重连→就绪"
            elif "init" in kinds or "start_ws" in kinds:
                label = "建连→就绪"
            else:
                label = "就绪"
            extra = f" (耗时 {dur:.1f}s)" if dur is not None and dur >= 0 else ""
            cycle_summaries.append((t0, acc, f"{label}{extra}", True))
        elif any(k in kinds for k in ("init", "start_ws", "client_up", "monitor_up")):
            cycle_summaries.append((t0, acc, "建连(未见 ready)", False))
        elif any(k in kinds for k in ("stopping", "stopped", "disconnecting")):
            cycle_summaries.append((t0, acc, "停止", None))
        elif "ws_error" in kinds:
            cycle_summaries.append((t0, acc, "错误", False))
        per_account_cycles[acc] += 1

    # Split each account's events into segments at gaps longer than 60 s.
    for acc, evs in by_account.items():
        if acc == "?":
            continue
        cur = []
        for e in evs:
            if not cur:
                cur.append(e)
                continue
            prev_dt = cur[-1][0]
            cur_dt = e[0]
            try:
                gap = (cur_dt - prev_dt).total_seconds() if (prev_dt and cur_dt) else 0
            except (TypeError, OverflowError):
                gap = 0
            if gap > 60:
                flush(acc, cur)
                cur = [e]
            else:
                cur.append(e)
        flush(acc, cur)

    cycle_summaries.sort(key=lambda row: row[0])

    total_ready = sum(1 for _, _, _, k, _ in events if k == "ready")
    total_attempts = sum(1 for _, _, _, k, _ in events if k == "init")
    total_stops = sum(1 for _, _, _, k, _ in events if k == "stopping")
    total_errors = len(ws_errors)

    def attempt_times(evs):
        """Return sorted connection-attempt times ("init", else "start_ws")."""
        ts_list = [e[0] for e in evs if e[3] == "init" and e[0] is not None]
        if not ts_list:
            ts_list = [e[0] for e in evs if e[3] == "start_ws" and e[0] is not None]
        ts_list.sort()
        return ts_list

    # Flag the first 5-minute window per account holding 3+ attempts.
    freq_flags = []
    for acc, evs in by_account.items():
        if acc == "?":
            continue
        attempts = attempt_times(evs)
        for i, first in enumerate(attempts):
            window = [t for t in attempts[i:] if (t - first).total_seconds() <= 300]
            if len(window) >= 3:
                freq_flags.append((acc, first.strftime("%H:%M:%S"), len(window)))
                break

    intervals = []
    for acc, evs in by_account.items():
        if acc == "?":
            continue
        attempts = attempt_times(evs)
        for a, b in zip(attempts, attempts[1:]):
            intervals.append((b - a).total_seconds())
    avg_interval = sum(intervals) / len(intervals) if intervals else None

    body = []
    body.append(f"概览: {total_attempts} 次建连尝试, {total_ready} 次就绪, {total_stops} 次停止/断开, {total_errors} 次错误")
    per_acc_str = ", ".join(f"{a}={n}" for a, n in sorted(per_account_cycles.items()))
    if per_acc_str:
        body.append(f"各账号生命周期片段数: {per_acc_str}")
    if avg_interval is not None:
        body.append(f"平均重连间隔: {avg_interval:.1f}s")
    if freq_flags:
        for acc, t0, n in freq_flags:
            body.append(f"频繁重连: {acc} 在 {t0} 起 5 分钟内建连 {n} 次")
    if cycle_summaries:
        body.append("")
        body.append("生命周期时间线:")
        for ts_str, acc, summary, _ok in cycle_summaries:
            body.append(f"  [{ts_str}] feishu[{acc}]: {summary}")
    if ws_errors:
        body.append("")
        body.append(f"WS 错误明细: {len(ws_errors)} 条")
        for ts_str, acc, detail in ws_errors[:10]:
            body.append(f"  [{ts_str}] feishu[{acc}]: {detail}")
        if len(ws_errors) > 10:
            body.append(f"  ... 共 {len(ws_errors)} 条")
    if expired:
        body.append("")
        body.append(f"过期丢弃: {len(expired)} 条消息")
        for ts_str, msg in expired[:10]:
            body.append(f"  [{ts_str}] {msg}")
        if len(expired) > 10:
            body.append(f"  ... 共 {len(expired)} 条")
    if health_count > 0:
        body.append("")
        body.append(f"health-monitor: {health_count} 条心跳记录(未展开)")

    if body:
        out.item("Channel WS 状态 — WebSocket 消息通道的连接、断连和消息丢失记录:")
        out.evidence("应用日志", "\n".join(body))
    out.set_data("ws_summary", {
        "attempts": total_attempts,
        "ready": total_ready,
        "stops": total_stops,
        "errors": total_errors,
        "avg_interval_s": avg_interval,
        "freq_reconnect": [{"account": a, "from": t, "count": n} for a, t, n in freq_flags],
        "expired_count": len(expired),
        "health_count": health_count,
    })
480
+
481
+
482
+ # ── 4.5: gateway error codes (auth + WS close) ──
483
+
484
# Lookup table for explain(): (kind, code, reason regex, explanation).
# Rows are tried top to bottom and the first match wins, so specific reason
# patterns must stay above the catch-all r".*" row of the same kind/code.
KNOWN_REASONS = [
    # HTTP authentication failures
    ("HTTP", 401, r"trusted_proxy_user_missing", "可信代理用户缺失,auth.mode=trustedProxy 时需在请求头设置 X-Remote-User"),
    ("HTTP", 401, r"unauthorized", "内部 API 认证失败,通常是 auth.mode 配置问题"),
    ("HTTP", 401, r".*", "内部 API 认证失败"),
    # WebSocket close code 1008, distinguished by reason text
    ("WS", 1008, r"pairing required", "节点未配对,需要执行 openclaw devices list 并批准配对请求"),
    ("WS", 1008, r"not_paired", "节点未配对"),
    ("WS", 1008, r"slow consumer", "客户端消费消息太慢,被服务端主动断开"),
    ("WS", 1008, r"connect challenge missing", "连接握手缺少 nonce,认证流程异常"),
    ("WS", 1008, r"connect challenge timeout", "连接握手超时,可能是网络延迟或服务端未响应"),
    ("WS", 1008, r"connect failed", "连接认证失败,可能是密钥/证书不匹配"),
    ("WS", 1008, r"Missing callSid", "语音通话缺少 callSid(Twilio 集成问题)"),
    ("WS", 1008, r"Unknown call", "未知的语音通话 ID"),
    ("WS", 1008, r"Start timeout", "语音会话启动超时"),
    # WebSocket close codes explained regardless of reason
    ("WS", 1006, r".*", "连接异常断开,未收到 close frame — 通常是网络中断、进程崩溃或超时"),
    ("WS", 1001, r".*", "端点正在离开(服务器关闭或客户端断开)"),
    ("WS", 1011, r".*", "服务器内部错误导致关闭"),
    ("WS", 1012, r".*", "服务器正在重启"),
    ("WS", 1013, r".*", "服务器暂时不可用,请稍后重试"),
    ("WS", 1000, r".*", "正常关闭"),
]


def explain(kind, code, reason):
    """Return the explanation for an error, or ``None`` if it is unknown.

    The first ``KNOWN_REASONS`` row with the same *kind* and *code* whose
    pattern is found (case-insensitively) in *reason* is used. A falsy
    *reason* is matched as the empty string.
    """
    reason_text = reason or ""
    matches = (
        explanation
        for row_kind, row_code, pattern, explanation in KNOWN_REASONS
        if row_kind == kind
        and row_code == code
        and re.search(pattern, reason_text, re.I)
    )
    return next(matches, None)
511
+
512
+
513
def classify_err(msg: str):
    """Classify a log message as a gateway error.

    Returns a ``(kind, code, reason)`` tuple, or ``None`` when the message is
    not a recognised error. Authentication markers are checked first and map
    to ``("HTTP", 401, ...)``. Otherwise a WebSocket close code is taken from
    ``gateway closed (<code>): <reason>`` or from a ``code=<4 digits>``
    fragment. Closures without any code are reported as 1006.
    """
    lowered = msg.lower()
    for auth_marker in ("trusted_proxy_user_missing", "unauthorized"):
        if auth_marker in lowered:
            return ("HTTP", 401, auth_marker)

    gateway_closed = re.search(r"gateway closed\s*\((\d+)\)\s*:\s*(.*)", msg, re.I)
    if gateway_closed is not None:
        code_text, reason_text = gateway_closed.groups()
        return ("WS", int(code_text), reason_text.strip())

    coded_close = re.search(r"(?:ws close|ws error|closed).*?code[=: ]*(\d{4})", msg, re.I)
    if coded_close is not None:
        close_code = int(coded_close.group(1))
        reason_match = re.search(r"reason[=: ]*([^\s,)]+.*?)(?:\s*$|\s*[,|)])", msg, re.I)
        if reason_match is None:
            return ("WS", close_code, "")
        return ("WS", close_code, reason_match.group(1).strip())

    if re.search(r"closed before connect|abnormal clos", msg, re.I):
        return ("WS", 1006, "abnormal closure")
    return None
530
+
531
+
532
# Subsystem prefixes whose log entries are scanned for gateway error codes.
VALID_GATEWAY_PREFIXES = (
    "gateway/",
    "feishu/core/lark-client",
    "feishu/channel/",
)
533
+
534
+
535
def section_gateway_errors(out: output.Output, app_log: str) -> None:
    """Report authentication failures and WS close codes found in *app_log*.

    Only JSON object entries whose subsystem starts with one of
    ``VALID_GATEWAY_PREFIXES`` are considered. Each is classified with
    ``classify_err`` and the results are grouped by ``(kind, code, reason)``,
    most frequent first, with the five most recent timestamps per group.
    The grouped view is emitted as evidence text and stored under the
    ``gateway_errors`` data key.

    Nothing is emitted when *app_log* is empty or not a file. A read failure
    is reported as a single item.
    """
    if not app_log or not os.path.isfile(app_log):
        return
    # Cheap prefilter on the raw line; it has to admit every wording that
    # classify_err recognises, including "abnormal clos(ure)".
    keyword_re = re.compile(
        r"unauthorized|trusted_proxy_user_missing|gateway closed|ws close|ws error|"
        r"closed before|abnormal clos",
        re.IGNORECASE,
    )
    prefixes = tuple(VALID_GATEWAY_PREFIXES)
    events = []  # (timestamp text, kind, code, reason)
    try:
        with open(app_log, errors="replace") as f:
            for raw in f:
                if not keyword_re.search(raw):
                    continue
                raw = raw.strip()
                if not raw:
                    continue
                try:
                    obj = json.loads(raw)
                except (ValueError, RecursionError):
                    continue
                if not isinstance(obj, dict):
                    continue
                sub = get_log_subsystem(obj)
                if not sub or not sub.startswith(prefixes):
                    continue
                msg = parse_log_msg(obj)
                ts_raw = obj.get("time", "")
                # Keep date and time up to seconds; tolerate odd field types.
                ts = ts_raw[:19] if isinstance(ts_raw, str) else ""
                classified = classify_err(msg)
                if classified is None:
                    continue
                kind, code, reason = classified
                events.append((ts, kind, code, reason or "(no reason)"))
    except OSError:
        out.item("Gateway 错误码: 读取应用日志失败")
        return

    if not events:
        out.item("Gateway 错误码: 0 条")
        out.set_data("gateway_errors", {"total": 0})
        return

    total = len(events)
    auth_count = sum(1 for e in events if e[1] == "HTTP")
    ws_count = sum(1 for e in events if e[1] == "WS")
    body = [f"共 {total} 条(认证 {auth_count} 条, WS 关闭 {ws_count} 条)", ""]

    combo = defaultdict(lambda: {"count": 0, "timeline": []})
    for ts, kind, code, reason in events:
        bucket = combo[(kind, code, reason)]
        bucket["count"] += 1
        bucket["timeline"].append(ts)

    structured = []
    for (kind, code, reason), data in sorted(combo.items(), key=lambda x: -x[1]["count"]):
        count = data["count"]
        timeline = data["timeline"]
        expl = explain(kind, code, reason)
        body.append(f"{kind} {code}: {reason} ({count} 次)")
        if expl:
            body.append(f"  {expl}")
        # Entries are in file order, so the tail holds the latest ones.
        for t in timeline[-5:]:
            body.append(f"    [{t}]")
        if len(timeline) > 5:
            body.append(f"    ... 共 {len(timeline)} 条,仅显示最近 5 条")
        body.append("")
        structured.append({
            "kind": kind, "code": code, "reason": reason,
            "count": count, "explanation": expl,
        })

    out.item("Gateway 错误码 — 认证 401 + WS 关闭码统一视图:")
    out.evidence("应用日志", "\n".join(body))
    out.set_data("gateway_errors", {
        "total": total,
        "auth_count": auth_count,
        "ws_count": ws_count,
        "by_reason": structured,
    })
612
+
613
+
614
def main() -> int:
    """Collect and emit the gateway status report; return the exit code.

    The gateway port defaults to 18789 and is overridden by ``gateway.port``
    from the config file when that file can be read and the value is usable.
    The log-based sections run only when a recent application log is found.
    """
    args = cli.build_common_parser(
        description="模块 4:Gateway 状态采集",
        prog="04_gateway",
    ).parse_args()

    out = output.init("gateway", json_mode=args.json, no_color=args.no_color)
    out.section("模块 4:Gateway 状态")

    port = 18789
    if os.path.isfile(args.config):
        # Best effort: any problem with the config keeps the default port.
        try:
            with open(args.config) as f:
                configured_port = json.load(f).get("gateway", {}).get("port")
            if configured_port:
                port = int(configured_port)
        except Exception:
            pass

    section_process_port(out, args, port)
    section_restart_events(out)
    section_model_api(out, args)

    app_log = recent_logs.latest_app_log(args.log_dir)
    if not app_log:
        out.item("Channel WS: 未找到应用日志文件")
        out.item("Gateway 错误码: 未找到应用日志文件")
    else:
        section_ws_lifecycle(out, app_log)
        section_gateway_errors(out, app_log)

    return out.done()


if __name__ == "__main__":
    sys.exit(main())