openclaw-diag-cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +260 -0
- package/bin/ocdiag +14 -0
- package/bin/openclaw-diag.js +275 -0
- package/diag/01_sys_health.py +443 -0
- package/diag/02_environment.py +292 -0
- package/diag/03_configuration.py +131 -0
- package/diag/04_gateway.py +651 -0
- package/diag/05_recent_errors.py +246 -0
- package/diag/06_cron_jobs.py +694 -0
- package/diag/07_performance.py +687 -0
- package/diag/08_sessions.py +518 -0
- package/diag/09_plugin_diag.py +535 -0
- package/diag/10_shell_history.py +121 -0
- package/diag/__init__.py +0 -0
- package/lib/bundle.py +204 -0
- package/ocdiag/__init__.py +3 -0
- package/ocdiag/cli.py +39 -0
- package/ocdiag/dispatcher.py +137 -0
- package/ocdiag/jsonlog.py +65 -0
- package/ocdiag/output.py +131 -0
- package/ocdiag/paths.py +48 -0
- package/ocdiag/recent_logs.py +53 -0
- package/ocdiag/sensitive.py +41 -0
- package/ocdiag/timeutil.py +77 -0
- package/ocdiag/tokens.py +46 -0
- package/package.json +42 -0
- package/tools/__init__.py +0 -0
- package/tools/oc_session_extract.py +254 -0
- package/tools/oc_session_trace.py +715 -0
|
@@ -0,0 +1,651 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""模块 4:Gateway 状态(进程、端口、生命周期、WS 错误码统一视图)。"""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import json
|
|
7
|
+
import os
|
|
8
|
+
import re
|
|
9
|
+
import subprocess
|
|
10
|
+
import sys
|
|
11
|
+
from collections import defaultdict
|
|
12
|
+
from datetime import datetime
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import List, Optional
|
|
15
|
+
|
|
16
|
+
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
17
|
+
|
|
18
|
+
from ocdiag import cli, output, recent_logs
|
|
19
|
+
from ocdiag.jsonlog import get_log_subsystem, parse_log_msg
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def run(cmd, timeout=8):
    """Run an external command and capture its output without raising.

    Args:
        cmd: Argument vector handed to ``subprocess.run`` (no shell involved).
        timeout: Seconds to wait before the command is abandoned.

    Returns:
        A ``(returncode, stdout, stderr)`` tuple. When the command cannot be
        started or exceeds ``timeout`` the result is ``(1, "", "")``.
    """
    try:
        # errors="replace": diagnostic tools may emit bytes that are invalid in
        # the locale encoding. Strict decoding would raise UnicodeDecodeError,
        # which is not an OSError and would abort the whole module.
        proc = subprocess.run(cmd, capture_output=True, text=True,
                              errors="replace", timeout=timeout, check=False)
    except (subprocess.TimeoutExpired, OSError):
        # FileNotFoundError is a subclass of OSError, so it is covered here.
        return 1, "", ""
    return proc.returncode, proc.stdout, proc.stderr
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# ── 4.1: process & port ──
|
|
32
|
+
|
|
33
|
+
def section_process_port(out: output.Output, args, port: int) -> None:
    """Report systemd state, gateway processes and the listening port.

    Args:
        out: Output collector receiving human-readable items and data fields.
        args: Parsed CLI arguments (not read by this section).
        port: TCP port the gateway is expected to listen on.
    """
    # systemd unit status: only the "Active:" and "Main PID:" lines are shown.
    _, svc_out, svc_err = run(["systemctl", "--user", "status", "openclaw-gateway"])
    status_lines = ((svc_out or "") + (svc_err or "")).splitlines()
    if any("Active:" in line for line in status_lines):
        for line in status_lines:
            text = line.strip()
            if "Active:" in line:
                out.item(f"Systemd: {text}")
            if "Main PID:" in line:
                out.item(text)
    else:
        out.item("Systemd: 未以 systemd 管理或无法获取状态")

    # Matching processes (at most five), described through ps.
    _, pgrep_out, _ = run(["pgrep", "-f", "openclaw-gatewa"])
    pid_list = pgrep_out.splitlines()[:5] if pgrep_out else []
    if pid_list:
        ps_rc, ps_out, _ = run([
            "ps", "-p", ",".join(pid_list),
            "-o", "pid,ppid,etime,%mem,rss,args", "--no-headers",
        ])
        ps_text = ps_out.strip()
        if ps_rc == 0 and ps_text:
            out.item("进程: " + " | ".join(ps_text.splitlines()))
        out.set_data("pids", pid_list)

    # Listening socket check followed by a plain HTTP probe on loopback.
    _, ss_out, _ = run(["ss", "-tlnp", f"sport = :{port}"])
    listening = bool(re.search(rf":{port}\b", ss_out))
    _, http_out, _ = run([
        "curl", "-s", "-m5", "-o", "/dev/null", "-w", "%{http_code}",
        f"http://127.0.0.1:{port}/",
    ])
    gw_http = http_out.strip() or "000"  # "000" when curl produced no status code
    listen_label = "是" if listening else "否"
    out.item(f"端口 {port} 监听: {listen_label} | HTTP 健康检查: {gw_http}")
    out.set_data("port", port)
    out.set_data("port_listening", listening)
    out.set_data("http_health_code", gw_http)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# ── 4.2: 24h restart events ──
|
|
68
|
+
|
|
69
|
+
def _journal_message(value) -> str:
    """Return a journal ``MESSAGE`` field as text.

    ``journalctl -o json`` emits a string for ordinary messages, ``null`` for
    oversized fields and an array of byte values for non-UTF-8 payloads.
    Anything that cannot be turned into text yields ``""``.
    """
    if isinstance(value, str):
        return value
    if isinstance(value, list):
        try:
            return bytes(value).decode("utf-8", errors="replace")
        except (TypeError, ValueError):
            return ""
    return ""


def section_restart_events(out: output.Output) -> None:
    """Report gateway start/stop events from the last 24 hours of the user journal.

    The plain-text journal is scanned first to count starts and to decide
    whether any lifecycle line exists at all; only then is the JSON journal
    fetched to build a de-duplicated, time-ordered event list.

    Args:
        out: Output collector. Receives the ``restart_count_24h`` and
            ``restart_events`` data fields on every path.
    """
    journal_cmd = [
        "journalctl", "--user", "-u", "openclaw-gateway",
        "--since", "24 hours ago", "--no-pager",
    ]
    _, raw, _ = run(journal_cmd, timeout=15)
    if not raw:
        out.item("24h 启停事件: 无 — 近 24h 无重启记录")
        out.set_data("restart_count_24h", 0)
        out.set_data("restart_events", [])
        return

    text_lines = raw.splitlines()
    lifecycle_re = re.compile(
        r"Started openclaw|Stopped openclaw|Main process exited|"
        r"SIGTERM|SIGKILL|OOM Killer",
        re.I,
    )
    has_lifecycle = any(lifecycle_re.search(ln) for ln in text_lines)
    restart_count = sum(1 for ln in text_lines
                        if re.search(r"Started openclaw", ln, re.I))

    if not has_lifecycle:
        out.item(f"24h 启停事件: {restart_count} 次启动 — 近 24h 无重启/停止记录")
        out.set_data("restart_count_24h", restart_count)
        out.set_data("restart_events", [])
        return

    _, json_out, _ = run(journal_cmd + ["-o", "json"], timeout=15)

    seen = set()
    results = []
    for line in (json_out or "").splitlines():
        if not line.strip():
            continue
        try:
            obj = json.loads(line)
        except (ValueError, RecursionError):
            continue
        if not isinstance(obj, dict):
            continue
        msg = _journal_message(obj.get("MESSAGE"))
        try:
            ts_us = int(obj.get("__REALTIME_TIMESTAMP", 0) or 0)
        except (TypeError, ValueError):
            ts_us = 0
        ts_str = ""
        if ts_us:
            try:
                # Microseconds since the epoch, rendered in local time.
                ts_str = datetime.fromtimestamp(ts_us / 1_000_000).strftime("%m月 %d %H:%M:%S")
            except (OverflowError, OSError, ValueError):
                ts_str = ""
        pid = obj.get("_PID", "")
        is_systemd = obj.get("SYSLOG_IDENTIFIER", "") == "systemd"

        # NOTE(review): these patterns are looser than lifecycle_re above
        # (any message containing "started" or "stop" qualifies) — confirm
        # whether application log lines are meant to be listed here.
        if re.search(r"Started", msg, re.I):
            etype = "启动"
        elif re.search(r"Stopped|stop", msg, re.I):
            etype = "停止"
        elif re.search(r"SIGTERM", msg, re.I):
            etype = "SIGTERM"
        elif re.search(r"SIGKILL", msg, re.I):
            etype = "SIGKILL"
        elif re.search(r"Main process exited", msg, re.I):
            code_m = re.search(r"code=(\w+)", msg)
            status_m = re.search(r"status=(\d+)", msg)
            code_info = f" code={code_m.group(1)}" if code_m else ""
            status_info = f" status={status_m.group(1)}" if status_m else ""
            etype = f"进程退出{code_info}{status_info}"
        elif re.search(r"OOM", msg, re.I):
            etype = "OOM"
        else:
            continue

        # De-duplicate events of the same type within the same second.
        key = f"{ts_str}|{etype}"
        if key in seen:
            continue
        seen.add(key)
        if is_systemd or not pid:
            results.append((ts_us, f"[{ts_str}] {etype}"))
        else:
            results.append((ts_us, f"[{ts_str}] PID={pid} {etype}"))

    results.sort()
    event_lines = [text for _, text in results]
    out.item(f"24h 启停事件: {restart_count} 次启动")
    if event_lines:
        out.evidence("journalctl", "\n".join(event_lines))
    out.set_data("restart_count_24h", restart_count)
    out.set_data("restart_events", event_lines)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
# ── 4.3: model API connectivity ──
|
|
146
|
+
|
|
147
|
+
def section_model_api(out: output.Output, args) -> None:
    """Probe each distinct model API base URL found in the config file.

    Base URLs are read from ``models.configs`` and ``models.providers``
    (``baseURL`` or ``baseUrl``), cut at the first ``/v1`` and stripped of
    trailing slashes, then probed once each with curl.

    Args:
        out: Output collector; receives one item per URL and the
            ``model_api`` data field.
        args: Parsed CLI arguments; only ``args.config`` is read.

    Returns:
        None. The function returns without output when the config file is
        missing, unreadable, not valid JSON, or not a JSON object.
    """
    if not os.path.isfile(args.config):
        return
    try:
        with open(args.config) as f:
            cfg = json.load(f)
    except (OSError, ValueError, RecursionError):
        return
    # Valid JSON is not necessarily an object; .get() on a list would raise.
    if not isinstance(cfg, dict):
        return

    models = cfg.get("models")
    if not isinstance(models, dict):
        models = {}

    all_cfgs = {}
    for section in ("configs", "providers"):
        entries = models.get(section)
        if isinstance(entries, dict):
            all_cfgs.update(entries)

    seen_urls = set()
    api_results = []
    for entry in all_cfgs.values():
        if not isinstance(entry, dict):
            continue
        base_url = entry.get("baseURL") or entry.get("baseUrl")
        # Skip missing or non-string values instead of failing on .split().
        if not base_url or not isinstance(base_url, str):
            continue
        url_key = base_url.split("/v1", 1)[0].rstrip("/")
        if url_key in seen_urls:
            continue
        seen_urls.add(url_key)
        _, stdout, _ = run([
            "curl", "-s", "-m5", "-o", "/dev/null", "-w", "%{http_code}", url_key,
        ])
        api_http = stdout.strip() or "000"  # "000" when curl produced no status code
        out.item(f"模型 API [{url_key}]: HTTP {api_http}")
        api_results.append({"url": url_key, "http_code": api_http})
    out.set_data("model_api", api_results)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
# ── 4.4: WS lifecycle analysis ──
|
|
183
|
+
|
|
184
|
+
# Log subsystems that section_ws_lifecycle accepts by exact match; it also
# accepts any subsystem starting with "gateway/channels/".
VALID_SUBSYSTEMS = ("feishu/core/lark-client", "feishu/channel/monitor", "gateway/health-monitor")
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def parse_ws_ts(ts_str):
    """Parse an ISO-8601 log timestamp into a ``datetime``.

    Every ``Z`` in the input is replaced by ``+00:00`` before parsing, so a
    trailing ``Z`` yields a UTC-aware value. Input without an offset yields a
    naive value.

    Returns:
        The parsed ``datetime``, or ``None`` for empty or unparsable input.
    """
    if ts_str:
        try:
            normalized = ts_str.replace("Z", "+00:00")
            return datetime.fromisoformat(normalized)
        except Exception:
            # Any failure (wrong type, bad format) means "no timestamp".
            pass
    return None
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def extract_account(msg: str) -> str:
    """Extract the account identifier mentioned in a log message.

    Three notations are tried in priority order: ``feishu[<id>]``,
    ``accountId=<id>`` and ``account <id>``.

    Returns:
        The first identifier found, or ``""`` when none of the notations match.
    """
    notations = (
        r"feishu\[([^\]]+)\]",
        r"accountId=([A-Za-z0-9_.-]+)",
        r"account\s+([A-Za-z0-9_.-]+)",
    )
    for notation in notations:
        found = re.search(notation, msg)
        if found:
            return found.group(1)
    return ""
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def section_ws_lifecycle(out: output.Output, app_log: str) -> None:
    """Summarise the channel WebSocket lifecycle recorded in the application log.

    The log is scanned for WebSocket-related lines. Accepted entries are
    classified into lifecycle kinds (init, start_ws, client_up, monitor_up,
    ready, stopping, stopped, disconnecting, ws_error, reconnect), grouped per
    account and split into segments wherever two consecutive events are more
    than 60 seconds apart. "ready" entries carry no account, so each one is
    attributed to the closest preceding unmatched "init" within 30 seconds.

    Args:
        out: Output collector; receives the summary item, the evidence text
            and the ``ws_summary`` data field.
        app_log: Path of the application log. Nothing is emitted when it is
            empty or not a regular file.
    """
    if not app_log or not os.path.isfile(app_log):
        return
    keyword_re = re.compile(
        r"ws ready|WS ready|websocket|ws error|ws close|ws reconnect|"
        r"health.monitor|channel.*connect|expired.*discard|starting.*WebSocket|"
        r"event-dispatch is ready|stopping feishu|stopped feishu|starting feishu|"
        r"disconnecting WebSocket",
        re.IGNORECASE,
    )

    def order_key(dt):
        # Epoch seconds are comparable for every timestamp; entries without a
        # timestamp sort first. Comparing against the naive datetime.min would
        # raise TypeError as soon as a timezone-aware value is involved.
        return dt.timestamp() if dt is not None else float("-inf")

    events = []      # (ts_dt, ts_str, account, kind, msg)
    expired = []     # (ts_str, text)
    ws_errors = []   # (ts_str, account, detail)
    health_count = 0
    try:
        with open(app_log, errors="replace") as f:
            for raw in f:
                if not keyword_re.search(raw):
                    continue
                raw = raw.strip()
                if not raw:
                    continue
                try:
                    obj = json.loads(raw)
                except (ValueError, RecursionError):
                    obj = None
                if not isinstance(obj, dict):
                    # Plain-text line (or JSON that is not an object): only
                    # "expired ... discard" notices are collected.
                    low = raw.lower()
                    if "expired" in low and "discard" in low:
                        m = re.match(r"\[?(\d{4}-\d{2}-\d{2}T?\d{2}:\d{2}:\d{2})", raw)
                        expired.append((m.group(1) if m else "", raw[:200]))
                    continue

                ts_raw = obj.get("time", "")
                if not isinstance(ts_raw, str):
                    ts_raw = ""
                ts_dt = parse_ws_ts(ts_raw)
                if ts_dt is not None and ts_dt.tzinfo is None:
                    # Attach the local zone (wall-clock fields unchanged) so
                    # all timestamps are aware and can be subtracted safely.
                    try:
                        ts_dt = ts_dt.astimezone()
                    except (OverflowError, OSError, ValueError):
                        ts_dt = None
                ts_str = ts_raw[11:19] if ts_raw else ""  # HH:MM:SS slice
                # presumably both helpers return str — verify against ocdiag.jsonlog
                sub = get_log_subsystem(obj)
                msg = parse_log_msg(obj)
                low = msg.lower()

                is_valid_subsystem = (
                    sub in VALID_SUBSYSTEMS or sub.startswith("gateway/channels/")
                )
                is_event_dispatch_ready = (not sub) and "event-dispatch is ready" in low
                if not is_valid_subsystem and not is_event_dispatch_ready:
                    continue
                if "health-monitor" in sub or "health-monitor" in low:
                    health_count += 1
                    continue

                account = extract_account(msg)
                if "expired" in low and "discard" in low:
                    expired.append((ts_str, msg[:200]))
                    continue

                if "event-dispatch is ready" in low:
                    kind = "ready"
                elif re.search(r"starting feishu\[[^\]]+\]\s*\(mode:\s*websocket\)", msg, re.I):
                    kind = "init"
                elif re.search(r"feishu\[[^\]]+\]:\s*starting WebSocket connection", msg, re.I):
                    kind = "start_ws"
                elif re.search(r"feishu\[[^\]]+\]:\s*WebSocket client started", msg, re.I):
                    kind = "client_up"
                elif re.search(r"websocket started for account", msg, re.I):
                    kind = "monitor_up"
                elif (re.search(r"^stopping feishu\[[^\]]+\]\s*$", msg.strip(), re.I) or
                        re.search(r"\|\s*stopping feishu\[", msg, re.I)):
                    kind = "stopping"
                elif (re.search(r"^stopped feishu\[[^\]]+\]\s*$", msg.strip(), re.I) or
                        re.search(r"\|\s*stopped feishu\[", msg, re.I)):
                    kind = "stopped"
                elif "disconnecting websocket" in low:
                    kind = "disconnecting"
                elif any(x in low for x in ["ws close", "ws error", "closed before connect", "connection lost"]):
                    kind = "ws_error"
                    code_m = re.search(r"code[=: ]+(\d+)", msg)
                    reason_m = re.search(r"reason[=: ]+([^\s,)]+)", msg)
                    detail = []
                    if code_m:
                        detail.append(f"code={code_m.group(1)}")
                    if reason_m:
                        detail.append(f"reason={reason_m.group(1)}")
                    ws_errors.append((ts_str, account or "?", " ".join(detail) or msg[:120]))
                elif "reconnect" in low and "websocket" in low:
                    kind = "reconnect"
                else:
                    continue
                events.append((ts_dt, ts_str, account, kind, msg))
    except OSError:
        out.item("Channel WS: 读取应用日志失败")
        return

    if not events and not expired:
        out.item("Channel WS: 今日无 WS 相关事件记录")
        return

    # Group events per account; "ready" events are attributed afterwards.
    by_account = defaultdict(list)
    ready_events = []
    for e in events:
        if e[3] == "ready":
            ready_events.append(e)
        elif e[2]:
            by_account[e[2]].append(e)
        else:
            by_account["?"].append(e)

    ready_events.sort(key=lambda x: order_key(x[0]))
    # Each candidate is [init timestamp, account, already matched].
    candidates = [
        [ev[0], acc, False]
        for acc, evs in by_account.items() if acc != "?"
        for ev in evs if ev[3] == "init" and ev[0] is not None
    ]

    for r in ready_events:
        r_ts = r[0]
        if r_ts is None:
            by_account["?"].append(r)
            continue
        best = None
        best_delta = None
        for cand in candidates:
            if cand[2]:
                continue
            delta = (r_ts - cand[0]).total_seconds()
            if 0 <= delta <= 30 and (best_delta is None or delta < best_delta):
                best_delta = delta
                best = cand
        if best is not None:
            best[2] = True
            by_account[best[1]].append(r)
        else:
            by_account["?"].append(r)

    for evs in by_account.values():
        evs.sort(key=lambda x: (order_key(x[0]), x[1]))

    cycle_summaries = []  # (ts_str, account, summary, ok)
    per_account_cycles = defaultdict(int)

    def flush(acc, cur_list):
        """Summarise one segment of consecutive events for ``acc``."""
        if not cur_list:
            return
        kinds = [k for _, _, _, k, _ in cur_list]
        t0 = cur_list[0][1]
        if "ready" in kinds:
            try:
                s_dt = next(e[0] for e in cur_list if e[3] in ("init", "start_ws"))
                r_dt = next(e[0] for e in cur_list if e[3] == "ready")
                dur = (r_dt - s_dt).total_seconds() if (s_dt and r_dt) else None
            except StopIteration:
                dur = None
            has_start = any(k in kinds for k in ("init", "start_ws"))
            if any(k in kinds for k in ("stopping", "disconnecting")) and has_start:
                label = "重连→就绪"
            elif has_start:
                label = "建连→就绪"
            else:
                label = "就绪"
            extra = f" (耗时 {dur:.1f}s)" if dur is not None and dur >= 0 else ""
            cycle_summaries.append((t0, acc, f"{label}{extra}", True))
        elif any(k in kinds for k in ("init", "start_ws", "client_up", "monitor_up")):
            cycle_summaries.append((t0, acc, "建连(未见 ready)", False))
        elif any(k in kinds for k in ("stopping", "stopped", "disconnecting")):
            cycle_summaries.append((t0, acc, "停止", None))
        elif "ws_error" in kinds:
            cycle_summaries.append((t0, acc, "错误", False))
        # Every segment is counted, including ones that produced no summary.
        per_account_cycles[acc] += 1

    for acc, evs in by_account.items():
        if acc == "?":
            continue
        cur = []
        for e in evs:
            if not cur:
                cur.append(e)
                continue
            prev_dt = cur[-1][0]
            cur_dt = e[0]
            gap = (cur_dt - prev_dt).total_seconds() if (prev_dt and cur_dt) else 0
            if gap > 60:
                # More than a minute of silence starts a new segment.
                flush(acc, cur)
                cur = [e]
            else:
                cur.append(e)
        flush(acc, cur)

    cycle_summaries.sort(key=lambda x: x[0])

    total_ready = sum(1 for _, _, _, k, _ in events if k == "ready")
    total_attempts = sum(1 for _, _, _, k, _ in events if k == "init")
    total_stops = sum(1 for _, _, _, k, _ in events if k == "stopping")
    total_errors = len(ws_errors)

    def attempt_times(evs):
        """Return sorted connect-attempt times: "init", else "start_ws"."""
        ts_list = [e[0] for e in evs if e[3] == "init" and e[0] is not None]
        if not ts_list:
            ts_list = [e[0] for e in evs if e[3] == "start_ws" and e[0] is not None]
        ts_list.sort()
        return ts_list

    freq_flags = []  # (account, window start HH:MM:SS, attempts in window)
    intervals = []   # seconds between consecutive attempts of one account
    for acc, evs in by_account.items():
        if acc == "?":
            continue
        attempts = attempt_times(evs)
        # Flag the first 5-minute window holding three or more attempts.
        for i, start in enumerate(attempts):
            window = [t for t in attempts[i:] if (t - start).total_seconds() <= 300]
            if len(window) >= 3:
                freq_flags.append((acc, start.strftime("%H:%M:%S"), len(window)))
                break
        for a, b in zip(attempts, attempts[1:]):
            intervals.append((b - a).total_seconds())
    avg_interval = sum(intervals) / len(intervals) if intervals else None

    body = [f"概览: {total_attempts} 次建连尝试, {total_ready} 次就绪, {total_stops} 次停止/断开, {total_errors} 次错误"]
    per_acc_str = ", ".join(f"{a}={n}" for a, n in sorted(per_account_cycles.items()))
    if per_acc_str:
        body.append(f"各账号生命周期片段数: {per_acc_str}")
    if avg_interval is not None:
        body.append(f"平均重连间隔: {avg_interval:.1f}s")
    for acc, t0, n in freq_flags:
        body.append(f"频繁重连: {acc} 在 {t0} 起 5 分钟内建连 {n} 次")
    if cycle_summaries:
        body.append("")
        body.append("生命周期时间线:")
        for ts_str, acc, summary, _ok in cycle_summaries:
            body.append(f" [{ts_str}] feishu[{acc}]: {summary}")
    if ws_errors:
        body.append("")
        body.append(f"WS 错误明细: {len(ws_errors)} 条")
        for ts_str, acc, detail in ws_errors[:10]:
            body.append(f" [{ts_str}] feishu[{acc}]: {detail}")
        if len(ws_errors) > 10:
            body.append(f" ... 共 {len(ws_errors)} 条")
    if expired:
        body.append("")
        body.append(f"过期丢弃: {len(expired)} 条消息")
        for ts_str, msg in expired[:10]:
            body.append(f" [{ts_str}] {msg}")
        if len(expired) > 10:
            body.append(f" ... 共 {len(expired)} 条")
    if health_count > 0:
        body.append("")
        body.append(f"health-monitor: {health_count} 条心跳记录(未展开)")

    out.item("Channel WS 状态 — WebSocket 消息通道的连接、断连和消息丢失记录:")
    out.evidence("应用日志", "\n".join(body))
    out.set_data("ws_summary", {
        "attempts": total_attempts,
        "ready": total_ready,
        "stops": total_stops,
        "errors": total_errors,
        "avg_interval_s": avg_interval,
        "freq_reconnect": [{"account": a, "from": t, "count": n} for a, t, n in freq_flags],
        "expired_count": len(expired),
        "health_count": health_count,
    })
|
|
480
|
+
|
|
481
|
+
|
|
482
|
+
# ── 4.5: gateway error codes (auth + WS close) ──
|
|
483
|
+
|
|
484
|
+
# Explanation table: (kind, code, reason pattern, explanation). Rows are
# tried in order and the first match wins, so specific reason patterns must
# precede the catch-all r".*" row for the same (kind, code).
KNOWN_REASONS = [
    # HTTP authentication failures
    ("HTTP", 401, r"trusted_proxy_user_missing",
     "可信代理用户缺失,auth.mode=trustedProxy 时需在请求头设置 X-Remote-User"),
    ("HTTP", 401, r"unauthorized",
     "内部 API 认证失败,通常是 auth.mode 配置问题"),
    ("HTTP", 401, r".*",
     "内部 API 认证失败"),
    # WebSocket close code 1008, distinguished by reason text
    ("WS", 1008, r"pairing required",
     "节点未配对,需要执行 openclaw devices list 并批准配对请求"),
    ("WS", 1008, r"not_paired",
     "节点未配对"),
    ("WS", 1008, r"slow consumer",
     "客户端消费消息太慢,被服务端主动断开"),
    ("WS", 1008, r"connect challenge missing",
     "连接握手缺少 nonce,认证流程异常"),
    ("WS", 1008, r"connect challenge timeout",
     "连接握手超时,可能是网络延迟或服务端未响应"),
    ("WS", 1008, r"connect failed",
     "连接认证失败,可能是密钥/证书不匹配"),
    ("WS", 1008, r"Missing callSid",
     "语音通话缺少 callSid(Twilio 集成问题)"),
    ("WS", 1008, r"Unknown call",
     "未知的语音通话 ID"),
    ("WS", 1008, r"Start timeout",
     "语音会话启动超时"),
    # Other WebSocket close codes, explained regardless of reason
    ("WS", 1006, r".*",
     "连接异常断开,未收到 close frame — 通常是网络中断、进程崩溃或超时"),
    ("WS", 1001, r".*",
     "端点正在离开(服务器关闭或客户端断开)"),
    ("WS", 1011, r".*",
     "服务器内部错误导致关闭"),
    ("WS", 1012, r".*",
     "服务器正在重启"),
    ("WS", 1013, r".*",
     "服务器暂时不可用,请稍后重试"),
    ("WS", 1000, r".*",
     "正常关闭"),
]


def explain(kind, code, reason):
    """Look up the explanation for an error.

    Args:
        kind: Error family, ``"HTTP"`` or ``"WS"``.
        code: Numeric status or close code.
        reason: Reason text; ``None`` or empty is matched as ``""``.

    Returns:
        The explanation of the first ``KNOWN_REASONS`` row whose kind and code
        are equal and whose pattern is found (case-insensitively) in the
        reason, or ``None`` when no row applies.
    """
    text = reason or ""
    return next(
        (note for row_kind, row_code, pattern, note in KNOWN_REASONS
         if row_kind == kind and row_code == code and re.search(pattern, text, re.I)),
        None,
    )
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
def classify_err(msg: str):
    """Classify a log message as an HTTP auth failure or a WebSocket close.

    Checks run in this order: the auth markers (reported as HTTP 401), an
    explicit ``gateway closed (<code>): <reason>`` message, a generic
    close/error message carrying a four-digit ``code``, and finally the
    wording for an abnormal closure (reported as WS 1006).

    Returns:
        A ``(kind, code, reason)`` tuple, or ``None`` when nothing matches.
    """
    lowered = msg.lower()
    for marker in ("trusted_proxy_user_missing", "unauthorized"):
        if marker in lowered:
            return ("HTTP", 401, marker)

    closed = re.search(r"gateway closed\s*\((\d+)\)\s*:\s*(.*)", msg, re.I)
    if closed:
        return ("WS", int(closed.group(1)), closed.group(2).strip())

    coded = re.search(r"(?:ws close|ws error|closed).*?code[=: ]*(\d{4})", msg, re.I)
    if coded:
        # The reason is optional; it runs up to the next ',', '|', ')' or line end.
        why = re.search(r"reason[=: ]*([^\s,)]+.*?)(?:\s*$|\s*[,|)])", msg, re.I)
        reason_text = why.group(1).strip() if why else ""
        return ("WS", int(coded.group(1)), reason_text)

    if re.search(r"closed before connect|abnormal clos", msg, re.I):
        return ("WS", 1006, "abnormal closure")
    return None
|
|
530
|
+
|
|
531
|
+
|
|
532
|
+
# Subsystem prefixes whose log entries section_gateway_errors inspects.
VALID_GATEWAY_PREFIXES = ("gateway/", "feishu/core/lark-client", "feishu/channel/")


def section_gateway_errors(out: output.Output, app_log: str) -> None:
    """Aggregate HTTP 401 failures and WebSocket close codes from the app log.

    Only JSON object lines from subsystems starting with one of
    ``VALID_GATEWAY_PREFIXES`` are considered. Events are grouped by
    ``(kind, code, reason)`` and listed by descending count, each with its
    explanation (when known) and its five most recent timestamps.

    Args:
        out: Output collector; receives the summary item, the evidence text
            and the ``gateway_errors`` data field.
        app_log: Path of the application log. Nothing is emitted when it is
            empty or not a regular file.
    """
    if not app_log or not os.path.isfile(app_log):
        return
    keyword_re = re.compile(
        r"unauthorized|trusted_proxy_user_missing|gateway closed|ws close|ws error|closed before",
        re.IGNORECASE,
    )
    events = []  # (timestamp, kind, code, reason)
    try:
        with open(app_log, errors="replace") as f:
            for raw in f:
                if not keyword_re.search(raw):
                    continue
                raw = raw.strip()
                if not raw:
                    continue
                try:
                    obj = json.loads(raw)
                except (ValueError, RecursionError):
                    continue
                # A JSON line need not be an object; .get() would raise on it.
                if not isinstance(obj, dict):
                    continue
                sub = get_log_subsystem(obj)
                if not sub or not sub.startswith(VALID_GATEWAY_PREFIXES):
                    continue
                msg = parse_log_msg(obj)
                ts_raw = obj.get("time", "")
                ts = ts_raw[:19] if isinstance(ts_raw, str) else ""
                classified = classify_err(msg)
                if classified is None:
                    continue
                kind, code, reason = classified
                events.append((ts, kind, code, reason or "(no reason)"))
    except OSError:
        # Report the failure rather than returning silently.
        out.item("Gateway 错误码: 读取应用日志失败")
        return

    if not events:
        out.item("Gateway 错误码: 0 条")
        out.set_data("gateway_errors", {"total": 0})
        return

    total = len(events)
    auth_count = sum(1 for e in events if e[1] == "HTTP")
    ws_count = sum(1 for e in events if e[1] == "WS")
    body = [f"共 {total} 条(认证 {auth_count} 条, WS 关闭 {ws_count} 条)", ""]

    # Timestamps per (kind, code, reason), kept in log order.
    timelines = defaultdict(list)
    for ts, kind, code, reason in events:
        timelines[(kind, code, reason)].append(ts)

    structured = []
    # sorted() is stable, so equal counts keep first-seen order.
    for (kind, code, reason), timeline in sorted(
            timelines.items(), key=lambda item: -len(item[1])):
        count = len(timeline)
        expl = explain(kind, code, reason)
        body.append(f"{kind} {code}: {reason} ({count} 次)")
        if expl:
            body.append(f" {expl}")
        for t in timeline[-5:]:
            body.append(f" [{t}]")
        if count > 5:
            body.append(f" ... 共 {count} 条,仅显示最近 5 条")
        body.append("")
        structured.append({
            "kind": kind, "code": code, "reason": reason,
            "count": count, "explanation": expl,
        })

    out.item("Gateway 错误码 — 认证 401 + WS 关闭码统一视图:")
    out.evidence("应用日志", "\n".join(body))
    out.set_data("gateway_errors", {
        "total": total,
        "auth_count": auth_count,
        "ws_count": ws_count,
        "by_reason": structured,
    })
|
|
612
|
+
|
|
613
|
+
|
|
614
|
+
def main() -> int:
    """Collect every Gateway diagnostic section and return the exit status.

    The gateway port defaults to 18789 and is overridden by ``gateway.port``
    from the config file when that value can be read and converted to an
    integer; any failure while doing so keeps the default.

    Returns:
        Whatever ``out.done()`` returns.
    """
    args = cli.build_common_parser(
        description="模块 4:Gateway 状态采集",
        prog="04_gateway",
    ).parse_args()

    out = output.init("gateway", json_mode=args.json, no_color=args.no_color)
    out.section("模块 4:Gateway 状态")

    port = 18789
    if os.path.isfile(args.config):
        try:
            with open(args.config) as f:
                configured = json.load(f).get("gateway", {}).get("port")
            if configured:
                port = int(configured)
        except Exception:
            # Best effort: an unreadable or malformed config keeps the default.
            pass

    section_process_port(out, args, port)
    section_restart_events(out)
    section_model_api(out, args)

    app_log = recent_logs.latest_app_log(args.log_dir)
    if not app_log:
        out.item("Channel WS: 未找到应用日志文件")
        out.item("Gateway 错误码: 未找到应用日志文件")
    else:
        section_ws_lifecycle(out, app_log)
        section_gateway_errors(out, app_log)

    return out.done()


if __name__ == "__main__":
    sys.exit(main())
|