openclaw-diag-cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,443 @@
1
+ #!/usr/bin/env python3
2
+ """模块 1:系统健康检查(DNS、网络、CPU、内存、磁盘、IO、进程、时间同步)。"""
3
+
4
+ from __future__ import annotations
5
+
6
+ import json
7
+ import os
8
+ import re
9
+ import shutil
10
+ import socket
11
+ import subprocess
12
+ import sys
13
+ import time
14
+ from pathlib import Path
15
+ from typing import List, Optional
16
+
17
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
18
+
19
+ from ocdiag import cli, output, paths
20
+
21
+
22
def have(cmd: str) -> bool:
    """Report whether *cmd* resolves to an executable on the current PATH."""
    resolved = shutil.which(cmd)
    return resolved is not None
24
+
25
+
26
def run(cmd, timeout=8, shell=False):
    """Run an external command on a best-effort basis and capture its output.

    Args:
        cmd: Argument list (or a command string when *shell* is true).
        timeout: Seconds to wait before giving up on the child process.
        shell: Passed through to ``subprocess.run``.

    Returns:
        A ``(returncode, stdout, stderr)`` tuple. When the command cannot be
        started, is rejected as malformed, or exceeds *timeout*, the tuple
        ``(1, "", "")`` is returned instead of raising.
    """
    try:
        proc = subprocess.run(
            cmd, shell=shell, capture_output=True, text=True,
            # Tool output may contain bytes that are invalid in the active
            # encoding; substitute them instead of letting a
            # UnicodeDecodeError abort the whole diagnostic run.
            errors="replace",
            timeout=timeout, check=False,
        )
    except (subprocess.TimeoutExpired, OSError, ValueError):
        # OSError covers FileNotFoundError / PermissionError; ValueError
        # covers arguments subprocess refuses (e.g. embedded NUL bytes).
        return 1, "", ""
    return proc.returncode, proc.stdout, proc.stderr
35
+
36
+
37
def _as_dict(value) -> dict:
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _host_from_url(url) -> Optional[str]:
    """Return the host of an ``http(s)://`` URL, or ``None`` if *url* is not one."""
    if not isinstance(url, str):
        return None
    m = re.match(r"https?://([^/:]+)", url)
    return m.group(1) if m else None


def dns_targets_from_config(config_path: str) -> List[str]:
    """Collect the external host names referenced by a JSON config file.

    Hosts are taken from ``models.providers.*.baseUrl``/``baseURL``, from the
    ``webhook``/``baseUrl``/``url`` keys of each entry under ``channels``
    (plus a fixed API host for channels whose name contains ``feishu``,
    ``lark``, ``telegram`` or ``discord``), and from
    ``gateway.trustedProxyUrl``/``controlUrl``.

    Args:
        config_path: Path of the JSON configuration file.

    Returns:
        A sorted list of unique host names, excluding ``localhost``,
        ``0.0.0.0`` and anything starting with ``127.``. An empty list is
        returned when the file is missing, unreadable, not valid JSON, or
        not a JSON object.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the locale.
        with open(config_path, encoding="utf-8") as f:
            cfg = json.load(f)
    except (OSError, ValueError):
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        return []
    if not isinstance(cfg, dict):
        return []

    # _host_from_url may yield None; those are filtered out at the end.
    candidates = set()

    providers = _as_dict(_as_dict(cfg.get("models")).get("providers"))
    for pv in providers.values():
        if not isinstance(pv, dict):
            continue
        candidates.add(_host_from_url(pv.get("baseUrl") or pv.get("baseURL")))

    for ch_name, ch_cfg in _as_dict(cfg.get("channels")).items():
        if not isinstance(ch_cfg, dict):
            continue
        if "feishu" in ch_name or "lark" in ch_name:
            candidates.add("open.feishu.cn")
        if "telegram" in ch_name:
            candidates.add("api.telegram.org")
        if "discord" in ch_name:
            candidates.add("discord.com")
        for key in ("webhook", "baseUrl", "url"):
            candidates.add(_host_from_url(ch_cfg.get(key)))

    gw = _as_dict(cfg.get("gateway"))
    for key in ("trustedProxyUrl", "controlUrl"):
        candidates.add(_host_from_url(gw.get(key)))

    return sorted(
        h for h in candidates
        if h and not h.startswith("127.") and h not in ("localhost", "0.0.0.0")
    )
79
+
80
+
81
def detect_oc_pid() -> Optional[str]:
    """Locate the OpenClaw gateway process and return its PID as a string.

    ``pgrep -f`` is tried first. If it reports nothing, the ``MainPID``
    property of the ``openclaw-gateway.service`` user unit is consulted.

    Returns:
        The PID as text, or ``None`` when neither source yields one.
    """
    status, text, _ = run(["pgrep", "-f", "openclaw.*gateway"])
    if status == 0 and text.strip():
        # pgrep may list several PIDs; the first line is used.
        return text.splitlines()[0].strip()

    status, text, _ = run([
        "systemctl", "--user", "show", "openclaw-gateway.service",
        "--property=MainPID",
    ])
    if status != 0 or "=" not in text:
        return None
    main_pid = text.strip().split("=", 1)[1]
    # systemd reports MainPID=0 when the unit has no main process.
    if main_pid and main_pid != "0":
        return main_pid
    return None
92
+
93
+
94
def section_dns(out: output.Output, targets: List[str]) -> None:
    """Section 1.1: resolve each target host and report address and latency.

    ``dig`` is preferred; ``getent hosts`` is used when ``dig`` is not
    installed. One item is emitted per host and the collected results are
    stored under the ``"dns"`` data key.

    Args:
        out: Report sink providing ``line``, ``item`` and ``set_data``.
        targets: Host names to resolve; ``dns.google`` is used when empty.
    """
    out.line(" ── 1.1 DNS 解析 ──")
    out.line("")
    has_dig = have("dig")
    has_getent = have("getent")
    results = []

    if not has_dig and not has_getent:
        # Nothing can be tested without a resolver tool. Reporting every
        # host as "FAILED (timeout 2s)" would be a false alarm, so only the
        # skip notice is emitted and no per-host results are recorded.
        out.item("dig/getent 均未安装,跳过 DNS 测试")
        out.line("")
        out.set_data("dns", results)
        return

    for h in targets or ["dns.google"]:
        if not h:
            continue
        ip = ""
        start_ns = time.time_ns()
        if has_dig:
            rc, stdout, _ = run(
                ["timeout", "2", "dig", "+short", "+time=2", "+tries=1", h],
                timeout=4,
            )
            if rc == 0:
                # `dig +short` may print CNAME lines first; take the first
                # line that looks like a dotted numeric address.
                for ln in stdout.splitlines():
                    if re.match(r"^\d+\.", ln):
                        ip = ln.strip()
                        break
        elif has_getent:
            rc, stdout, _ = run(["timeout", "2", "getent", "hosts", h], timeout=4)
            if rc == 0 and stdout:
                ip = stdout.split()[0]
        elapsed_ms = (time.time_ns() - start_ns) // 1_000_000
        if ip:
            out.item(f"{h}: {ip} ({elapsed_ms}ms)")
            results.append({"host": h, "ip": ip, "elapsed_ms": elapsed_ms})
        else:
            out.item(f"{h}: FAILED (timeout 2s)")
            results.append({"host": h, "ip": None, "elapsed_ms": elapsed_ms})

    out.line("")
    out.set_data("dns", results)
129
+
130
+
131
def section_network(out: output.Output, targets: List[str]) -> None:
    """Section 1.2: report firewall DROP/REJECT rules and one HTTPS probe.

    Args:
        out: Report sink providing ``line``, ``item`` and ``set_data``.
        targets: Host names; only the first one is probed with ``curl``.
    """
    out.line(" ── 1.2 网络连通性 ──")
    out.line("")

    if have("iptables"):
        rc, stdout, _ = run(["iptables", "-L", "-n"], timeout=5)
        if rc == 0:
            ipt_count = sum(
                1 for ln in stdout.splitlines() if "DROP" in ln or "REJECT" in ln
            )
            out.item(f"iptables: {ipt_count} 条 DROP/REJECT 规则")
            out.set_data("iptables_drop_reject_count", ipt_count)
        else:
            # A failed listing (e.g. when not running as root) says nothing
            # about the rule set; reporting "0 rules" would be misleading.
            out.item("iptables: 无法读取规则(可能需要 root 权限)")
    else:
        out.item("iptables: 未安装")

    if have("curl"):
        first = targets[0] if targets else ""
        if first:
            rc, stdout, _ = run([
                "curl", "-so", "/dev/null",
                "-w", "%{http_code} %{time_connect}s",
                "--connect-timeout", "3", "--max-time", "5",
                f"https://{first}",
            ], timeout=8)
            # Empty output means curl itself could not be run to completion.
            curl_out = stdout.strip() or "FAILED"
            out.item(f"{first}:443 连接: {curl_out}")
            out.set_data("curl_test", {"host": first, "result": curl_out})
    else:
        out.item("curl: 未安装,跳过连通性测试")
    out.line("")
156
+
157
+
158
def section_cpu(out: output.Output, oc_pid: Optional[str]) -> None:
    """Section 1.3: report core count, load average and gateway CPU usage.

    Args:
        out: Report sink providing ``line``, ``item`` and ``set_data``.
        oc_pid: PID of the OpenClaw process as text, or ``None`` if it was
            not found.
    """
    out.line(" ── 1.3 CPU ──")
    out.line("")

    rc, stdout, _ = run(["nproc"])
    try:
        ncpu = int(stdout.strip()) if rc == 0 else 1
    except ValueError:
        ncpu = 1

    rc, stdout, _ = run(["uptime"])
    load_line = "unknown"
    if rc == 0:
        m = re.search(r"load average:\s*(.*)", stdout)
        if m:
            load_line = m.group(1).strip()
    out.item(f"核心数: {ncpu} | 负载: {load_line}")
    out.set_data("cpu_count", ncpu)
    out.set_data("load_average", load_line)

    # Prefer the OS-provided figure: parsing `uptime` text misreads
    # decimal-comma locales ("0,52, 0,58, ..." would yield 0).
    try:
        load1 = os.getloadavg()[0]
    except (AttributeError, OSError):
        try:
            load1 = float(re.split(r"[, ]+", load_line)[0])
        except ValueError:
            load1 = None
    if load1 is not None and load1 > ncpu * 2:
        out.item(f"注意: 1 分钟负载 {load1} 超过核心数 {ncpu} 的 2 倍")

    if not oc_pid:
        out.item("OpenClaw 进程: 未运行")
        out.line("")
        return

    rc, stdout, _ = run(
        ["ps", "-p", oc_pid, "-o", "pid,pcpu,pmem,rss,args", "--no-headers"]
    )
    # Columns: pid, %cpu, %mem, rss (KiB), command line (kept as one field).
    parts = stdout.split(None, 4) if rc == 0 else []
    if len(parts) >= 5:
        pcpu, pmem, rss = parts[1], parts[2], parts[3]
        try:
            rss_mb = int(rss) // 1024
        except ValueError:
            rss_mb = 0
        out.item(f"OpenClaw 进程(PID={oc_pid}): CPU={pcpu}% MEM={pmem}% RSS={rss_mb}MB")
        out.set_data("openclaw_proc", {"pid": oc_pid, "cpu_pct": pcpu, "mem_pct": pmem, "rss_mb": rss_mb})
    else:
        # Covers both a failing `ps` and output with too few columns, so no
        # failure path stays silent.
        out.item(f"OpenClaw 进程(PID={oc_pid}): 无法读取 ps 信息")
    out.line("")
199
+
200
+
201
def _count_oom_kills(log_text: str) -> int:
    """Count kernel log lines that record a process being killed."""
    # One OOM event is logged on several lines ("... invoked oom-killer",
    # "Out of memory: Killed process ..."). Matching only the "Killed
    # process" line keeps each kill from being counted more than once.
    return sum(
        1 for ln in log_text.splitlines() if re.search(r"killed process", ln, re.I)
    )


def section_memory(out: output.Output) -> None:
    """Section 1.4: report RAM and swap usage plus recent OOM kills.

    Memory figures come from ``free -m``. OOM kills are counted from the
    kernel journal of the last 7 days, falling back to ``dmesg`` when the
    journal is unavailable.

    Args:
        out: Report sink providing ``line``, ``item`` and ``set_data``.
    """
    out.line(" ── 1.4 内存 ──")
    out.line("")

    if have("free"):
        rc, stdout, _ = run(["free", "-m"])
        rows = stdout.splitlines() if rc == 0 else []
        for line in rows:
            p = line.split()
            if line.startswith("Mem:") and len(p) >= 7:
                try:
                    # Columns: total, used, free, shared, buff/cache, available.
                    total, used, avail = int(p[1]), int(p[2]), int(p[6])
                except ValueError:
                    pass
                else:
                    out.item(
                        f"内存: 总 {total/1024:.0f}GB | 已用 {used/1024:.1f}GB | 可用 {avail/1024:.1f}GB"
                    )
                    out.set_data("memory", {"total_mb": total, "used_mb": used, "available_mb": avail})
            if line.startswith("Swap:") and len(p) >= 3:
                try:
                    total, used = int(p[1]), int(p[2])
                except ValueError:
                    pass
                else:
                    pct = (used * 100 / total) if total > 0 else 0
                    out.item(
                        f"Swap: 总 {total/1024:.0f}GB | 已用 {used/1024:.1f}GB ({pct:.1f}%)"
                    )
                    out.set_data("swap", {"total_mb": total, "used_mb": used, "pct": pct})
    else:
        out.item("free: 未安装")

    log_text = None
    if have("journalctl"):
        rc, stdout, _ = run(
            ["journalctl", "-k", "--since", "7 days ago", "--no-pager"], timeout=10
        )
        if rc == 0:
            log_text = stdout
    if log_text is None and have("dmesg"):
        # Also reached when journalctl exists but fails. NOTE: dmesg output
        # is not restricted to the last 7 days.
        rc, stdout, _ = run(["dmesg"])
        if rc == 0:
            log_text = stdout
    oom_count = _count_oom_kills(log_text) if log_text is not None else 0
    out.item(f"OOM kill(7天内): {oom_count} 次")
    out.set_data("oom_count_7d", oom_count)
    out.line("")
248
+
249
+
250
def section_disk_space(out: output.Output) -> None:
    """Section 1.5: report disk usage of the OpenClaw directories and ``/``.

    Each existing path is queried with ``df``; usage of 90% or more is
    flagged. Successful readings are stored under the ``"disk"`` data key.

    Args:
        out: Report sink providing ``line``, ``item`` and ``set_data``.
    """
    out.line(" ── 1.5 磁盘空间 ──")
    out.line("")
    paths_to_check = [paths.OPENCLAW_HOME, "/tmp/openclaw", "/"]
    results = []
    for p in paths_to_check:
        if not os.path.isdir(p):
            out.item(f"{p}: 路径不存在")
            continue
        # -P forces the POSIX one-line-per-filesystem layout; without it a
        # long device name may be wrapped onto its own line, which leaves
        # row 2 with too few columns.
        rc, stdout, _ = run(["df", "-h", "-P", p])
        rows = stdout.splitlines() if rc == 0 else []
        parts = rows[1].split() if len(rows) >= 2 else []
        if len(parts) < 5:
            out.item(f"{p}: df 读取失败")
            continue
        # Columns: filesystem, size, used, available, use%, mount point.
        total, used, pct = parts[1], parts[2], parts[4]
        warn = ""
        try:
            if int(pct.rstrip("%")) >= 90:
                warn = " [告警: 超过 90%]"
        except ValueError:
            # Non-numeric percentage (e.g. "-"); report without a warning.
            pass
        out.item(f"{p}: {pct} ({used}/{total}){warn}")
        results.append({"path": p, "used": used, "total": total, "pct": pct})
    out.set_data("disk", results)
    out.line("")
280
+
281
+
282
def section_disk_io(out: output.Output) -> None:
    """Section 1.6: report CPU iowait and disk errors found in ``dmesg``.

    With ``iostat`` installed, iowait is taken from the last sample of
    ``iostat -c 1 2``. Otherwise the cumulative share is computed from the
    aggregate ``cpu`` line of ``/proc/stat``.

    Args:
        out: Report sink providing ``line``, ``item`` and ``set_data``.
    """
    out.line(" ── 1.6 磁盘 I/O ──")
    out.line("")

    if have("iostat"):
        rc, stdout, _ = run(["iostat", "-c", "1", "2"], timeout=5)
        iowait = ""
        if rc == 0:
            for ln in stdout.splitlines():
                # Value rows are indented while banner/header rows are not.
                # The test must look at the raw line: a stripped line can
                # never start with whitespace, so testing after strip()
                # would reject every row.
                if ln[:1].isspace():
                    parts = ln.split()
                    if len(parts) >= 4:
                        # %user %nice %system %iowait ...; no break, so the
                        # last (most recent) sample wins.
                        iowait = parts[3]
        if iowait:
            out.item(f"iowait: {iowait}%")
            out.set_data("iowait_pct", iowait)
        else:
            out.item("iowait: iostat 无输出")
    else:
        iowait_pct = "?"
        try:
            with open("/proc/stat") as f:
                for ln in f:
                    if ln.startswith("cpu "):
                        nums = [int(x) for x in ln.split()[1:]]
                        total = sum(nums)
                        # Fields: user nice system idle iowait ...
                        iw = nums[4] if len(nums) > 4 else 0
                        iowait_pct = f"{(iw * 100 / total):.2f}" if total > 0 else "0"
                        break
        except (OSError, ValueError):
            # Unreadable or unparsable /proc/stat: keep the "?" placeholder.
            pass
        out.item(f"iowait (累计): {iowait_pct}% (iostat 未安装)")
        out.set_data("iowait_pct", iowait_pct)

    disk_err = 0
    if have("dmesg"):
        rc, stdout, _ = run(["dmesg"])
        if rc == 0:
            disk_err = sum(
                1 for ln in stdout.splitlines()
                if re.search(r"I/O error|Buffer I/O error|end_request.*I/O|ata.*error", ln, re.I)
            )
    out.item(f"磁盘错误(dmesg): {disk_err} 条")
    out.set_data("disk_errors_dmesg", disk_err)
    out.line("")
325
+
326
+
327
def section_process(out: output.Output, oc_pid: Optional[str]) -> None:
    """Section 1.7: report gateway uptime, RSS, open file descriptors and zombies.

    Args:
        out: Report sink providing ``line``, ``item`` and ``set_data``.
        oc_pid: PID of the gateway process as text, or ``None`` if unknown.
    """
    out.line(" ── 1.7 进程状态 ──")
    out.line("")

    if not (oc_pid and os.path.isdir(f"/proc/{oc_pid}")):
        out.item("Gateway 进程: 未运行")
    else:
        # Elapsed time and resident set size (KiB) as printed by ps.
        etime, rss_mb = "?", 0
        status, ps_text, _ = run(["ps", "-p", oc_pid, "-o", "etime=,rss="])
        if status == 0 and ps_text.strip():
            fields = ps_text.split()
            if len(fields) >= 2:
                etime = fields[0]
                try:
                    rss_mb = int(int(fields[1]) / 1024)
                except Exception:
                    rss_mb = 0

        # Number of entries under /proc/<pid>/fd; 0 when the directory
        # cannot be listed.
        try:
            fd_count = len(os.listdir(f"/proc/{oc_pid}/fd"))
        except OSError:
            fd_count = 0

        # Fourth whitespace-separated token of the "Max open files" row.
        fd_limit = "?"
        try:
            with open(f"/proc/{oc_pid}/limits") as f:
                limit_row = next(
                    (ln for ln in f if ln.startswith("Max open files")), None
                )
            if limit_row is not None:
                tokens = limit_row.split()
                if len(tokens) >= 4:
                    fd_limit = tokens[3]
        except OSError:
            pass

        out.item(f"Gateway 进程: PID={oc_pid} | uptime={etime} | RSS={rss_mb}MB")
        if fd_limit in ("?", "0"):
            out.item(f"文件描述符: {fd_count} (limit={fd_limit})")
        else:
            try:
                fd_pct = (fd_count * 100 / int(fd_limit))
                out.item(f"文件描述符: {fd_count}/{fd_limit} ({fd_pct:.2f}%)")
            except Exception:
                # Non-numeric limit: show the raw values without a percentage.
                out.item(f"文件描述符: {fd_count}/{fd_limit}")
        out.set_data("process", {
            "pid": oc_pid, "etime": etime, "rss_mb": rss_mb,
            "fd_count": fd_count, "fd_limit": fd_limit,
        })

    # A process state beginning with "Z" marks a zombie.
    status, ps_text, _ = run(["ps", "-eo", "stat", "--no-headers"])
    zombie_count = 0
    if status == 0:
        for state in ps_text.splitlines():
            if state.strip().startswith("Z"):
                zombie_count += 1
    out.item(f"僵尸进程: {zombie_count}")
    out.set_data("zombie_count", zombie_count)
    out.line("")
378
+
379
+
380
def section_time_sync(out: output.Output) -> None:
    """Section 1.8: report clock synchronisation status.

    ``timedatectl``, ``ntpstat`` and ``chronyc tracking`` are each queried
    when installed. A fallback notice is emitted if none of them produced a
    usable reading.

    Args:
        out: Report sink providing ``line``, ``item`` and ``set_data``.
    """
    out.line(" ── 1.8 时间同步 ──")
    out.line("")
    tsync_ok = False

    if have("timedatectl"):
        status, text, _ = run(["timedatectl"])
        if status == 0 and text:
            synced = re.search(r"System clock synchronized:\s*(yes|no)", text)
            service = re.search(r"NTP service:\s*(\S+)", text)
            ntp_sync = "unknown" if synced is None else synced.group(1)
            sync_status = "unknown" if service is None else service.group(1)
            out.item(f"NTP 同步: service={sync_status} synchronized={ntp_sync}")
            out.set_data("ntp", {"service": sync_status, "synchronized": ntp_sync})
            tsync_ok = True

    if have("ntpstat"):
        # The exit status is not checked here; any output is reported.
        _, text, _ = run(["ntpstat"])
        ntpstat_out = ""
        if text:
            ntpstat_out = " | ".join(text.splitlines()[:2])
        if ntpstat_out:
            out.item(f"ntpstat: {ntpstat_out}")
            tsync_ok = True

    if have("chronyc"):
        status, text, _ = run(["chronyc", "tracking"])
        if status == 0:
            extracted = []
            for ln in text.splitlines():
                if "System time" in ln or "Last offset" in ln:
                    extracted.append(ln)
            if extracted:
                out.item(f"chrony: {' | '.join(extracted)}")
                tsync_ok = True

    if not tsync_ok:
        out.item("时间同步: 无法检测(timedatectl/ntpstat/chronyc 均不可用)")
410
+
411
+
412
def main() -> int:
    """Parse the command line, run every health section and finish the report.

    Returns:
        Whatever ``out.done()`` returns; the caller uses it as the exit status.
    """
    args = cli.build_common_parser(
        description="模块 1:系统健康检查",
        prog="01_sys_health",
    ).parse_args()

    out = output.init("sys_health", json_mode=args.json, no_color=args.no_color)
    out.section("模块 1:系统健康检查")
    out.line("")

    # Fall back to a well-known public host when the config names none.
    targets = dns_targets_from_config(args.config) or ["dns.google"]

    oc_pid = detect_oc_pid()
    out.set_data("openclaw_pid", oc_pid)

    # Sections run in report order; each entry carries its extra arguments.
    sections = (
        (section_dns, (targets,)),
        (section_network, (targets,)),
        (section_cpu, (oc_pid,)),
        (section_memory, ()),
        (section_disk_space, ()),
        (section_disk_io, ()),
        (section_process, (oc_pid,)),
        (section_time_sync, ()),
    )
    for render, extra in sections:
        render(out, *extra)

    return out.done()
440
+
441
+
442
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())