openclaw-diag-cli 0.1.3 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +83 -71
- package/bin/ocdiag +0 -1
- package/bin/openclaw-diag.js +65 -176
- package/diag/01_sys_health.py +0 -2
- package/diag/02_environment.py +32 -6
- package/diag/03_configuration.py +4 -1
- package/diag/04_gateway.py +30 -8
- package/diag/05_recent_errors.py +24 -14
- package/diag/06_cron_jobs.py +4 -41
- package/diag/07_performance.py +114 -42
- package/diag/08_sessions.py +2 -54
- package/diag/09_plugin_diag.py +52 -25
- package/diag/10_shell_history.py +28 -10
- package/lib/__pycache__/bundle.cpython-310.pyc +0 -0
- package/lib/bundle.py +6 -13
- package/ocdiag/__init__.py +1 -1
- package/ocdiag/__pycache__/__init__.cpython-310.pyc +0 -0
- package/ocdiag/__pycache__/cli.cpython-310.pyc +0 -0
- package/ocdiag/__pycache__/dispatcher.cpython-310.pyc +0 -0
- package/ocdiag/__pycache__/doctor.cpython-310.pyc +0 -0
- package/ocdiag/__pycache__/jsonlog.cpython-310.pyc +0 -0
- package/ocdiag/__pycache__/output.cpython-310.pyc +0 -0
- package/ocdiag/__pycache__/paths.cpython-310.pyc +0 -0
- package/ocdiag/__pycache__/recent_logs.cpython-310.pyc +0 -0
- package/ocdiag/__pycache__/sensitive.cpython-310.pyc +0 -0
- package/ocdiag/__pycache__/sessions.cpython-310.pyc +0 -0
- package/ocdiag/__pycache__/timeutil.cpython-310.pyc +0 -0
- package/ocdiag/__pycache__/tokens.cpython-310.pyc +0 -0
- package/ocdiag/cli.py +16 -1
- package/ocdiag/dispatcher.py +140 -53
- package/ocdiag/doctor.py +162 -0
- package/ocdiag/jsonlog.py +0 -5
- package/ocdiag/paths.py +0 -17
- package/ocdiag/recent_logs.py +0 -3
- package/ocdiag/sensitive.py +95 -1
- package/ocdiag/sessions.py +161 -0
- package/ocdiag/timeutil.py +0 -11
- package/ocdiag/tokens.py +0 -4
- package/package.json +2 -2
- package/tools/oc_session_extract.py +190 -67
- package/tools/oc_session_trace.py +48 -46
package/diag/02_environment.py
CHANGED
|
@@ -7,7 +7,6 @@ import json
|
|
|
7
7
|
import os
|
|
8
8
|
import re
|
|
9
9
|
import shlex
|
|
10
|
-
import shutil
|
|
11
10
|
import subprocess
|
|
12
11
|
import sys
|
|
13
12
|
from pathlib import Path
|
|
@@ -16,7 +15,7 @@ from typing import Optional
|
|
|
16
15
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
17
16
|
|
|
18
17
|
from ocdiag import cli, output, paths
|
|
19
|
-
from ocdiag.sensitive import safe_val
|
|
18
|
+
from ocdiag.sensitive import safe_val, sanitize_text
|
|
20
19
|
|
|
21
20
|
|
|
22
21
|
def run(cmd, timeout=5):
|
|
@@ -110,7 +109,6 @@ def parse_proc_environ(pid: str) -> Optional[list]:
|
|
|
110
109
|
def main() -> int:
|
|
111
110
|
parser = cli.build_common_parser(
|
|
112
111
|
description="模块 2:采集 OpenClaw 基础环境",
|
|
113
|
-
prog="02_environment",
|
|
114
112
|
)
|
|
115
113
|
args = parser.parse_args()
|
|
116
114
|
out = output.init("environment", json_mode=args.json, no_color=args.no_color)
|
|
@@ -123,6 +121,12 @@ def main() -> int:
|
|
|
123
121
|
out.item("OpenClaw 版本: 无法确定")
|
|
124
122
|
out.evidence("openclaw --version", "命令未找到或无输出")
|
|
125
123
|
out.set_data("oc_version", oc_version)
|
|
124
|
+
if not oc_version:
|
|
125
|
+
out.set_data("oc_version_status", {
|
|
126
|
+
"found": False,
|
|
127
|
+
"reason": "command_not_found",
|
|
128
|
+
"checked": "openclaw --version + pnpm/global node_modules",
|
|
129
|
+
})
|
|
126
130
|
|
|
127
131
|
service_file = paths.SERVICE_FILE
|
|
128
132
|
svc_version = None
|
|
@@ -153,6 +157,10 @@ def main() -> int:
|
|
|
153
157
|
out.item("Node.js: 未找到")
|
|
154
158
|
out.evidence("node --version", "命令未找到")
|
|
155
159
|
out.set_data("node_version", node_ver)
|
|
160
|
+
if not node_ver:
|
|
161
|
+
out.set_data("node_version_status", {
|
|
162
|
+
"found": False, "reason": "command_not_found", "checked": "node --version",
|
|
163
|
+
})
|
|
156
164
|
|
|
157
165
|
rc, stdout, _ = run(["free", "-m"])
|
|
158
166
|
mem_avail = ""
|
|
@@ -166,6 +174,10 @@ def main() -> int:
|
|
|
166
174
|
if mem_avail:
|
|
167
175
|
out.item(f"可用内存: {mem_avail} MB")
|
|
168
176
|
out.set_data("memory_available_mb", mem_avail)
|
|
177
|
+
if not mem_avail:
|
|
178
|
+
out.set_data("memory_status", {
|
|
179
|
+
"found": False, "reason": "free_unavailable", "checked": "free -m",
|
|
180
|
+
})
|
|
169
181
|
|
|
170
182
|
rc, stdout, _ = run(["df", "-m", paths.OPENCLAW_HOME])
|
|
171
183
|
disk_avail = ""
|
|
@@ -178,6 +190,11 @@ def main() -> int:
|
|
|
178
190
|
if disk_avail:
|
|
179
191
|
out.item(f"磁盘可用 ({paths.OPENCLAW_HOME}): {disk_avail} MB")
|
|
180
192
|
out.set_data("disk_available_mb", disk_avail)
|
|
193
|
+
if not disk_avail:
|
|
194
|
+
out.set_data("disk_status", {
|
|
195
|
+
"found": False, "reason": "df_unavailable",
|
|
196
|
+
"checked": f"df -m {paths.OPENCLAW_HOME}",
|
|
197
|
+
})
|
|
181
198
|
|
|
182
199
|
gw_status = gateway_systemctl_status()
|
|
183
200
|
if gw_status:
|
|
@@ -245,8 +262,16 @@ def main() -> int:
|
|
|
245
262
|
out.set_data("gateway_env", [{"key": k, "value": v} for k, v in env_pairs])
|
|
246
263
|
elif pid:
|
|
247
264
|
out.item(f"无法读取 /proc/{pid}/environ(权限不足?)")
|
|
265
|
+
out.set_data("gateway_env_status", {
|
|
266
|
+
"found": False, "reason": "proc_unreadable",
|
|
267
|
+
"checked": f"/proc/{pid}/environ",
|
|
268
|
+
})
|
|
248
269
|
else:
|
|
249
270
|
out.item("Gateway 进程未运行,跳过")
|
|
271
|
+
out.set_data("gateway_env_status", {
|
|
272
|
+
"found": False, "reason": "process_not_running",
|
|
273
|
+
"checked": "pgrep -f openclaw.*gateway",
|
|
274
|
+
})
|
|
250
275
|
|
|
251
276
|
if os.path.isfile(paths.SERVICE_ENV_FILE):
|
|
252
277
|
out.line("")
|
|
@@ -281,9 +306,10 @@ def main() -> int:
|
|
|
281
306
|
try:
|
|
282
307
|
with open(service_file) as f:
|
|
283
308
|
for line in f:
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
309
|
+
raw = line.rstrip("\n")
|
|
310
|
+
out.item(raw if args.unmask else sanitize_text(raw))
|
|
311
|
+
except OSError as e:
|
|
312
|
+
out.item(f"读取失败: {e}")
|
|
287
313
|
|
|
288
314
|
return out.done()
|
|
289
315
|
|
package/diag/03_configuration.py
CHANGED
|
@@ -85,7 +85,6 @@ def emit_config(out: output.Output, data: list, obj, prefix: str = "") -> None:
|
|
|
85
85
|
def main() -> int:
|
|
86
86
|
parser = cli.build_common_parser(
|
|
87
87
|
description="模块 3:采集 OpenClaw 配置(含敏感字段脱敏)",
|
|
88
|
-
prog="03_configuration",
|
|
89
88
|
)
|
|
90
89
|
args = parser.parse_args()
|
|
91
90
|
|
|
@@ -95,6 +94,10 @@ def main() -> int:
|
|
|
95
94
|
config_path = args.config
|
|
96
95
|
if not os.path.isfile(config_path):
|
|
97
96
|
out.item(f"配置文件未找到: {config_path}")
|
|
97
|
+
out.line(" 下一步:")
|
|
98
|
+
out.line(" 1) 确认 OpenClaw 已经初始化(运行过 `openclaw` 即会生成配置)")
|
|
99
|
+
out.line(" 2) 用 OPENCLAW_CONFIG=/path/to/openclaw.json 或 --config 指向正确路径")
|
|
100
|
+
out.line(" 3) 在容器/远端诊断时,用 OPENCLAW_HOME=/path 整体覆盖")
|
|
98
101
|
out.evidence(config_path, "<文件缺失>")
|
|
99
102
|
out.set_data("config_path", config_path)
|
|
100
103
|
out.set_data("found", False)
|
package/diag/04_gateway.py
CHANGED
|
@@ -11,7 +11,6 @@ import sys
|
|
|
11
11
|
from collections import defaultdict
|
|
12
12
|
from datetime import datetime
|
|
13
13
|
from pathlib import Path
|
|
14
|
-
from typing import List, Optional
|
|
15
14
|
|
|
16
15
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
17
16
|
|
|
@@ -146,11 +145,20 @@ def section_restart_events(out: output.Output) -> None:
|
|
|
146
145
|
|
|
147
146
|
def section_model_api(out: output.Output, args) -> None:
|
|
148
147
|
if not os.path.isfile(args.config):
|
|
148
|
+
out.item("模型 API: 配置文件未找到")
|
|
149
|
+
out.set_data("model_api_status", {
|
|
150
|
+
"found": False, "reason": "config_not_found", "checked": args.config,
|
|
151
|
+
})
|
|
149
152
|
return
|
|
150
153
|
try:
|
|
151
154
|
with open(args.config) as f:
|
|
152
155
|
cfg = json.load(f)
|
|
153
|
-
except
|
|
156
|
+
except (OSError, json.JSONDecodeError) as e:
|
|
157
|
+
out.item(f"模型 API: 配置读取失败 ({type(e).__name__})")
|
|
158
|
+
out.set_data("model_api_status", {
|
|
159
|
+
"found": False, "reason": "config_unreadable",
|
|
160
|
+
"checked": args.config, "error": str(e)[:200],
|
|
161
|
+
})
|
|
154
162
|
return
|
|
155
163
|
models = cfg.get("models", {}) or {}
|
|
156
164
|
all_cfgs = {}
|
|
@@ -290,8 +298,12 @@ def section_ws_lifecycle(out: output.Output, app_log: str) -> None:
|
|
|
290
298
|
else:
|
|
291
299
|
continue
|
|
292
300
|
events.append((ts_dt, ts_str, account, kind, msg))
|
|
293
|
-
except OSError:
|
|
294
|
-
out.item("Channel WS: 读取应用日志失败")
|
|
301
|
+
except OSError as e:
|
|
302
|
+
out.item(f"Channel WS: 读取应用日志失败 ({type(e).__name__})")
|
|
303
|
+
out.set_data("ws_summary_status", {
|
|
304
|
+
"found": False, "reason": "log_unreadable",
|
|
305
|
+
"checked": app_log, "error": str(e)[:200],
|
|
306
|
+
})
|
|
295
307
|
return
|
|
296
308
|
|
|
297
309
|
if not events and not expired:
|
|
@@ -564,7 +576,12 @@ def section_gateway_errors(out: output.Output, app_log: str) -> None:
|
|
|
564
576
|
continue
|
|
565
577
|
kind, code, reason = r
|
|
566
578
|
events.append((ts, kind, code, reason or "(no reason)"))
|
|
567
|
-
except OSError:
|
|
579
|
+
except OSError as e:
|
|
580
|
+
out.item(f"Gateway 错误码: 读取应用日志失败 ({type(e).__name__})")
|
|
581
|
+
out.set_data("gateway_errors_status", {
|
|
582
|
+
"found": False, "reason": "log_unreadable",
|
|
583
|
+
"checked": app_log, "error": str(e)[:200],
|
|
584
|
+
})
|
|
568
585
|
return
|
|
569
586
|
|
|
570
587
|
if not events:
|
|
@@ -614,7 +631,6 @@ def section_gateway_errors(out: output.Output, app_log: str) -> None:
|
|
|
614
631
|
def main() -> int:
|
|
615
632
|
parser = cli.build_common_parser(
|
|
616
633
|
description="模块 4:Gateway 状态采集",
|
|
617
|
-
prog="04_gateway",
|
|
618
634
|
)
|
|
619
635
|
args = parser.parse_args()
|
|
620
636
|
|
|
@@ -622,6 +638,7 @@ def main() -> int:
|
|
|
622
638
|
out.section("模块 4:Gateway 状态")
|
|
623
639
|
|
|
624
640
|
port = 18789
|
|
641
|
+
port_source = "default"
|
|
625
642
|
if os.path.isfile(args.config):
|
|
626
643
|
try:
|
|
627
644
|
with open(args.config) as f:
|
|
@@ -629,8 +646,13 @@ def main() -> int:
|
|
|
629
646
|
cp = cfg.get("gateway", {}).get("port")
|
|
630
647
|
if cp:
|
|
631
648
|
port = int(cp)
|
|
632
|
-
|
|
633
|
-
|
|
649
|
+
port_source = "config"
|
|
650
|
+
except (OSError, json.JSONDecodeError, ValueError) as e:
|
|
651
|
+
out.set_data("port_source_status", {
|
|
652
|
+
"found": False, "reason": "config_unreadable",
|
|
653
|
+
"checked": args.config, "error": str(e)[:200],
|
|
654
|
+
})
|
|
655
|
+
out.set_data("port_source", port_source)
|
|
634
656
|
|
|
635
657
|
section_process_port(out, args, port)
|
|
636
658
|
section_restart_events(out)
|
package/diag/05_recent_errors.py
CHANGED
|
@@ -79,21 +79,25 @@ def render_log_line(line: str, max_len: int = 300) -> str:
|
|
|
79
79
|
return line
|
|
80
80
|
|
|
81
81
|
|
|
82
|
-
def collect_error_lines(log_files: List[str])
|
|
82
|
+
def collect_error_lines(log_files: List[str]):
|
|
83
|
+
"""Returns (matched_lines, unreadable_files). One unreadable file does not
|
|
84
|
+
abort the whole scan, but we tell the caller which paths failed."""
|
|
83
85
|
out: List[str] = []
|
|
86
|
+
unreadable: List[dict] = []
|
|
84
87
|
for lf in log_files:
|
|
85
88
|
try:
|
|
86
89
|
with open(lf, errors="replace") as f:
|
|
87
90
|
for ln in f:
|
|
88
91
|
if _ERR_RE.search(ln):
|
|
89
92
|
out.append(ln.rstrip("\n"))
|
|
90
|
-
except OSError:
|
|
91
|
-
|
|
92
|
-
return out
|
|
93
|
+
except OSError as e:
|
|
94
|
+
unreadable.append({"path": lf, "error": f"{type(e).__name__}: {e}"})
|
|
95
|
+
return out, unreadable
|
|
93
96
|
|
|
94
97
|
|
|
95
|
-
def collect_api_errors(log_files: List[str])
|
|
98
|
+
def collect_api_errors(log_files: List[str]):
|
|
96
99
|
out: List[str] = []
|
|
100
|
+
unreadable: List[dict] = []
|
|
97
101
|
for lf in log_files:
|
|
98
102
|
try:
|
|
99
103
|
with open(lf, errors="replace") as f:
|
|
@@ -107,9 +111,9 @@ def collect_api_errors(log_files: List[str]) -> List[str]:
|
|
|
107
111
|
if _API_EXCLUDE_TXT_RE.search(ln):
|
|
108
112
|
continue
|
|
109
113
|
out.append(ln.rstrip("\n"))
|
|
110
|
-
except OSError:
|
|
111
|
-
|
|
112
|
-
return out
|
|
114
|
+
except OSError as e:
|
|
115
|
+
unreadable.append({"path": lf, "error": f"{type(e).__name__}: {e}"})
|
|
116
|
+
return out, unreadable
|
|
113
117
|
|
|
114
118
|
|
|
115
119
|
def journalctl_errors() -> str:
|
|
@@ -152,17 +156,21 @@ def tool_errors_from_session(session_path: str):
|
|
|
152
156
|
msg = obj.get("message", {}) or {}
|
|
153
157
|
if msg.get("isError"):
|
|
154
158
|
counts[msg.get("toolName", "unknown")] += 1
|
|
155
|
-
except
|
|
156
|
-
|
|
159
|
+
except (json.JSONDecodeError, ValueError):
|
|
160
|
+
# Expected: session.jsonl can have malformed lines from
|
|
161
|
+
# interrupted writes; skip and keep counting.
|
|
162
|
+
continue
|
|
157
163
|
except OSError:
|
|
158
|
-
|
|
164
|
+
# Session file disappeared between glob() and open(). Caller already
|
|
165
|
+
# falls back to "no recent session"; reporting per-file unreadable
|
|
166
|
+
# would mostly add noise here.
|
|
167
|
+
return counts
|
|
159
168
|
return counts
|
|
160
169
|
|
|
161
170
|
|
|
162
171
|
def main() -> int:
|
|
163
172
|
parser = cli.build_common_parser(
|
|
164
173
|
description="模块 5:采集近期错误日志",
|
|
165
|
-
prog="05_recent_errors",
|
|
166
174
|
)
|
|
167
175
|
args = parser.parse_args()
|
|
168
176
|
|
|
@@ -187,8 +195,10 @@ def main() -> int:
|
|
|
187
195
|
out.line("")
|
|
188
196
|
|
|
189
197
|
if logs:
|
|
190
|
-
err_lines = collect_error_lines(logs)
|
|
198
|
+
err_lines, err_unreadable = collect_error_lines(logs)
|
|
191
199
|
out.set_data("app_error_count", len(err_lines))
|
|
200
|
+
if err_unreadable:
|
|
201
|
+
out.set_data("app_log_unreadable", err_unreadable)
|
|
192
202
|
if err_lines:
|
|
193
203
|
out.item(f"应用日志 ERROR 级别: {len(err_lines)} 条 — Gateway 运行时报错,包括工具失败、模型异常等")
|
|
194
204
|
rendered = []
|
|
@@ -202,7 +212,7 @@ def main() -> int:
|
|
|
202
212
|
else:
|
|
203
213
|
out.item("应用日志 ERROR 级别: 0 条 — Gateway 运行时报错")
|
|
204
214
|
|
|
205
|
-
api_lines = collect_api_errors(logs)
|
|
215
|
+
api_lines, _api_unreadable = collect_api_errors(logs)
|
|
206
216
|
out.set_data("api_error_count", len(api_lines))
|
|
207
217
|
if api_lines:
|
|
208
218
|
out.item(f"模型 API HTTP 错误: {len(api_lines)} 条 ")
|
package/diag/06_cron_jobs.py
CHANGED
|
@@ -13,11 +13,12 @@ import sys
|
|
|
13
13
|
import time
|
|
14
14
|
from collections import Counter, deque
|
|
15
15
|
from pathlib import Path
|
|
16
|
-
from typing import Optional
|
|
17
16
|
|
|
18
17
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
19
18
|
|
|
20
|
-
from ocdiag import cli, output
|
|
19
|
+
from ocdiag import cli, output
|
|
20
|
+
from ocdiag.timeutil import fmt_age, fmt_ts
|
|
21
|
+
from ocdiag.tokens import fmt_tokens, percentile
|
|
21
22
|
|
|
22
23
|
try:
|
|
23
24
|
from croniter import croniter # type: ignore
|
|
@@ -26,15 +27,6 @@ except ImportError:
|
|
|
26
27
|
HAS_CRONITER = False
|
|
27
28
|
|
|
28
29
|
|
|
29
|
-
def fmt_ts(ms):
|
|
30
|
-
if not ms:
|
|
31
|
-
return "?"
|
|
32
|
-
try:
|
|
33
|
-
return datetime.datetime.fromtimestamp(ms / 1000).strftime("%Y-%m-%d %H:%M:%S")
|
|
34
|
-
except Exception:
|
|
35
|
-
return str(ms)
|
|
36
|
-
|
|
37
|
-
|
|
38
30
|
def fmt_duration(ms):
|
|
39
31
|
if ms is None:
|
|
40
32
|
return "?"
|
|
@@ -46,24 +38,6 @@ def fmt_duration(ms):
|
|
|
46
38
|
return f"{s/3600:.1f}h"
|
|
47
39
|
|
|
48
40
|
|
|
49
|
-
def fmt_age(ms_delta):
|
|
50
|
-
s = abs(ms_delta) / 1000
|
|
51
|
-
if s < 60:
|
|
52
|
-
return f"{s:.0f}秒"
|
|
53
|
-
if s < 3600:
|
|
54
|
-
return f"{s/60:.0f}分钟"
|
|
55
|
-
if s < 86400:
|
|
56
|
-
return f"{s/3600:.1f}小时"
|
|
57
|
-
return f"{s/86400:.1f}天"
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
def percentile(sorted_list, p):
|
|
61
|
-
if not sorted_list:
|
|
62
|
-
return None
|
|
63
|
-
k = max(0, min(len(sorted_list) - 1, int(len(sorted_list) * p)))
|
|
64
|
-
return sorted_list[k]
|
|
65
|
-
|
|
66
|
-
|
|
67
41
|
def format_schedule(sched):
|
|
68
42
|
k = sched.get("kind", "?")
|
|
69
43
|
if k == "cron":
|
|
@@ -121,16 +95,6 @@ def load_runs(runs_dir, jid):
|
|
|
121
95
|
return out
|
|
122
96
|
|
|
123
97
|
|
|
124
|
-
def fmt_k(n):
|
|
125
|
-
if n is None:
|
|
126
|
-
return "?"
|
|
127
|
-
if n >= 1_000_000:
|
|
128
|
-
return f"{n/1_000_000:.1f}M"
|
|
129
|
-
if n >= 1000:
|
|
130
|
-
return f"{n/1000:.1f}K"
|
|
131
|
-
return str(n)
|
|
132
|
-
|
|
133
|
-
|
|
134
98
|
def extract_usage(r):
|
|
135
99
|
u = r.get("usage")
|
|
136
100
|
if not u and isinstance(r.get("result"), dict):
|
|
@@ -463,7 +427,7 @@ def section_jobs(out: output.Output, jobs_file: str, state_file: str, runs_dir:
|
|
|
463
427
|
cost_sum += cost
|
|
464
428
|
has_cost = True
|
|
465
429
|
if has_usage:
|
|
466
|
-
line = f" tokens(最近{len(recent)}次): in={
|
|
430
|
+
line = f" tokens(最近{len(recent)}次): in={fmt_tokens(input_sum)} out={fmt_tokens(output_sum)}"
|
|
467
431
|
if has_cost:
|
|
468
432
|
line += f" | cost=${cost_sum:.4f}"
|
|
469
433
|
out.item(line)
|
|
@@ -671,7 +635,6 @@ def section_system_crontab(out: output.Output) -> None:
|
|
|
671
635
|
def main() -> int:
|
|
672
636
|
parser = cli.build_common_parser(
|
|
673
637
|
description="模块 6:定时任务采集",
|
|
674
|
-
prog="06_cron_jobs",
|
|
675
638
|
)
|
|
676
639
|
args = parser.parse_args()
|
|
677
640
|
|