openclaw-diag-cli 0.1.3 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +84 -71
- package/bin/openclaw-diag.js +65 -176
- package/diag/01_sys_health.py +0 -2
- package/diag/02_environment.py +32 -6
- package/diag/03_configuration.py +4 -1
- package/diag/04_gateway.py +30 -8
- package/diag/05_recent_errors.py +24 -14
- package/diag/06_cron_jobs.py +4 -41
- package/diag/07_performance.py +114 -42
- package/diag/08_sessions.py +2 -54
- package/diag/09_plugin_diag.py +52 -25
- package/diag/10_shell_history.py +28 -10
- package/lib/bundle.py +6 -13
- package/ocdiag/__init__.py +1 -1
- package/ocdiag/cli.py +16 -1
- package/ocdiag/dispatcher.py +140 -53
- package/ocdiag/doctor.py +162 -0
- package/ocdiag/jsonlog.py +0 -5
- package/ocdiag/paths.py +0 -1
- package/ocdiag/recent_logs.py +0 -3
- package/ocdiag/sensitive.py +95 -1
- package/ocdiag/timeutil.py +0 -11
- package/ocdiag/tokens.py +0 -4
- package/package.json +2 -2
- package/tools/oc_session_extract.py +75 -7
- package/tools/oc_session_trace.py +31 -9
package/diag/03_configuration.py
CHANGED
|
@@ -85,7 +85,6 @@ def emit_config(out: output.Output, data: list, obj, prefix: str = "") -> None:
|
|
|
85
85
|
def main() -> int:
|
|
86
86
|
parser = cli.build_common_parser(
|
|
87
87
|
description="模块 3:采集 OpenClaw 配置(含敏感字段脱敏)",
|
|
88
|
-
prog="03_configuration",
|
|
89
88
|
)
|
|
90
89
|
args = parser.parse_args()
|
|
91
90
|
|
|
@@ -95,6 +94,10 @@ def main() -> int:
|
|
|
95
94
|
config_path = args.config
|
|
96
95
|
if not os.path.isfile(config_path):
|
|
97
96
|
out.item(f"配置文件未找到: {config_path}")
|
|
97
|
+
out.line(" 下一步:")
|
|
98
|
+
out.line(" 1) 确认 OpenClaw 已经初始化(运行过 `openclaw` 即会生成配置)")
|
|
99
|
+
out.line(" 2) 用 OPENCLAW_CONFIG=/path/to/openclaw.json 或 --config 指向正确路径")
|
|
100
|
+
out.line(" 3) 在容器/远端诊断时,用 OPENCLAW_HOME=/path 整体覆盖")
|
|
98
101
|
out.evidence(config_path, "<文件缺失>")
|
|
99
102
|
out.set_data("config_path", config_path)
|
|
100
103
|
out.set_data("found", False)
|
package/diag/04_gateway.py
CHANGED
|
@@ -11,7 +11,6 @@ import sys
|
|
|
11
11
|
from collections import defaultdict
|
|
12
12
|
from datetime import datetime
|
|
13
13
|
from pathlib import Path
|
|
14
|
-
from typing import List, Optional
|
|
15
14
|
|
|
16
15
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
17
16
|
|
|
@@ -146,11 +145,20 @@ def section_restart_events(out: output.Output) -> None:
|
|
|
146
145
|
|
|
147
146
|
def section_model_api(out: output.Output, args) -> None:
|
|
148
147
|
if not os.path.isfile(args.config):
|
|
148
|
+
out.item("模型 API: 配置文件未找到")
|
|
149
|
+
out.set_data("model_api_status", {
|
|
150
|
+
"found": False, "reason": "config_not_found", "checked": args.config,
|
|
151
|
+
})
|
|
149
152
|
return
|
|
150
153
|
try:
|
|
151
154
|
with open(args.config) as f:
|
|
152
155
|
cfg = json.load(f)
|
|
153
|
-
except
|
|
156
|
+
except (OSError, json.JSONDecodeError) as e:
|
|
157
|
+
out.item(f"模型 API: 配置读取失败 ({type(e).__name__})")
|
|
158
|
+
out.set_data("model_api_status", {
|
|
159
|
+
"found": False, "reason": "config_unreadable",
|
|
160
|
+
"checked": args.config, "error": str(e)[:200],
|
|
161
|
+
})
|
|
154
162
|
return
|
|
155
163
|
models = cfg.get("models", {}) or {}
|
|
156
164
|
all_cfgs = {}
|
|
@@ -290,8 +298,12 @@ def section_ws_lifecycle(out: output.Output, app_log: str) -> None:
|
|
|
290
298
|
else:
|
|
291
299
|
continue
|
|
292
300
|
events.append((ts_dt, ts_str, account, kind, msg))
|
|
293
|
-
except OSError:
|
|
294
|
-
out.item("Channel WS: 读取应用日志失败")
|
|
301
|
+
except OSError as e:
|
|
302
|
+
out.item(f"Channel WS: 读取应用日志失败 ({type(e).__name__})")
|
|
303
|
+
out.set_data("ws_summary_status", {
|
|
304
|
+
"found": False, "reason": "log_unreadable",
|
|
305
|
+
"checked": app_log, "error": str(e)[:200],
|
|
306
|
+
})
|
|
295
307
|
return
|
|
296
308
|
|
|
297
309
|
if not events and not expired:
|
|
@@ -564,7 +576,12 @@ def section_gateway_errors(out: output.Output, app_log: str) -> None:
|
|
|
564
576
|
continue
|
|
565
577
|
kind, code, reason = r
|
|
566
578
|
events.append((ts, kind, code, reason or "(no reason)"))
|
|
567
|
-
except OSError:
|
|
579
|
+
except OSError as e:
|
|
580
|
+
out.item(f"Gateway 错误码: 读取应用日志失败 ({type(e).__name__})")
|
|
581
|
+
out.set_data("gateway_errors_status", {
|
|
582
|
+
"found": False, "reason": "log_unreadable",
|
|
583
|
+
"checked": app_log, "error": str(e)[:200],
|
|
584
|
+
})
|
|
568
585
|
return
|
|
569
586
|
|
|
570
587
|
if not events:
|
|
@@ -614,7 +631,6 @@ def section_gateway_errors(out: output.Output, app_log: str) -> None:
|
|
|
614
631
|
def main() -> int:
|
|
615
632
|
parser = cli.build_common_parser(
|
|
616
633
|
description="模块 4:Gateway 状态采集",
|
|
617
|
-
prog="04_gateway",
|
|
618
634
|
)
|
|
619
635
|
args = parser.parse_args()
|
|
620
636
|
|
|
@@ -622,6 +638,7 @@ def main() -> int:
|
|
|
622
638
|
out.section("模块 4:Gateway 状态")
|
|
623
639
|
|
|
624
640
|
port = 18789
|
|
641
|
+
port_source = "default"
|
|
625
642
|
if os.path.isfile(args.config):
|
|
626
643
|
try:
|
|
627
644
|
with open(args.config) as f:
|
|
@@ -629,8 +646,13 @@ def main() -> int:
|
|
|
629
646
|
cp = cfg.get("gateway", {}).get("port")
|
|
630
647
|
if cp:
|
|
631
648
|
port = int(cp)
|
|
632
|
-
|
|
633
|
-
|
|
649
|
+
port_source = "config"
|
|
650
|
+
except (OSError, json.JSONDecodeError, ValueError) as e:
|
|
651
|
+
out.set_data("port_source_status", {
|
|
652
|
+
"found": False, "reason": "config_unreadable",
|
|
653
|
+
"checked": args.config, "error": str(e)[:200],
|
|
654
|
+
})
|
|
655
|
+
out.set_data("port_source", port_source)
|
|
634
656
|
|
|
635
657
|
section_process_port(out, args, port)
|
|
636
658
|
section_restart_events(out)
|
package/diag/05_recent_errors.py
CHANGED
|
@@ -79,21 +79,25 @@ def render_log_line(line: str, max_len: int = 300) -> str:
|
|
|
79
79
|
return line
|
|
80
80
|
|
|
81
81
|
|
|
82
|
-
def collect_error_lines(log_files: List[str]) -> List[str]:
|
|
82
|
+
def collect_error_lines(log_files: List[str]):
|
|
83
|
+
"""Returns (matched_lines, unreadable_files). One unreadable file does not
|
|
84
|
+
abort the whole scan, but we tell the caller which paths failed."""
|
|
83
85
|
out: List[str] = []
|
|
86
|
+
unreadable: List[dict] = []
|
|
84
87
|
for lf in log_files:
|
|
85
88
|
try:
|
|
86
89
|
with open(lf, errors="replace") as f:
|
|
87
90
|
for ln in f:
|
|
88
91
|
if _ERR_RE.search(ln):
|
|
89
92
|
out.append(ln.rstrip("\n"))
|
|
90
|
-
except OSError:
|
|
91
|
-
|
|
92
|
-
return out
|
|
93
|
+
except OSError as e:
|
|
94
|
+
unreadable.append({"path": lf, "error": f"{type(e).__name__}: {e}"})
|
|
95
|
+
return out, unreadable
|
|
93
96
|
|
|
94
97
|
|
|
95
|
-
def collect_api_errors(log_files: List[str]) -> List[str]:
|
|
98
|
+
def collect_api_errors(log_files: List[str]):
|
|
96
99
|
out: List[str] = []
|
|
100
|
+
unreadable: List[dict] = []
|
|
97
101
|
for lf in log_files:
|
|
98
102
|
try:
|
|
99
103
|
with open(lf, errors="replace") as f:
|
|
@@ -107,9 +111,9 @@ def collect_api_errors(log_files: List[str]) -> List[str]:
|
|
|
107
111
|
if _API_EXCLUDE_TXT_RE.search(ln):
|
|
108
112
|
continue
|
|
109
113
|
out.append(ln.rstrip("\n"))
|
|
110
|
-
except OSError:
|
|
111
|
-
|
|
112
|
-
return out
|
|
114
|
+
except OSError as e:
|
|
115
|
+
unreadable.append({"path": lf, "error": f"{type(e).__name__}: {e}"})
|
|
116
|
+
return out, unreadable
|
|
113
117
|
|
|
114
118
|
|
|
115
119
|
def journalctl_errors() -> str:
|
|
@@ -152,17 +156,21 @@ def tool_errors_from_session(session_path: str):
|
|
|
152
156
|
msg = obj.get("message", {}) or {}
|
|
153
157
|
if msg.get("isError"):
|
|
154
158
|
counts[msg.get("toolName", "unknown")] += 1
|
|
155
|
-
except
|
|
156
|
-
|
|
159
|
+
except (json.JSONDecodeError, ValueError):
|
|
160
|
+
# Expected: session.jsonl can have malformed lines from
|
|
161
|
+
# interrupted writes; skip and keep counting.
|
|
162
|
+
continue
|
|
157
163
|
except OSError:
|
|
158
|
-
|
|
164
|
+
# Session file disappeared between glob() and open(). Caller already
|
|
165
|
+
# falls back to "no recent session"; reporting per-file unreadable
|
|
166
|
+
# would mostly add noise here.
|
|
167
|
+
return counts
|
|
159
168
|
return counts
|
|
160
169
|
|
|
161
170
|
|
|
162
171
|
def main() -> int:
|
|
163
172
|
parser = cli.build_common_parser(
|
|
164
173
|
description="模块 5:采集近期错误日志",
|
|
165
|
-
prog="05_recent_errors",
|
|
166
174
|
)
|
|
167
175
|
args = parser.parse_args()
|
|
168
176
|
|
|
@@ -187,8 +195,10 @@ def main() -> int:
|
|
|
187
195
|
out.line("")
|
|
188
196
|
|
|
189
197
|
if logs:
|
|
190
|
-
err_lines = collect_error_lines(logs)
|
|
198
|
+
err_lines, err_unreadable = collect_error_lines(logs)
|
|
191
199
|
out.set_data("app_error_count", len(err_lines))
|
|
200
|
+
if err_unreadable:
|
|
201
|
+
out.set_data("app_log_unreadable", err_unreadable)
|
|
192
202
|
if err_lines:
|
|
193
203
|
out.item(f"应用日志 ERROR 级别: {len(err_lines)} 条 — Gateway 运行时报错,包括工具失败、模型异常等")
|
|
194
204
|
rendered = []
|
|
@@ -202,7 +212,7 @@ def main() -> int:
|
|
|
202
212
|
else:
|
|
203
213
|
out.item("应用日志 ERROR 级别: 0 条 — Gateway 运行时报错")
|
|
204
214
|
|
|
205
|
-
api_lines = collect_api_errors(logs)
|
|
215
|
+
api_lines, _api_unreadable = collect_api_errors(logs)
|
|
206
216
|
out.set_data("api_error_count", len(api_lines))
|
|
207
217
|
if api_lines:
|
|
208
218
|
out.item(f"模型 API HTTP 错误: {len(api_lines)} 条 ")
|
package/diag/06_cron_jobs.py
CHANGED
|
@@ -13,11 +13,12 @@ import sys
|
|
|
13
13
|
import time
|
|
14
14
|
from collections import Counter, deque
|
|
15
15
|
from pathlib import Path
|
|
16
|
-
from typing import Optional
|
|
17
16
|
|
|
18
17
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
19
18
|
|
|
20
|
-
from ocdiag import cli, output
|
|
19
|
+
from ocdiag import cli, output
|
|
20
|
+
from ocdiag.timeutil import fmt_age, fmt_ts
|
|
21
|
+
from ocdiag.tokens import fmt_tokens, percentile
|
|
21
22
|
|
|
22
23
|
try:
|
|
23
24
|
from croniter import croniter # type: ignore
|
|
@@ -26,15 +27,6 @@ except ImportError:
|
|
|
26
27
|
HAS_CRONITER = False
|
|
27
28
|
|
|
28
29
|
|
|
29
|
-
def fmt_ts(ms):
|
|
30
|
-
if not ms:
|
|
31
|
-
return "?"
|
|
32
|
-
try:
|
|
33
|
-
return datetime.datetime.fromtimestamp(ms / 1000).strftime("%Y-%m-%d %H:%M:%S")
|
|
34
|
-
except Exception:
|
|
35
|
-
return str(ms)
|
|
36
|
-
|
|
37
|
-
|
|
38
30
|
def fmt_duration(ms):
|
|
39
31
|
if ms is None:
|
|
40
32
|
return "?"
|
|
@@ -46,24 +38,6 @@ def fmt_duration(ms):
|
|
|
46
38
|
return f"{s/3600:.1f}h"
|
|
47
39
|
|
|
48
40
|
|
|
49
|
-
def fmt_age(ms_delta):
|
|
50
|
-
s = abs(ms_delta) / 1000
|
|
51
|
-
if s < 60:
|
|
52
|
-
return f"{s:.0f}秒"
|
|
53
|
-
if s < 3600:
|
|
54
|
-
return f"{s/60:.0f}分钟"
|
|
55
|
-
if s < 86400:
|
|
56
|
-
return f"{s/3600:.1f}小时"
|
|
57
|
-
return f"{s/86400:.1f}天"
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
def percentile(sorted_list, p):
|
|
61
|
-
if not sorted_list:
|
|
62
|
-
return None
|
|
63
|
-
k = max(0, min(len(sorted_list) - 1, int(len(sorted_list) * p)))
|
|
64
|
-
return sorted_list[k]
|
|
65
|
-
|
|
66
|
-
|
|
67
41
|
def format_schedule(sched):
|
|
68
42
|
k = sched.get("kind", "?")
|
|
69
43
|
if k == "cron":
|
|
@@ -121,16 +95,6 @@ def load_runs(runs_dir, jid):
|
|
|
121
95
|
return out
|
|
122
96
|
|
|
123
97
|
|
|
124
|
-
def fmt_k(n):
|
|
125
|
-
if n is None:
|
|
126
|
-
return "?"
|
|
127
|
-
if n >= 1_000_000:
|
|
128
|
-
return f"{n/1_000_000:.1f}M"
|
|
129
|
-
if n >= 1000:
|
|
130
|
-
return f"{n/1000:.1f}K"
|
|
131
|
-
return str(n)
|
|
132
|
-
|
|
133
|
-
|
|
134
98
|
def extract_usage(r):
|
|
135
99
|
u = r.get("usage")
|
|
136
100
|
if not u and isinstance(r.get("result"), dict):
|
|
@@ -463,7 +427,7 @@ def section_jobs(out: output.Output, jobs_file: str, state_file: str, runs_dir:
|
|
|
463
427
|
cost_sum += cost
|
|
464
428
|
has_cost = True
|
|
465
429
|
if has_usage:
|
|
466
|
-
line = f"    tokens(最近{len(recent)}次): in={fmt_k(input_sum)} out={fmt_k(output_sum)}"
|
|
430
|
+
line = f" tokens(最近{len(recent)}次): in={fmt_tokens(input_sum)} out={fmt_tokens(output_sum)}"
|
|
467
431
|
if has_cost:
|
|
468
432
|
line += f" | cost=${cost_sum:.4f}"
|
|
469
433
|
out.item(line)
|
|
@@ -671,7 +635,6 @@ def section_system_crontab(out: output.Output) -> None:
|
|
|
671
635
|
def main() -> int:
|
|
672
636
|
parser = cli.build_common_parser(
|
|
673
637
|
description="模块 6:定时任务采集",
|
|
674
|
-
prog="06_cron_jobs",
|
|
675
638
|
)
|
|
676
639
|
args = parser.parse_args()
|
|
677
640
|
|
package/diag/07_performance.py
CHANGED
|
@@ -7,7 +7,6 @@ import glob
|
|
|
7
7
|
import json
|
|
8
8
|
import os
|
|
9
9
|
import sys
|
|
10
|
-
import tempfile
|
|
11
10
|
from collections import defaultdict
|
|
12
11
|
from datetime import datetime, timezone, timedelta
|
|
13
12
|
from pathlib import Path
|
|
@@ -15,45 +14,13 @@ from pathlib import Path
|
|
|
15
14
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
16
15
|
|
|
17
16
|
from ocdiag import cli, output
|
|
17
|
+
from ocdiag.timeutil import parse_msg_ts, parse_obj_ts
|
|
18
|
+
from ocdiag.tokens import fmt_tokens, pct
|
|
18
19
|
|
|
19
20
|
|
|
20
21
|
NORMAL_STOPS = {"stop", "end_turn", "toolUse", "tool_calls", ""}
|
|
21
22
|
|
|
22
23
|
|
|
23
|
-
def parse_obj_ts(ts_str):
|
|
24
|
-
if not ts_str:
|
|
25
|
-
return None
|
|
26
|
-
try:
|
|
27
|
-
return datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
|
|
28
|
-
except Exception:
|
|
29
|
-
return None
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def parse_msg_ts(ms):
|
|
33
|
-
if ms is None:
|
|
34
|
-
return None
|
|
35
|
-
try:
|
|
36
|
-
return datetime.fromtimestamp(int(ms) / 1000, tz=timezone.utc)
|
|
37
|
-
except Exception:
|
|
38
|
-
return None
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
def pct(sorted_vals, p):
|
|
42
|
-
if not sorted_vals:
|
|
43
|
-
return 0.0
|
|
44
|
-
n = len(sorted_vals)
|
|
45
|
-
idx = min(n - 1, int(n * p))
|
|
46
|
-
return sorted_vals[idx]
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
def fmt_tokens(n):
|
|
50
|
-
if n >= 1_000_000:
|
|
51
|
-
return f"{n/1_000_000:.1f}M"
|
|
52
|
-
if n >= 1_000:
|
|
53
|
-
return f"{n/1_000:.1f}K"
|
|
54
|
-
return str(n)
|
|
55
|
-
|
|
56
|
-
|
|
57
24
|
def fmt_args(tool_name, tc_args, max_len=100):
|
|
58
25
|
if isinstance(tc_args, str):
|
|
59
26
|
try:
|
|
@@ -432,11 +399,18 @@ def render(out: output.Output, data, file_count):
|
|
|
432
399
|
else:
|
|
433
400
|
for i, (sec, val, hint) in enumerate(bottleneck_items):
|
|
434
401
|
out.item(f"#{i+1}: {sec}(P95={val:.1f}s, {hint})")
|
|
402
|
+
out.set_data("bottleneck", {
|
|
403
|
+
"model_p95": round(model_p95, 3),
|
|
404
|
+
"tool_p95": round(tool_p95, 3),
|
|
405
|
+
"model_top": model_top,
|
|
406
|
+
"tool_top": tool_top,
|
|
407
|
+
})
|
|
435
408
|
|
|
436
409
|
out.subsection("模型性能")
|
|
437
410
|
out.item(f"数据来源: 最近 {file_count} 个 session 文件")
|
|
438
411
|
out.line("")
|
|
439
412
|
model_stats = data["model_stats"]
|
|
413
|
+
models_payload = {}
|
|
440
414
|
if not model_stats:
|
|
441
415
|
out.item("最近 Session 中未发现模型使用数据")
|
|
442
416
|
else:
|
|
@@ -481,9 +455,29 @@ def render(out: output.Output, data, file_count):
|
|
|
481
455
|
stops_str = " ".join(f"{k}:{v}" for k, v in sorted(stops.items(), key=lambda x: -x[1]))
|
|
482
456
|
out.item(f" stopReasons: {stops_str}" if stops_str else " stopReasons: (none)")
|
|
483
457
|
out.line("")
|
|
458
|
+
models_payload[model_key] = {
|
|
459
|
+
"calls": calls,
|
|
460
|
+
"p50_s": round(p50, 3),
|
|
461
|
+
"p95_s": round(p95, 3),
|
|
462
|
+
"max_s": round(mx, 3),
|
|
463
|
+
"throughput_tok_s": (
|
|
464
|
+
None if s["output"] == 0 or total_dur <= 0
|
|
465
|
+
else round(s["output"] / total_dur, 1)
|
|
466
|
+
),
|
|
467
|
+
"input_tokens": s["input"],
|
|
468
|
+
"output_tokens": s["output"],
|
|
469
|
+
"cache_read_tokens": s["cache_read"],
|
|
470
|
+
"cache_write_tokens": s["cache_write"],
|
|
471
|
+
"cost_usd": round(s["cost"], 6),
|
|
472
|
+
"success_rate_pct": round(success, 1),
|
|
473
|
+
"stop_reasons": dict(stops),
|
|
474
|
+
}
|
|
475
|
+
out.set_data("models", models_payload)
|
|
476
|
+
out.set_data("session_files_analyzed", file_count)
|
|
484
477
|
|
|
485
478
|
out.subsection("工具性能(Top 10 by 调用量)")
|
|
486
479
|
timed_tools = {n: s for n, s in data["tool_stats"].items() if s["durations"]}
|
|
480
|
+
tools_payload = {}
|
|
487
481
|
if not timed_tools:
|
|
488
482
|
out.item("(无工具调用数据)")
|
|
489
483
|
else:
|
|
@@ -491,8 +485,11 @@ def render(out: output.Output, data, file_count):
|
|
|
491
485
|
for name, s in ranked:
|
|
492
486
|
durs = sorted(s["durations"])
|
|
493
487
|
calls = s["calls"]
|
|
488
|
+
p50 = pct(durs, 0.50)
|
|
489
|
+
p95 = pct(durs, 0.95)
|
|
490
|
+
mx = durs[-1]
|
|
494
491
|
err_rate = (s["errors"] / calls * 100) if calls else 0.0
|
|
495
|
-
dur_str = f"P50={
|
|
492
|
+
dur_str = f"P50={p50:.3f}s P95={p95:.3f}s Max={mx:.3f}s"
|
|
496
493
|
out.item(f"{name}: {calls} 次 | {dur_str} | 错误 {err_rate:.0f}%")
|
|
497
494
|
timed = [r for r in s["records"] if r["dur"] is not None]
|
|
498
495
|
timed.sort(key=lambda r: r["dur"], reverse=True)
|
|
@@ -511,6 +508,15 @@ def render(out: output.Output, data, file_count):
|
|
|
511
508
|
tail = f", {r['err_brief']}" if r["err_brief"] else ""
|
|
512
509
|
out.item(f" 失败: {args} (error, {dur_txt}{tail})")
|
|
513
510
|
err_shown += 1
|
|
511
|
+
tools_payload[name] = {
|
|
512
|
+
"calls": calls,
|
|
513
|
+
"errors": s["errors"],
|
|
514
|
+
"error_rate_pct": round(err_rate, 1),
|
|
515
|
+
"p50_s": round(p50, 3),
|
|
516
|
+
"p95_s": round(p95, 3),
|
|
517
|
+
"max_s": round(mx, 3),
|
|
518
|
+
}
|
|
519
|
+
out.set_data("tools", tools_payload)
|
|
514
520
|
|
|
515
521
|
out.subsection("慢调用 Top 20")
|
|
516
522
|
slow = sorted(data["slow_calls_top"], key=lambda x: x[0], reverse=True)
|
|
@@ -529,12 +535,17 @@ def render(out: output.Output, data, file_count):
|
|
|
529
535
|
else:
|
|
530
536
|
for i, entry in enumerate(top20, 1):
|
|
531
537
|
out.item(f"[{i}] {entry[2]}")
|
|
538
|
+
out.set_data("slow_calls_top20", [
|
|
539
|
+
{"duration_s": round(e[0], 3), "kind": e[1], "summary": e[2]}
|
|
540
|
+
for e in top20
|
|
541
|
+
])
|
|
532
542
|
|
|
533
543
|
out.subsection("异常 stopReason — 模型非正常结束(如 error、中断)")
|
|
534
544
|
abnormal_stops = data["abnormal_stops"]
|
|
535
545
|
out.item(f"共 {len(abnormal_stops)} 条" + ("(无异常)" if not abnormal_stops else ""))
|
|
536
546
|
for s in abnormal_stops[:20]:
|
|
537
547
|
out.item(s)
|
|
548
|
+
out.set_data("abnormal_stops", abnormal_stops)
|
|
538
549
|
|
|
539
550
|
out.subsection("模型 API 错误分布")
|
|
540
551
|
api_err_total = sum(data["api_error_stats"].values())
|
|
@@ -548,9 +559,16 @@ def render(out: output.Output, data, file_count):
|
|
|
548
559
|
out.item("分布:")
|
|
549
560
|
for cat, n in sorted(data["api_error_stats"].items(), key=lambda kv: -kv[1]):
|
|
550
561
|
out.item(f" {cat}: {n}")
|
|
562
|
+
out.set_data("api_errors", {
|
|
563
|
+
"total_calls": api_total,
|
|
564
|
+
"error_count": api_err_total,
|
|
565
|
+
"error_rate_pct": round(api_err_total / api_total * 100, 2) if api_total else 0.0,
|
|
566
|
+
"by_category": dict(data["api_error_stats"]),
|
|
567
|
+
})
|
|
551
568
|
|
|
552
569
|
out.subsection("端到端消息延迟(user 发送 → assistant 最终响应)")
|
|
553
570
|
e2e = data["e2e_latencies"]
|
|
571
|
+
e2e_payload = {"count": 0}
|
|
554
572
|
if not e2e:
|
|
555
573
|
out.item("(数据不足,未发现 user→assistant 配对)")
|
|
556
574
|
else:
|
|
@@ -573,10 +591,19 @@ def render(out: output.Output, data, file_count):
|
|
|
573
591
|
n = bucket_counts[lbl]
|
|
574
592
|
pct_v = (n / total * 100) if total else 0.0
|
|
575
593
|
out.item(f" {lbl}: {n} ({pct_v:.1f}%)")
|
|
594
|
+
e2e_payload = {
|
|
595
|
+
"count": total,
|
|
596
|
+
"p50_s": round(p50, 3),
|
|
597
|
+
"p95_s": round(p95, 3),
|
|
598
|
+
"max_s": round(mx, 3),
|
|
599
|
+
"buckets": dict(bucket_counts),
|
|
600
|
+
}
|
|
601
|
+
out.set_data("e2e_latency", e2e_payload)
|
|
576
602
|
|
|
577
603
|
out.subsection("延迟 vs 上下文大小")
|
|
578
604
|
ctx_buckets_def = data["ctx_buckets_def"]
|
|
579
605
|
ctx_durs = data["ctx_bucket_durs"]
|
|
606
|
+
ctx_payload = {}
|
|
580
607
|
if not any(ctx_durs.get(l) for l, _ in ctx_buckets_def):
|
|
581
608
|
out.item("(数据不足)")
|
|
582
609
|
else:
|
|
@@ -585,13 +612,21 @@ def render(out: output.Output, data, file_count):
|
|
|
585
612
|
durs = sorted(ctx_durs.get(b_label, []))
|
|
586
613
|
if not durs:
|
|
587
614
|
out.line(f" {b_label:<14} {0:>8} {'-':>10} {'-':>10}")
|
|
615
|
+
ctx_payload[b_label] = {"count": 0, "p50_s": None, "p95_s": None}
|
|
588
616
|
continue
|
|
589
617
|
p50 = pct(durs, 0.50)
|
|
590
618
|
p95 = pct(durs, 0.95)
|
|
591
619
|
out.line(f" {b_label:<14} {len(durs):>8} {p50:>9.1f}s {p95:>9.1f}s")
|
|
620
|
+
ctx_payload[b_label] = {
|
|
621
|
+
"count": len(durs),
|
|
622
|
+
"p50_s": round(p50, 3),
|
|
623
|
+
"p95_s": round(p95, 3),
|
|
624
|
+
}
|
|
625
|
+
out.set_data("ctx_buckets", ctx_payload)
|
|
592
626
|
|
|
593
627
|
out.subsection("每日趋势(最近 7 天)")
|
|
594
628
|
daily_stats = data["daily_stats"]
|
|
629
|
+
daily_payload = []
|
|
595
630
|
if not daily_stats:
|
|
596
631
|
out.item("(数据不足)")
|
|
597
632
|
else:
|
|
@@ -602,12 +637,22 @@ def render(out: output.Output, data, file_count):
|
|
|
602
637
|
d = daily_stats.get(d_key)
|
|
603
638
|
if not d or d["calls"] == 0:
|
|
604
639
|
out.line(f" {d_key:<10} {0:>8} {'-':>10} {'-':>14}")
|
|
640
|
+
daily_payload.append({"date": d_key, "calls": 0,
|
|
641
|
+
"p50_s": None, "output_tokens": 0})
|
|
605
642
|
continue
|
|
606
643
|
durs = sorted(d["durs"])
|
|
607
644
|
p50 = pct(durs, 0.50) if durs else 0.0
|
|
608
645
|
out.line(f" {d_key:<10} {d['calls']:>8} {p50:>9.1f}s {fmt_tokens(d['output']):>14}")
|
|
646
|
+
daily_payload.append({
|
|
647
|
+
"date": d_key,
|
|
648
|
+
"calls": d["calls"],
|
|
649
|
+
"p50_s": round(p50, 3),
|
|
650
|
+
"output_tokens": d["output"],
|
|
651
|
+
})
|
|
652
|
+
out.set_data("daily_trend", daily_payload)
|
|
609
653
|
|
|
610
654
|
out.subsection("Cache 命中率")
|
|
655
|
+
cache_payload = {"total_calls": data["cache_total_calls"]}
|
|
611
656
|
if data["cache_total_calls"] == 0:
|
|
612
657
|
out.item("(无数据)")
|
|
613
658
|
else:
|
|
@@ -622,17 +667,34 @@ def render(out: output.Output, data, file_count):
|
|
|
622
667
|
f"cache_write: {fmt_tokens(data['cache_sum_cache_write'])}"
|
|
623
668
|
)
|
|
624
669
|
denom = data["cache_sum_input"] + data["cache_sum_cache_read"]
|
|
670
|
+
ratio_pct = None
|
|
625
671
|
if denom > 0:
|
|
626
672
|
ratio = data["cache_sum_cache_read"] / denom * 100
|
|
673
|
+
ratio_pct = round(ratio, 3)
|
|
627
674
|
out.item(
|
|
628
675
|
f"上下文 cache 占比: cacheRead/(input+cacheRead) = "
|
|
629
676
|
f"{ratio:.3f}% ({fmt_tokens(data['cache_sum_cache_read'])}/{fmt_tokens(denom)})"
|
|
630
677
|
)
|
|
678
|
+
cache_payload = {
|
|
679
|
+
"total_calls": data["cache_total_calls"],
|
|
680
|
+
"calls_with_cache_read": data["cache_calls_with_cache"],
|
|
681
|
+
"hit_rate_pct": round(hit_pct, 2),
|
|
682
|
+
"input_tokens": data["cache_sum_input"],
|
|
683
|
+
"cache_read_tokens": data["cache_sum_cache_read"],
|
|
684
|
+
"cache_write_tokens": data["cache_sum_cache_write"],
|
|
685
|
+
"ctx_cache_ratio_pct": ratio_pct,
|
|
686
|
+
}
|
|
687
|
+
out.set_data("cache_hit_rate", cache_payload)
|
|
631
688
|
|
|
632
689
|
out.subsection("工具错误明细")
|
|
633
690
|
tool_stats = data["tool_stats"]
|
|
634
691
|
err_total = sum(s["errors"] for s in tool_stats.values())
|
|
635
692
|
call_total = sum(s["calls"] for s in tool_stats.values())
|
|
693
|
+
tool_errors_payload = {
|
|
694
|
+
"total_errors": err_total,
|
|
695
|
+
"total_calls": call_total,
|
|
696
|
+
"by_tool": {},
|
|
697
|
+
}
|
|
636
698
|
if err_total == 0:
|
|
637
699
|
out.item(f"共 0 次错误 (总调用 {call_total} 次中)")
|
|
638
700
|
else:
|
|
@@ -642,13 +704,22 @@ def render(out: output.Output, data, file_count):
|
|
|
642
704
|
if s["errors"] == 0:
|
|
643
705
|
continue
|
|
644
706
|
out.line(f" {name} ({s['errors']}次):")
|
|
707
|
+
samples = []
|
|
645
708
|
for r in s["error_records"][:3]:
|
|
646
709
|
ts_label = r["ts"].strftime("%Y-%m-%d %H:%M:%S") if r["ts"] else "?"
|
|
647
710
|
brief = r["err_brief"] or "(无错误内容)"
|
|
648
711
|
out.line(f" {ts_label} | {brief[:100]}")
|
|
712
|
+
samples.append({"ts": ts_label, "brief": brief[:200]})
|
|
713
|
+
tool_errors_payload["by_tool"][name] = {
|
|
714
|
+
"errors": s["errors"],
|
|
715
|
+
"calls": s["calls"],
|
|
716
|
+
"samples": samples,
|
|
717
|
+
}
|
|
718
|
+
out.set_data("tool_errors", tool_errors_payload)
|
|
649
719
|
|
|
650
720
|
out.subsection("Session 消耗 Top 5")
|
|
651
721
|
session_stats = data["session_stats"]
|
|
722
|
+
session_top_payload = []
|
|
652
723
|
if not session_stats:
|
|
653
724
|
out.item("(无数据)")
|
|
654
725
|
else:
|
|
@@ -657,12 +728,18 @@ def render(out: output.Output, data, file_count):
|
|
|
657
728
|
for sid, ss in ranked:
|
|
658
729
|
out.line(f" {sid:<40} {ss['calls']:>8} "
|
|
659
730
|
f"{fmt_tokens(ss['tokens']):>10} {ss['duration']:>11.0f}s")
|
|
731
|
+
session_top_payload.append({
|
|
732
|
+
"session": sid,
|
|
733
|
+
"calls": ss["calls"],
|
|
734
|
+
"tokens": ss["tokens"],
|
|
735
|
+
"duration_s": round(ss["duration"], 1),
|
|
736
|
+
})
|
|
737
|
+
out.set_data("session_top5", session_top_payload)
|
|
660
738
|
|
|
661
739
|
|
|
662
740
|
def main() -> int:
|
|
663
741
|
parser = cli.build_common_parser(
|
|
664
742
|
description="模块 7:模型与性能数据",
|
|
665
|
-
prog="07_performance",
|
|
666
743
|
)
|
|
667
744
|
args = parser.parse_args()
|
|
668
745
|
out = output.init("performance", json_mode=args.json, no_color=args.no_color)
|
|
@@ -675,11 +752,6 @@ def main() -> int:
|
|
|
675
752
|
|
|
676
753
|
data = analyze_sessions(session_files)
|
|
677
754
|
render(out, data, len(session_files))
|
|
678
|
-
|
|
679
|
-
if args.json:
|
|
680
|
-
out.set_data("model_count", len(data["model_stats"]))
|
|
681
|
-
out.set_data("session_files_analyzed", len(session_files))
|
|
682
|
-
out.set_data("e2e_latency_count", len(data["e2e_latencies"]))
|
|
683
755
|
return out.done()
|
|
684
756
|
|
|
685
757
|
|
package/diag/08_sessions.py
CHANGED
|
@@ -10,69 +10,18 @@ import re
|
|
|
10
10
|
import sys
|
|
11
11
|
import time
|
|
12
12
|
from collections import defaultdict
|
|
13
|
-
from datetime import datetime, timezone
|
|
14
13
|
from pathlib import Path
|
|
15
14
|
|
|
16
15
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
17
16
|
|
|
18
17
|
from ocdiag import cli, output
|
|
18
|
+
from ocdiag.timeutil import fmt_duration, parse_msg_ts, parse_obj_ts
|
|
19
|
+
from ocdiag.tokens import fmt_tokens, human_size, pct
|
|
19
20
|
|
|
20
21
|
|
|
21
22
|
NORMAL_STOPS = {"stop", "end_turn", "toolUse", "tool_calls", ""}
|
|
22
23
|
|
|
23
24
|
|
|
24
|
-
def parse_obj_ts(ts_str):
|
|
25
|
-
if not ts_str:
|
|
26
|
-
return None
|
|
27
|
-
try:
|
|
28
|
-
return datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
|
|
29
|
-
except Exception:
|
|
30
|
-
return None
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
def parse_msg_ts(ms):
|
|
34
|
-
if ms is None:
|
|
35
|
-
return None
|
|
36
|
-
try:
|
|
37
|
-
return datetime.fromtimestamp(int(ms) / 1000, tz=timezone.utc)
|
|
38
|
-
except Exception:
|
|
39
|
-
return None
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
def human_size(b):
|
|
43
|
-
if b < 1024:
|
|
44
|
-
return f"{b}B"
|
|
45
|
-
if b < 1048576:
|
|
46
|
-
return f"{b/1024:.1f}KB"
|
|
47
|
-
if b < 1073741824:
|
|
48
|
-
return f"{b/1048576:.1f}MB"
|
|
49
|
-
return f"{b/1073741824:.1f}GB"
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
def fmt_tokens(n):
|
|
53
|
-
if n >= 1_000_000:
|
|
54
|
-
return f"{n/1_000_000:.1f}M"
|
|
55
|
-
if n >= 1_000:
|
|
56
|
-
return f"{n/1_000:.1f}K"
|
|
57
|
-
return str(n)
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
def fmt_duration(sec):
|
|
61
|
-
if sec < 60:
|
|
62
|
-
return f"{sec:.0f}s"
|
|
63
|
-
if sec < 3600:
|
|
64
|
-
return f"{sec/60:.1f}m"
|
|
65
|
-
return f"{sec/3600:.1f}h"
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
def pct(sorted_vals, p):
|
|
69
|
-
if not sorted_vals:
|
|
70
|
-
return 0.0
|
|
71
|
-
n = len(sorted_vals)
|
|
72
|
-
idx = min(n - 1, int(n * p))
|
|
73
|
-
return sorted_vals[idx]
|
|
74
|
-
|
|
75
|
-
|
|
76
25
|
def build_id_to_key_map(agent_dir):
|
|
77
26
|
sess_json = os.path.join(agent_dir, "sessions", "sessions.json")
|
|
78
27
|
id_to_key = {}
|
|
@@ -502,7 +451,6 @@ def stuck_dimension(out: output.Output, log_dir: str) -> None:
|
|
|
502
451
|
def main() -> int:
|
|
503
452
|
parser = cli.build_common_parser(
|
|
504
453
|
description="模块 8:Session 数据采集 + Stuck 探测",
|
|
505
|
-
prog="08_sessions",
|
|
506
454
|
)
|
|
507
455
|
args = parser.parse_args()
|
|
508
456
|
out = output.init("sessions", json_mode=args.json, no_color=args.no_color)
|