openclaw-diag-cli 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +84 -71
- package/bin/openclaw-diag.js +67 -178
- package/diag/01_sys_health.py +0 -2
- package/diag/02_environment.py +34 -8
- package/diag/03_configuration.py +4 -1
- package/diag/04_gateway.py +30 -8
- package/diag/05_recent_errors.py +24 -14
- package/diag/06_cron_jobs.py +4 -41
- package/diag/07_performance.py +114 -42
- package/diag/08_sessions.py +2 -54
- package/diag/09_plugin_diag.py +52 -25
- package/diag/10_shell_history.py +30 -12
- package/lib/bundle.py +6 -13
- package/ocdiag/__init__.py +1 -1
- package/ocdiag/cli.py +16 -1
- package/ocdiag/dispatcher.py +140 -53
- package/ocdiag/doctor.py +162 -0
- package/ocdiag/jsonlog.py +0 -5
- package/ocdiag/paths.py +0 -1
- package/ocdiag/recent_logs.py +0 -3
- package/ocdiag/sensitive.py +95 -1
- package/ocdiag/timeutil.py +0 -11
- package/ocdiag/tokens.py +0 -4
- package/package.json +2 -3
- package/tools/oc_session_extract.py +75 -7
- package/tools/oc_session_trace.py +31 -9
package/diag/07_performance.py
CHANGED
|
@@ -7,7 +7,6 @@ import glob
|
|
|
7
7
|
import json
|
|
8
8
|
import os
|
|
9
9
|
import sys
|
|
10
|
-
import tempfile
|
|
11
10
|
from collections import defaultdict
|
|
12
11
|
from datetime import datetime, timezone, timedelta
|
|
13
12
|
from pathlib import Path
|
|
@@ -15,45 +14,13 @@ from pathlib import Path
|
|
|
15
14
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
16
15
|
|
|
17
16
|
from ocdiag import cli, output
|
|
17
|
+
from ocdiag.timeutil import parse_msg_ts, parse_obj_ts
|
|
18
|
+
from ocdiag.tokens import fmt_tokens, pct
|
|
18
19
|
|
|
19
20
|
|
|
20
21
|
NORMAL_STOPS = {"stop", "end_turn", "toolUse", "tool_calls", ""}
|
|
21
22
|
|
|
22
23
|
|
|
23
|
-
def parse_obj_ts(ts_str):
|
|
24
|
-
if not ts_str:
|
|
25
|
-
return None
|
|
26
|
-
try:
|
|
27
|
-
return datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
|
|
28
|
-
except Exception:
|
|
29
|
-
return None
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def parse_msg_ts(ms):
|
|
33
|
-
if ms is None:
|
|
34
|
-
return None
|
|
35
|
-
try:
|
|
36
|
-
return datetime.fromtimestamp(int(ms) / 1000, tz=timezone.utc)
|
|
37
|
-
except Exception:
|
|
38
|
-
return None
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
def pct(sorted_vals, p):
|
|
42
|
-
if not sorted_vals:
|
|
43
|
-
return 0.0
|
|
44
|
-
n = len(sorted_vals)
|
|
45
|
-
idx = min(n - 1, int(n * p))
|
|
46
|
-
return sorted_vals[idx]
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
def fmt_tokens(n):
|
|
50
|
-
if n >= 1_000_000:
|
|
51
|
-
return f"{n/1_000_000:.1f}M"
|
|
52
|
-
if n >= 1_000:
|
|
53
|
-
return f"{n/1_000:.1f}K"
|
|
54
|
-
return str(n)
|
|
55
|
-
|
|
56
|
-
|
|
57
24
|
def fmt_args(tool_name, tc_args, max_len=100):
|
|
58
25
|
if isinstance(tc_args, str):
|
|
59
26
|
try:
|
|
@@ -432,11 +399,18 @@ def render(out: output.Output, data, file_count):
|
|
|
432
399
|
else:
|
|
433
400
|
for i, (sec, val, hint) in enumerate(bottleneck_items):
|
|
434
401
|
out.item(f"#{i+1}: {sec}(P95={val:.1f}s, {hint})")
|
|
402
|
+
out.set_data("bottleneck", {
|
|
403
|
+
"model_p95": round(model_p95, 3),
|
|
404
|
+
"tool_p95": round(tool_p95, 3),
|
|
405
|
+
"model_top": model_top,
|
|
406
|
+
"tool_top": tool_top,
|
|
407
|
+
})
|
|
435
408
|
|
|
436
409
|
out.subsection("模型性能")
|
|
437
410
|
out.item(f"数据来源: 最近 {file_count} 个 session 文件")
|
|
438
411
|
out.line("")
|
|
439
412
|
model_stats = data["model_stats"]
|
|
413
|
+
models_payload = {}
|
|
440
414
|
if not model_stats:
|
|
441
415
|
out.item("最近 Session 中未发现模型使用数据")
|
|
442
416
|
else:
|
|
@@ -481,9 +455,29 @@ def render(out: output.Output, data, file_count):
|
|
|
481
455
|
stops_str = " ".join(f"{k}:{v}" for k, v in sorted(stops.items(), key=lambda x: -x[1]))
|
|
482
456
|
out.item(f" stopReasons: {stops_str}" if stops_str else " stopReasons: (none)")
|
|
483
457
|
out.line("")
|
|
458
|
+
models_payload[model_key] = {
|
|
459
|
+
"calls": calls,
|
|
460
|
+
"p50_s": round(p50, 3),
|
|
461
|
+
"p95_s": round(p95, 3),
|
|
462
|
+
"max_s": round(mx, 3),
|
|
463
|
+
"throughput_tok_s": (
|
|
464
|
+
None if s["output"] == 0 or total_dur <= 0
|
|
465
|
+
else round(s["output"] / total_dur, 1)
|
|
466
|
+
),
|
|
467
|
+
"input_tokens": s["input"],
|
|
468
|
+
"output_tokens": s["output"],
|
|
469
|
+
"cache_read_tokens": s["cache_read"],
|
|
470
|
+
"cache_write_tokens": s["cache_write"],
|
|
471
|
+
"cost_usd": round(s["cost"], 6),
|
|
472
|
+
"success_rate_pct": round(success, 1),
|
|
473
|
+
"stop_reasons": dict(stops),
|
|
474
|
+
}
|
|
475
|
+
out.set_data("models", models_payload)
|
|
476
|
+
out.set_data("session_files_analyzed", file_count)
|
|
484
477
|
|
|
485
478
|
out.subsection("工具性能(Top 10 by 调用量)")
|
|
486
479
|
timed_tools = {n: s for n, s in data["tool_stats"].items() if s["durations"]}
|
|
480
|
+
tools_payload = {}
|
|
487
481
|
if not timed_tools:
|
|
488
482
|
out.item("(无工具调用数据)")
|
|
489
483
|
else:
|
|
@@ -491,8 +485,11 @@ def render(out: output.Output, data, file_count):
|
|
|
491
485
|
for name, s in ranked:
|
|
492
486
|
durs = sorted(s["durations"])
|
|
493
487
|
calls = s["calls"]
|
|
488
|
+
p50 = pct(durs, 0.50)
|
|
489
|
+
p95 = pct(durs, 0.95)
|
|
490
|
+
mx = durs[-1]
|
|
494
491
|
err_rate = (s["errors"] / calls * 100) if calls else 0.0
|
|
495
|
-
dur_str = f"P50={
|
|
492
|
+
dur_str = f"P50={p50:.3f}s P95={p95:.3f}s Max={mx:.3f}s"
|
|
496
493
|
out.item(f"{name}: {calls} 次 | {dur_str} | 错误 {err_rate:.0f}%")
|
|
497
494
|
timed = [r for r in s["records"] if r["dur"] is not None]
|
|
498
495
|
timed.sort(key=lambda r: r["dur"], reverse=True)
|
|
@@ -511,6 +508,15 @@ def render(out: output.Output, data, file_count):
|
|
|
511
508
|
tail = f", {r['err_brief']}" if r["err_brief"] else ""
|
|
512
509
|
out.item(f" 失败: {args} (error, {dur_txt}{tail})")
|
|
513
510
|
err_shown += 1
|
|
511
|
+
tools_payload[name] = {
|
|
512
|
+
"calls": calls,
|
|
513
|
+
"errors": s["errors"],
|
|
514
|
+
"error_rate_pct": round(err_rate, 1),
|
|
515
|
+
"p50_s": round(p50, 3),
|
|
516
|
+
"p95_s": round(p95, 3),
|
|
517
|
+
"max_s": round(mx, 3),
|
|
518
|
+
}
|
|
519
|
+
out.set_data("tools", tools_payload)
|
|
514
520
|
|
|
515
521
|
out.subsection("慢调用 Top 20")
|
|
516
522
|
slow = sorted(data["slow_calls_top"], key=lambda x: x[0], reverse=True)
|
|
@@ -529,12 +535,17 @@ def render(out: output.Output, data, file_count):
|
|
|
529
535
|
else:
|
|
530
536
|
for i, entry in enumerate(top20, 1):
|
|
531
537
|
out.item(f"[{i}] {entry[2]}")
|
|
538
|
+
out.set_data("slow_calls_top20", [
|
|
539
|
+
{"duration_s": round(e[0], 3), "kind": e[1], "summary": e[2]}
|
|
540
|
+
for e in top20
|
|
541
|
+
])
|
|
532
542
|
|
|
533
543
|
out.subsection("异常 stopReason — 模型非正常结束(如 error、中断)")
|
|
534
544
|
abnormal_stops = data["abnormal_stops"]
|
|
535
545
|
out.item(f"共 {len(abnormal_stops)} 条" + ("(无异常)" if not abnormal_stops else ""))
|
|
536
546
|
for s in abnormal_stops[:20]:
|
|
537
547
|
out.item(s)
|
|
548
|
+
out.set_data("abnormal_stops", abnormal_stops)
|
|
538
549
|
|
|
539
550
|
out.subsection("模型 API 错误分布")
|
|
540
551
|
api_err_total = sum(data["api_error_stats"].values())
|
|
@@ -548,9 +559,16 @@ def render(out: output.Output, data, file_count):
|
|
|
548
559
|
out.item("分布:")
|
|
549
560
|
for cat, n in sorted(data["api_error_stats"].items(), key=lambda kv: -kv[1]):
|
|
550
561
|
out.item(f" {cat}: {n}")
|
|
562
|
+
out.set_data("api_errors", {
|
|
563
|
+
"total_calls": api_total,
|
|
564
|
+
"error_count": api_err_total,
|
|
565
|
+
"error_rate_pct": round(api_err_total / api_total * 100, 2) if api_total else 0.0,
|
|
566
|
+
"by_category": dict(data["api_error_stats"]),
|
|
567
|
+
})
|
|
551
568
|
|
|
552
569
|
out.subsection("端到端消息延迟(user 发送 → assistant 最终响应)")
|
|
553
570
|
e2e = data["e2e_latencies"]
|
|
571
|
+
e2e_payload = {"count": 0}
|
|
554
572
|
if not e2e:
|
|
555
573
|
out.item("(数据不足,未发现 user→assistant 配对)")
|
|
556
574
|
else:
|
|
@@ -573,10 +591,19 @@ def render(out: output.Output, data, file_count):
|
|
|
573
591
|
n = bucket_counts[lbl]
|
|
574
592
|
pct_v = (n / total * 100) if total else 0.0
|
|
575
593
|
out.item(f" {lbl}: {n} ({pct_v:.1f}%)")
|
|
594
|
+
e2e_payload = {
|
|
595
|
+
"count": total,
|
|
596
|
+
"p50_s": round(p50, 3),
|
|
597
|
+
"p95_s": round(p95, 3),
|
|
598
|
+
"max_s": round(mx, 3),
|
|
599
|
+
"buckets": dict(bucket_counts),
|
|
600
|
+
}
|
|
601
|
+
out.set_data("e2e_latency", e2e_payload)
|
|
576
602
|
|
|
577
603
|
out.subsection("延迟 vs 上下文大小")
|
|
578
604
|
ctx_buckets_def = data["ctx_buckets_def"]
|
|
579
605
|
ctx_durs = data["ctx_bucket_durs"]
|
|
606
|
+
ctx_payload = {}
|
|
580
607
|
if not any(ctx_durs.get(l) for l, _ in ctx_buckets_def):
|
|
581
608
|
out.item("(数据不足)")
|
|
582
609
|
else:
|
|
@@ -585,13 +612,21 @@ def render(out: output.Output, data, file_count):
|
|
|
585
612
|
durs = sorted(ctx_durs.get(b_label, []))
|
|
586
613
|
if not durs:
|
|
587
614
|
out.line(f" {b_label:<14} {0:>8} {'-':>10} {'-':>10}")
|
|
615
|
+
ctx_payload[b_label] = {"count": 0, "p50_s": None, "p95_s": None}
|
|
588
616
|
continue
|
|
589
617
|
p50 = pct(durs, 0.50)
|
|
590
618
|
p95 = pct(durs, 0.95)
|
|
591
619
|
out.line(f" {b_label:<14} {len(durs):>8} {p50:>9.1f}s {p95:>9.1f}s")
|
|
620
|
+
ctx_payload[b_label] = {
|
|
621
|
+
"count": len(durs),
|
|
622
|
+
"p50_s": round(p50, 3),
|
|
623
|
+
"p95_s": round(p95, 3),
|
|
624
|
+
}
|
|
625
|
+
out.set_data("ctx_buckets", ctx_payload)
|
|
592
626
|
|
|
593
627
|
out.subsection("每日趋势(最近 7 天)")
|
|
594
628
|
daily_stats = data["daily_stats"]
|
|
629
|
+
daily_payload = []
|
|
595
630
|
if not daily_stats:
|
|
596
631
|
out.item("(数据不足)")
|
|
597
632
|
else:
|
|
@@ -602,12 +637,22 @@ def render(out: output.Output, data, file_count):
|
|
|
602
637
|
d = daily_stats.get(d_key)
|
|
603
638
|
if not d or d["calls"] == 0:
|
|
604
639
|
out.line(f" {d_key:<10} {0:>8} {'-':>10} {'-':>14}")
|
|
640
|
+
daily_payload.append({"date": d_key, "calls": 0,
|
|
641
|
+
"p50_s": None, "output_tokens": 0})
|
|
605
642
|
continue
|
|
606
643
|
durs = sorted(d["durs"])
|
|
607
644
|
p50 = pct(durs, 0.50) if durs else 0.0
|
|
608
645
|
out.line(f" {d_key:<10} {d['calls']:>8} {p50:>9.1f}s {fmt_tokens(d['output']):>14}")
|
|
646
|
+
daily_payload.append({
|
|
647
|
+
"date": d_key,
|
|
648
|
+
"calls": d["calls"],
|
|
649
|
+
"p50_s": round(p50, 3),
|
|
650
|
+
"output_tokens": d["output"],
|
|
651
|
+
})
|
|
652
|
+
out.set_data("daily_trend", daily_payload)
|
|
609
653
|
|
|
610
654
|
out.subsection("Cache 命中率")
|
|
655
|
+
cache_payload = {"total_calls": data["cache_total_calls"]}
|
|
611
656
|
if data["cache_total_calls"] == 0:
|
|
612
657
|
out.item("(无数据)")
|
|
613
658
|
else:
|
|
@@ -622,17 +667,34 @@ def render(out: output.Output, data, file_count):
|
|
|
622
667
|
f"cache_write: {fmt_tokens(data['cache_sum_cache_write'])}"
|
|
623
668
|
)
|
|
624
669
|
denom = data["cache_sum_input"] + data["cache_sum_cache_read"]
|
|
670
|
+
ratio_pct = None
|
|
625
671
|
if denom > 0:
|
|
626
672
|
ratio = data["cache_sum_cache_read"] / denom * 100
|
|
673
|
+
ratio_pct = round(ratio, 3)
|
|
627
674
|
out.item(
|
|
628
675
|
f"上下文 cache 占比: cacheRead/(input+cacheRead) = "
|
|
629
676
|
f"{ratio:.3f}% ({fmt_tokens(data['cache_sum_cache_read'])}/{fmt_tokens(denom)})"
|
|
630
677
|
)
|
|
678
|
+
cache_payload = {
|
|
679
|
+
"total_calls": data["cache_total_calls"],
|
|
680
|
+
"calls_with_cache_read": data["cache_calls_with_cache"],
|
|
681
|
+
"hit_rate_pct": round(hit_pct, 2),
|
|
682
|
+
"input_tokens": data["cache_sum_input"],
|
|
683
|
+
"cache_read_tokens": data["cache_sum_cache_read"],
|
|
684
|
+
"cache_write_tokens": data["cache_sum_cache_write"],
|
|
685
|
+
"ctx_cache_ratio_pct": ratio_pct,
|
|
686
|
+
}
|
|
687
|
+
out.set_data("cache_hit_rate", cache_payload)
|
|
631
688
|
|
|
632
689
|
out.subsection("工具错误明细")
|
|
633
690
|
tool_stats = data["tool_stats"]
|
|
634
691
|
err_total = sum(s["errors"] for s in tool_stats.values())
|
|
635
692
|
call_total = sum(s["calls"] for s in tool_stats.values())
|
|
693
|
+
tool_errors_payload = {
|
|
694
|
+
"total_errors": err_total,
|
|
695
|
+
"total_calls": call_total,
|
|
696
|
+
"by_tool": {},
|
|
697
|
+
}
|
|
636
698
|
if err_total == 0:
|
|
637
699
|
out.item(f"共 0 次错误 (总调用 {call_total} 次中)")
|
|
638
700
|
else:
|
|
@@ -642,13 +704,22 @@ def render(out: output.Output, data, file_count):
|
|
|
642
704
|
if s["errors"] == 0:
|
|
643
705
|
continue
|
|
644
706
|
out.line(f" {name} ({s['errors']}次):")
|
|
707
|
+
samples = []
|
|
645
708
|
for r in s["error_records"][:3]:
|
|
646
709
|
ts_label = r["ts"].strftime("%Y-%m-%d %H:%M:%S") if r["ts"] else "?"
|
|
647
710
|
brief = r["err_brief"] or "(无错误内容)"
|
|
648
711
|
out.line(f" {ts_label} | {brief[:100]}")
|
|
712
|
+
samples.append({"ts": ts_label, "brief": brief[:200]})
|
|
713
|
+
tool_errors_payload["by_tool"][name] = {
|
|
714
|
+
"errors": s["errors"],
|
|
715
|
+
"calls": s["calls"],
|
|
716
|
+
"samples": samples,
|
|
717
|
+
}
|
|
718
|
+
out.set_data("tool_errors", tool_errors_payload)
|
|
649
719
|
|
|
650
720
|
out.subsection("Session 消耗 Top 5")
|
|
651
721
|
session_stats = data["session_stats"]
|
|
722
|
+
session_top_payload = []
|
|
652
723
|
if not session_stats:
|
|
653
724
|
out.item("(无数据)")
|
|
654
725
|
else:
|
|
@@ -657,12 +728,18 @@ def render(out: output.Output, data, file_count):
|
|
|
657
728
|
for sid, ss in ranked:
|
|
658
729
|
out.line(f" {sid:<40} {ss['calls']:>8} "
|
|
659
730
|
f"{fmt_tokens(ss['tokens']):>10} {ss['duration']:>11.0f}s")
|
|
731
|
+
session_top_payload.append({
|
|
732
|
+
"session": sid,
|
|
733
|
+
"calls": ss["calls"],
|
|
734
|
+
"tokens": ss["tokens"],
|
|
735
|
+
"duration_s": round(ss["duration"], 1),
|
|
736
|
+
})
|
|
737
|
+
out.set_data("session_top5", session_top_payload)
|
|
660
738
|
|
|
661
739
|
|
|
662
740
|
def main() -> int:
|
|
663
741
|
parser = cli.build_common_parser(
|
|
664
742
|
description="模块 7:模型与性能数据",
|
|
665
|
-
prog="07_performance",
|
|
666
743
|
)
|
|
667
744
|
args = parser.parse_args()
|
|
668
745
|
out = output.init("performance", json_mode=args.json, no_color=args.no_color)
|
|
@@ -675,11 +752,6 @@ def main() -> int:
|
|
|
675
752
|
|
|
676
753
|
data = analyze_sessions(session_files)
|
|
677
754
|
render(out, data, len(session_files))
|
|
678
|
-
|
|
679
|
-
if args.json:
|
|
680
|
-
out.set_data("model_count", len(data["model_stats"]))
|
|
681
|
-
out.set_data("session_files_analyzed", len(session_files))
|
|
682
|
-
out.set_data("e2e_latency_count", len(data["e2e_latencies"]))
|
|
683
755
|
return out.done()
|
|
684
756
|
|
|
685
757
|
|
package/diag/08_sessions.py
CHANGED
|
@@ -10,69 +10,18 @@ import re
|
|
|
10
10
|
import sys
|
|
11
11
|
import time
|
|
12
12
|
from collections import defaultdict
|
|
13
|
-
from datetime import datetime, timezone
|
|
14
13
|
from pathlib import Path
|
|
15
14
|
|
|
16
15
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
17
16
|
|
|
18
17
|
from ocdiag import cli, output
|
|
18
|
+
from ocdiag.timeutil import fmt_duration, parse_msg_ts, parse_obj_ts
|
|
19
|
+
from ocdiag.tokens import fmt_tokens, human_size, pct
|
|
19
20
|
|
|
20
21
|
|
|
21
22
|
NORMAL_STOPS = {"stop", "end_turn", "toolUse", "tool_calls", ""}
|
|
22
23
|
|
|
23
24
|
|
|
24
|
-
def parse_obj_ts(ts_str):
|
|
25
|
-
if not ts_str:
|
|
26
|
-
return None
|
|
27
|
-
try:
|
|
28
|
-
return datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
|
|
29
|
-
except Exception:
|
|
30
|
-
return None
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
def parse_msg_ts(ms):
|
|
34
|
-
if ms is None:
|
|
35
|
-
return None
|
|
36
|
-
try:
|
|
37
|
-
return datetime.fromtimestamp(int(ms) / 1000, tz=timezone.utc)
|
|
38
|
-
except Exception:
|
|
39
|
-
return None
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
def human_size(b):
|
|
43
|
-
if b < 1024:
|
|
44
|
-
return f"{b}B"
|
|
45
|
-
if b < 1048576:
|
|
46
|
-
return f"{b/1024:.1f}KB"
|
|
47
|
-
if b < 1073741824:
|
|
48
|
-
return f"{b/1048576:.1f}MB"
|
|
49
|
-
return f"{b/1073741824:.1f}GB"
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
def fmt_tokens(n):
|
|
53
|
-
if n >= 1_000_000:
|
|
54
|
-
return f"{n/1_000_000:.1f}M"
|
|
55
|
-
if n >= 1_000:
|
|
56
|
-
return f"{n/1_000:.1f}K"
|
|
57
|
-
return str(n)
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
def fmt_duration(sec):
|
|
61
|
-
if sec < 60:
|
|
62
|
-
return f"{sec:.0f}s"
|
|
63
|
-
if sec < 3600:
|
|
64
|
-
return f"{sec/60:.1f}m"
|
|
65
|
-
return f"{sec/3600:.1f}h"
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
def pct(sorted_vals, p):
|
|
69
|
-
if not sorted_vals:
|
|
70
|
-
return 0.0
|
|
71
|
-
n = len(sorted_vals)
|
|
72
|
-
idx = min(n - 1, int(n * p))
|
|
73
|
-
return sorted_vals[idx]
|
|
74
|
-
|
|
75
|
-
|
|
76
25
|
def build_id_to_key_map(agent_dir):
|
|
77
26
|
sess_json = os.path.join(agent_dir, "sessions", "sessions.json")
|
|
78
27
|
id_to_key = {}
|
|
@@ -502,7 +451,6 @@ def stuck_dimension(out: output.Output, log_dir: str) -> None:
|
|
|
502
451
|
def main() -> int:
|
|
503
452
|
parser = cli.build_common_parser(
|
|
504
453
|
description="模块 8:Session 数据采集 + Stuck 探测",
|
|
505
|
-
prog="08_sessions",
|
|
506
454
|
)
|
|
507
455
|
args = parser.parse_args()
|
|
508
456
|
out = output.init("sessions", json_mode=args.json, no_color=args.no_color)
|
package/diag/09_plugin_diag.py
CHANGED
|
@@ -17,6 +17,7 @@ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
|
17
17
|
|
|
18
18
|
from ocdiag import cli, output
|
|
19
19
|
from ocdiag.jsonlog import parse_name
|
|
20
|
+
from ocdiag.sensitive import sanitize_text
|
|
20
21
|
from ocdiag.timeutil import fmt_hms
|
|
21
22
|
|
|
22
23
|
|
|
@@ -110,13 +111,18 @@ def scan_logs(today_logs):
|
|
|
110
111
|
for logf in today_logs:
|
|
111
112
|
try:
|
|
112
113
|
fh = open(logf, "r", errors="replace")
|
|
113
|
-
except
|
|
114
|
+
except OSError:
|
|
115
|
+
# Best-effort: if today's log is unreadable, skip it; the parent
|
|
116
|
+
# caller still surfaces "no log data" via the empty plugin_diag
|
|
117
|
+
# output. (We don't fail the whole module for one missing file.)
|
|
114
118
|
continue
|
|
115
119
|
with fh:
|
|
116
120
|
for line in fh:
|
|
117
121
|
try:
|
|
118
122
|
o = json.loads(line)
|
|
119
|
-
except
|
|
123
|
+
except (json.JSONDecodeError, ValueError):
|
|
124
|
+
# Expected: log files are JSONL; non-JSON lines are emitted
|
|
125
|
+
# by Node before logger init. Drop those lines silently.
|
|
120
126
|
continue
|
|
121
127
|
plugin, sub = parse_name(o)
|
|
122
128
|
lvl = o.get("_meta", {}).get("logLevelName", "")
|
|
@@ -190,18 +196,23 @@ def scan_logs(today_logs):
|
|
|
190
196
|
|
|
191
197
|
|
|
192
198
|
def load_configured(config_path):
|
|
199
|
+
"""Return {plugin_id: enabled_bool}. Status reported as second return."""
|
|
193
200
|
configured = {}
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
201
|
+
status = {"found": True}
|
|
202
|
+
if not config_path or not os.path.isfile(config_path):
|
|
203
|
+
return configured, {"found": False, "reason": "config_not_found",
|
|
204
|
+
"checked": config_path or ""}
|
|
205
|
+
try:
|
|
206
|
+
with open(config_path) as f:
|
|
207
|
+
cfg = json.load(f)
|
|
208
|
+
entries = cfg.get("plugins", {}).get("entries", {}) or {}
|
|
209
|
+
for k, v in entries.items():
|
|
210
|
+
if isinstance(v, dict):
|
|
211
|
+
configured[k] = bool(v.get("enabled", False))
|
|
212
|
+
except (OSError, json.JSONDecodeError) as e:
|
|
213
|
+
return configured, {"found": False, "reason": "config_unreadable",
|
|
214
|
+
"checked": config_path, "error": str(e)[:200]}
|
|
215
|
+
return configured, status
|
|
205
216
|
|
|
206
217
|
|
|
207
218
|
def load_extensions(oc_home):
|
|
@@ -274,7 +285,10 @@ def section_state(out, scan, configured, extensions):
|
|
|
274
285
|
})
|
|
275
286
|
|
|
276
287
|
|
|
277
|
-
def section_errors(out, scan, configured):
|
|
288
|
+
def section_errors(out, scan, configured, unmask=False):
|
|
289
|
+
def _scrub(s: str) -> str:
|
|
290
|
+
return s if unmask else sanitize_text(s)
|
|
291
|
+
|
|
278
292
|
out.subsection("9.2 插件错误/警告")
|
|
279
293
|
plugin_level_counts = scan["plugin_level_counts"]
|
|
280
294
|
plugin_error_samples = scan["plugin_error_samples"]
|
|
@@ -313,9 +327,11 @@ def section_errors(out, scan, configured):
|
|
|
313
327
|
if samples:
|
|
314
328
|
for ts, lvl, text in dedup_messages(samples, max_unique=999):
|
|
315
329
|
tag = {"ERROR": "E", "FATAL": "F", "WARN": "W"}.get(lvl, "?")
|
|
316
|
-
snippet = text.replace("\n", " ")
|
|
330
|
+
snippet = _scrub(text.replace("\n", " "))
|
|
317
331
|
out.item(f" [{tag}] {fmt_hms(ts)}: {snippet}")
|
|
318
|
-
sample_payload.append({
|
|
332
|
+
sample_payload.append({
|
|
333
|
+
"ts": ts, "level": lvl, "msg": _scrub(text[:300]),
|
|
334
|
+
})
|
|
319
335
|
if err > 0 or warn > 0 or sample_payload:
|
|
320
336
|
errors_payload[p] = {
|
|
321
337
|
"error_count": err,
|
|
@@ -331,9 +347,9 @@ def section_errors(out, scan, configured):
|
|
|
331
347
|
out.item(f"[plugin-manager]: {len(pm_errors)} ERROR, {len(pm_warns)} WARN, "
|
|
332
348
|
f"{len(plugin_diag_messages)} total")
|
|
333
349
|
for ts, _lvl, text in dedup_messages(pm_errors, max_unique=999):
|
|
334
|
-
out.item(f" [E] {fmt_hms(ts)}: {text.replace(chr(10),' ')}")
|
|
350
|
+
out.item(f" [E] {fmt_hms(ts)}: {_scrub(text.replace(chr(10),' '))}")
|
|
335
351
|
for ts, _lvl, text in dedup_messages(pm_warns, max_unique=999):
|
|
336
|
-
out.item(f" [W] {fmt_hms(ts)}: {text.replace(chr(10),' ')}")
|
|
352
|
+
out.item(f" [W] {fmt_hms(ts)}: {_scrub(text.replace(chr(10),' '))}")
|
|
337
353
|
elif plugin_diag_messages:
|
|
338
354
|
out.item(f"[plugin-manager]: 0 ERROR, 0 WARN, {len(plugin_diag_messages)} total")
|
|
339
355
|
|
|
@@ -444,12 +460,15 @@ def walk_urls(val, out_set):
|
|
|
444
460
|
walk_urls(v, out_set)
|
|
445
461
|
|
|
446
462
|
|
|
447
|
-
def section_deps(out, config_path):
|
|
463
|
+
def section_deps(out, config_path, unmask=False):
|
|
448
464
|
out.subsection("9.5 插件外部依赖")
|
|
449
465
|
plugin_deps = {}
|
|
450
466
|
if not (config_path and os.path.isfile(config_path)):
|
|
451
467
|
out.item("未发现已启用插件的外部依赖配置")
|
|
452
468
|
out.set_data("plugin_deps", {})
|
|
469
|
+
out.set_data("plugin_deps_status",
|
|
470
|
+
{"found": False, "reason": "config_not_found",
|
|
471
|
+
"checked": config_path or ""})
|
|
453
472
|
return
|
|
454
473
|
try:
|
|
455
474
|
with open(config_path) as f:
|
|
@@ -464,8 +483,13 @@ def section_deps(out, config_path):
|
|
|
464
483
|
walk_urls(pconf, hosts)
|
|
465
484
|
hosts = {h for h in hosts if not h.startswith(("127.", "localhost", "0.0.0.0"))}
|
|
466
485
|
plugin_deps[pid] = hosts
|
|
467
|
-
except
|
|
468
|
-
|
|
486
|
+
except (OSError, json.JSONDecodeError) as e:
|
|
487
|
+
out.item(f"配置读取/解析失败: {type(e).__name__}")
|
|
488
|
+
out.set_data("plugin_deps", {})
|
|
489
|
+
out.set_data("plugin_deps_status",
|
|
490
|
+
{"found": False, "reason": "config_unreadable",
|
|
491
|
+
"checked": config_path, "error": str(e)[:200]})
|
|
492
|
+
return
|
|
469
493
|
|
|
470
494
|
if not plugin_deps:
|
|
471
495
|
out.item("未发现已启用插件的外部依赖配置")
|
|
@@ -509,7 +533,6 @@ def section_deps(out, config_path):
|
|
|
509
533
|
def main() -> int:
|
|
510
534
|
parser = cli.build_common_parser(
|
|
511
535
|
description="模块 9:插件诊断",
|
|
512
|
-
prog="09_plugin_diag",
|
|
513
536
|
)
|
|
514
537
|
args = parser.parse_args()
|
|
515
538
|
out = output.init("plugin_diag", json_mode=args.json, no_color=args.no_color)
|
|
@@ -519,14 +542,18 @@ def main() -> int:
|
|
|
519
542
|
today_logs = sorted(glob.glob(os.path.join(args.log_dir, f"openclaw-{today}.log")))
|
|
520
543
|
|
|
521
544
|
scan = scan_logs(today_logs)
|
|
522
|
-
configured = load_configured(args.config)
|
|
545
|
+
configured, configured_status = load_configured(args.config)
|
|
523
546
|
extensions = load_extensions(args.openclaw_home)
|
|
547
|
+
if not configured_status.get("found", True):
|
|
548
|
+
out.item(f"配置加载失败: {configured_status.get('reason')} "
|
|
549
|
+
f"({configured_status.get('checked')})")
|
|
550
|
+
out.set_data("configured_status", configured_status)
|
|
524
551
|
|
|
525
552
|
section_state(out, scan, configured, extensions)
|
|
526
|
-
section_errors(out, scan, configured)
|
|
553
|
+
section_errors(out, scan, configured, unmask=args.unmask)
|
|
527
554
|
section_hooks(out, scan)
|
|
528
555
|
section_channels(out, scan)
|
|
529
|
-
section_deps(out, args.config)
|
|
556
|
+
section_deps(out, args.config, unmask=args.unmask)
|
|
530
557
|
|
|
531
558
|
return out.done()
|
|
532
559
|
|
package/diag/10_shell_history.py
CHANGED
|
@@ -12,6 +12,7 @@ from typing import List, Tuple
|
|
|
12
12
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
13
13
|
|
|
14
14
|
from ocdiag import cli, output
|
|
15
|
+
from ocdiag.sensitive import sanitize_text
|
|
15
16
|
|
|
16
17
|
|
|
17
18
|
DANGEROUS_RE = re.compile(
|
|
@@ -34,26 +35,34 @@ def list_history_files() -> List[str]:
|
|
|
34
35
|
return [c for c in candidates if os.path.isfile(c)]
|
|
35
36
|
|
|
36
37
|
|
|
37
|
-
def read_lines(path: str) -> List[Tuple[int, str]]:
|
|
38
|
+
def read_lines(path: str) -> Tuple[List[Tuple[int, str]], str]:
|
|
39
|
+
"""Read history file. Returns (lines, error_str). error_str=='' on success.
|
|
40
|
+
|
|
41
|
+
Permission denied / missing files become an explicit error instead of an
|
|
42
|
+
empty list, so the caller can distinguish "no commands" from "couldn't read".
|
|
43
|
+
"""
|
|
38
44
|
out: List[Tuple[int, str]] = []
|
|
39
45
|
try:
|
|
40
46
|
with open(path, "r", errors="replace") as f:
|
|
41
47
|
for i, line in enumerate(f, 1):
|
|
42
48
|
out.append((i, line.rstrip("\n")))
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
49
|
+
return out, ""
|
|
50
|
+
except OSError as e:
|
|
51
|
+
return out, f"{type(e).__name__}: {e}"
|
|
46
52
|
|
|
47
53
|
|
|
48
54
|
def main() -> int:
|
|
49
55
|
parser = cli.build_common_parser(
|
|
50
56
|
description="模块 10:采集 shell 历史",
|
|
51
|
-
prog="10_shell_history",
|
|
52
57
|
)
|
|
53
58
|
args = parser.parse_args()
|
|
54
59
|
|
|
55
60
|
out = output.init("shell_history", json_mode=args.json, no_color=args.no_color)
|
|
56
61
|
out.section("模块 10:命令执行历史")
|
|
62
|
+
|
|
63
|
+
def maybe_sanitize(s: str) -> str:
|
|
64
|
+
return s if args.unmask else sanitize_text(s)
|
|
65
|
+
|
|
57
66
|
out.line(" 系统 shell 历史记录,用于判断是否有人或脚本执行过高危命令"
|
|
58
67
|
"(rm -rf、kill、systemctl stop 等)。")
|
|
59
68
|
out.line("")
|
|
@@ -66,7 +75,16 @@ def main() -> int:
|
|
|
66
75
|
|
|
67
76
|
files_data = []
|
|
68
77
|
for hfile in history_files:
|
|
69
|
-
lines = read_lines(hfile)
|
|
78
|
+
lines, read_err = read_lines(hfile)
|
|
79
|
+
if read_err:
|
|
80
|
+
out.item(f"{os.path.basename(hfile)} — 读取失败 ({read_err})")
|
|
81
|
+
files_data.append({
|
|
82
|
+
"path": hfile,
|
|
83
|
+
"found": False,
|
|
84
|
+
"reason": "unreadable",
|
|
85
|
+
"error": read_err,
|
|
86
|
+
})
|
|
87
|
+
continue
|
|
70
88
|
total = len(lines)
|
|
71
89
|
out.item(f"{os.path.basename(hfile)} — 共 {total} 条记录")
|
|
72
90
|
|
|
@@ -79,7 +97,7 @@ def main() -> int:
|
|
|
79
97
|
|
|
80
98
|
if dangerous:
|
|
81
99
|
out.item(f" 高危命令: {len(dangerous)} 条 ")
|
|
82
|
-
ev = "\n".join(f"{n}: {ln}" for n, ln in dangerous)
|
|
100
|
+
ev = "\n".join(f"{n}: {maybe_sanitize(ln)}" for n, ln in dangerous)
|
|
83
101
|
out.evidence(f"{hfile} (高危)", ev)
|
|
84
102
|
else:
|
|
85
103
|
out.item(" 高危命令: 0 条")
|
|
@@ -89,25 +107,25 @@ def main() -> int:
|
|
|
89
107
|
oc_cmds = oc_all[-30:]
|
|
90
108
|
if oc_total:
|
|
91
109
|
out.item(
|
|
92
|
-
f"
|
|
110
|
+
f" OpenClaw 相关命令: 全文 {oc_total} 条,最近 30 条采样 {len(oc_cmds)} 条 — "
|
|
93
111
|
"用户手动执行的 openclaw 命令"
|
|
94
112
|
)
|
|
95
|
-
ev = "\n".join(f"{n}: {ln}" for n, ln in oc_cmds)
|
|
113
|
+
ev = "\n".join(f"{n}: {maybe_sanitize(ln)}" for n, ln in oc_cmds)
|
|
96
114
|
out.evidence(f"{hfile} (openclaw)", ev)
|
|
97
115
|
else:
|
|
98
|
-
out.item("
|
|
116
|
+
out.item(" OpenClaw 相关命令: 0 条")
|
|
99
117
|
|
|
100
118
|
recent = lines[-20:]
|
|
101
119
|
if recent:
|
|
102
120
|
out.item(" 最近 20 条命令:")
|
|
103
|
-
ev = "\n".join(ln for _, ln in recent)
|
|
121
|
+
ev = "\n".join(maybe_sanitize(ln) for _, ln in recent)
|
|
104
122
|
out.evidence(f"{hfile} (最近)", ev)
|
|
105
123
|
|
|
106
124
|
files_data.append({
|
|
107
125
|
"path": hfile,
|
|
108
126
|
"total_lines": total,
|
|
109
127
|
"dangerous_count": len(dangerous),
|
|
110
|
-
"dangerous": [{"line": n, "cmd": ln} for n, ln in dangerous],
|
|
128
|
+
"dangerous": [{"line": n, "cmd": maybe_sanitize(ln)} for n, ln in dangerous],
|
|
111
129
|
"openclaw_count_total": oc_total,
|
|
112
130
|
"openclaw_count_sample_30": len(oc_cmds),
|
|
113
131
|
"recent_count": len(recent),
|