openclaw-diag-cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,518 @@
1
+ #!/usr/bin/env python3
2
+ """模块 8:Session 数据(六维分析 + Stuck 探测)。"""
3
+
4
+ from __future__ import annotations
5
+
6
+ import glob
7
+ import json
8
+ import os
9
+ import re
10
+ import sys
11
+ import time
12
+ from collections import defaultdict
13
+ from datetime import datetime, timezone
14
+ from pathlib import Path
15
+
16
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
17
+
18
+ from ocdiag import cli, output
19
+
20
+
21
# stopReason values that are considered normal. analyze_session_file records
# any other non-empty stopReason on an assistant message as a "model" anomaly.
NORMAL_STOPS = {"stop", "end_turn", "toolUse", "tool_calls", ""}
22
+
23
+
24
def parse_obj_ts(ts_str):
    """Parse an ISO-8601 timestamp string into a timezone-aware datetime.

    A ``Z`` suffix is rewritten to ``+00:00`` before parsing, since
    ``datetime.fromisoformat`` only accepts ``Z`` directly on newer Python
    versions. A timestamp that carries no UTC offset is assumed to be UTC so
    the result can always be compared with, and subtracted from, other aware
    datetimes without raising ``TypeError``.

    Args:
        ts_str: Timestamp text; any falsy or non-string value is tolerated.

    Returns:
        An aware ``datetime``, or ``None`` when the input is empty, is not a
        string, or cannot be parsed.
    """
    if not ts_str:
        return None
    try:
        parsed = datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: not a str; ValueError: not ISO-8601.
        return None
    if parsed.tzinfo is None:
        # NOTE(review): naive timestamps are assumed to be UTC - confirm
        # against the session file writer.
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed
31
+
32
+
33
def parse_msg_ts(ms):
    """Convert an epoch value expressed in milliseconds to an aware UTC datetime.

    The value is truncated with ``int()`` before being scaled to seconds.

    Returns:
        A UTC ``datetime``, or ``None`` when ``ms`` is ``None`` or the
        conversion fails for any reason.
    """
    if ms is not None:
        try:
            seconds = int(ms) / 1000
            return datetime.fromtimestamp(seconds, tz=timezone.utc)
        except Exception:
            pass
    return None
40
+
41
+
42
def human_size(b):
    """Render a byte count as B, KB, MB or GB using 1024-based units.

    Values below 1024 are printed as-is with a ``B`` suffix; larger values
    are scaled and shown with one decimal place.
    """
    if b < 1024:
        return f"{b}B"
    scaled_units = (
        (1048576, 1024, "KB"),
        (1073741824, 1048576, "MB"),
    )
    for ceiling, divisor, unit in scaled_units:
        if b < ceiling:
            return f"{b / divisor:.1f}{unit}"
    return f"{b / 1073741824:.1f}GB"
50
+
51
+
52
def fmt_tokens(n):
    """Abbreviate a token count: ``M`` from one million, ``K`` from one thousand.

    Abbreviated values carry one decimal place; smaller values are returned
    via ``str(n)`` unchanged.
    """
    for threshold, suffix in ((1_000_000, "M"), (1_000, "K")):
        if n >= threshold:
            return f"{n / threshold:.1f}{suffix}"
    return str(n)
58
+
59
+
60
def fmt_duration(sec):
    """Format a duration in seconds as whole seconds, minutes, or hours.

    Below one minute the value is shown as ``Ns`` with no decimals; below one
    hour as minutes with one decimal; otherwise as hours with one decimal.
    """
    if sec < 60:
        amount, suffix, digits = sec, "s", 0
    elif sec < 3600:
        amount, suffix, digits = sec / 60, "m", 1
    else:
        amount, suffix, digits = sec / 3600, "h", 1
    return f"{amount:.{digits}f}{suffix}"
66
+
67
+
68
def pct(sorted_vals, p):
    """Pick the element at fraction ``p`` of an already-sorted sequence.

    The index is ``int(len * p)``, capped at the last element. An empty
    sequence yields ``0.0``.
    """
    if not sorted_vals:
        return 0.0
    count = len(sorted_vals)
    position = int(count * p)
    if position > count - 1:
        position = count - 1
    return sorted_vals[position]
74
+
75
+
76
def build_id_to_key_map(agent_dir):
    """Build a ``sessionId -> key`` lookup from ``<agent_dir>/sessions/sessions.json``.

    The file is expected to hold a JSON object whose values are objects with
    a ``sessionId`` field; each such entry contributes ``sessionId -> key``.
    Entries of any other shape are ignored.

    The lookup is best-effort: a missing, unreadable, undecodable or
    malformed file yields an empty mapping instead of raising.

    Args:
        agent_dir: Directory of a single agent.

    Returns:
        dict mapping session id to the key it is stored under.
    """
    sess_json = os.path.join(agent_dir, "sessions", "sessions.json")
    id_to_key = {}
    try:
        # JSON text is UTF-8; do not depend on the platform locale encoding.
        with open(sess_json, encoding="utf-8") as f:
            store = json.load(f)
    except (OSError, ValueError):
        # OSError: missing/unreadable file. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return id_to_key
    if not isinstance(store, dict):
        return id_to_key
    for key, entry in store.items():
        if not isinstance(entry, dict) or "sessionId" not in entry:
            continue
        try:
            id_to_key[entry["sessionId"]] = key
        except TypeError:
            # sessionId is a list/object and cannot be used as a dict key.
            continue
    return id_to_key
89
+
90
+
91
def _numeric(value):
    """Return ``value`` when it is an int or float, otherwise 0."""
    return value if isinstance(value, (int, float)) else 0


def _error_brief(content):
    """Return the first text found in a tool-result ``content`` payload.

    ``content`` may be a string or a list of strings / dicts (``text`` or
    ``content`` field). Newlines are flattened and the result is cut to 80
    characters.
    """
    brief = ""
    if isinstance(content, str):
        brief = content
    elif isinstance(content, list):
        for item in content:
            if isinstance(item, dict):
                text = item.get("text") or item.get("content") or ""
                if text:
                    brief = str(text)
                    break
            elif isinstance(item, str):
                brief = item
                break
    return brief.replace("\n", " ")[:80]


def analyze_session_file(fpath: str):
    """Aggregate per-session statistics from a JSONL session file.

    Each non-empty line is parsed as JSON. Lines that are not valid JSON, or
    whose top-level value is not an object, are skipped individually so one
    odd record does not discard the statistics of the whole file.

    For every record the ``timestamp`` field widens the first/last timestamp
    range. Records with a ``message`` object are counted by ``role``:

    * ``assistant`` messages count as model calls (except the
      ``openclaw`` provider's ``delivery-mirror`` / ``gateway-injected``
      models, which are not counted at all). Token usage, cost, latency and
      abnormal ``stopReason`` values are accumulated.
    * ``toolResult`` messages count as tool calls; durations and errors are
      accumulated per tool name.

    Args:
        fpath: Path of the session file.

    Returns:
        dict with the keys ``role_counts``, ``first_ts``, ``last_ts``,
        ``total_input``, ``total_output``, ``total_cache_read``,
        ``total_cache_write``, ``total_cost``, ``model_calls``,
        ``models_seen``, ``model_latencies``, ``last_call_input``,
        ``per_call_inputs``, ``tool_calls_total``, ``tool_errors``,
        ``tool_counts``, ``tool_durations``, ``anomalies`` (list of
        ``(time label, kind, detail)`` tuples) and ``parse_failed``
        (True when reading the file raised unexpectedly).
    """
    role_counts = defaultdict(int)
    first_ts = None
    last_ts = None
    total_input = total_output = total_cache_read = total_cache_write = 0
    total_cost = 0.0
    model_calls = 0
    models_seen = set()
    model_latencies = []
    last_call_input = None
    per_call_inputs = []
    tool_calls_total = 0
    tool_errors = 0
    tool_counts = defaultdict(int)
    tool_durations = defaultdict(list)
    anomalies = []
    parse_failed = False

    try:
        # JSON text is UTF-8; undecodable bytes are replaced, not fatal.
        with open(fpath, encoding="utf-8", errors="replace") as fp:
            for raw in fp:
                raw = raw.strip()
                if not raw:
                    continue
                try:
                    obj = json.loads(raw)
                except (ValueError, RecursionError):
                    continue
                if not isinstance(obj, dict):
                    # Valid JSON but not a record (e.g. a bare list/number).
                    continue

                obj_ts = parse_obj_ts(obj.get("timestamp"))
                if obj_ts is not None and obj_ts.tzinfo is None:
                    # Keep every timestamp aware so comparisons and the
                    # latency subtraction below cannot raise TypeError.
                    obj_ts = obj_ts.replace(tzinfo=timezone.utc)
                if obj_ts:
                    if first_ts is None or obj_ts < first_ts:
                        first_ts = obj_ts
                    if last_ts is None or obj_ts > last_ts:
                        last_ts = obj_ts

                msg = obj.get("message")
                if not isinstance(msg, dict):
                    continue
                role = msg.get("role", "")
                if not role or not isinstance(role, str):
                    continue
                label = obj_ts.strftime("%Y-%m-%d %H:%M:%S") if obj_ts else "?"

                if role == "assistant":
                    provider = str(msg.get("provider") or "")
                    model = str(msg.get("model") or "")
                    if provider == "openclaw" and model in ("delivery-mirror", "gateway-injected"):
                        # Synthetic messages: not a model call, not counted.
                        continue
                    role_counts[role] += 1
                    model_calls += 1
                    model_key = f"{provider}/{model}" if provider else model
                    if model_key:
                        models_seen.add(model_key)

                    usage = msg.get("usage")
                    if not isinstance(usage, dict):
                        usage = {}
                    inp = _numeric(usage.get("input"))
                    out_v = _numeric(usage.get("output"))
                    cr = _numeric(usage.get("cacheRead"))
                    cw = _numeric(usage.get("cacheWrite"))
                    cost_obj = usage.get("cost")
                    cost = _numeric(cost_obj.get("total")) if isinstance(cost_obj, dict) else 0
                    total_input += inp
                    total_output += out_v
                    total_cache_read += cr
                    total_cache_write += cw
                    total_cost += cost
                    per_call_inputs.append(inp + cr)
                    last_call_input = inp + cr

                    # Prefer the recorded duration; otherwise estimate it
                    # from the gap between record and message timestamps,
                    # accepting only gaps of 0..600 seconds.
                    dur_ms = usage.get("durationMs")
                    dur_s = None
                    if isinstance(dur_ms, (int, float)) and dur_ms >= 0:
                        dur_s = dur_ms / 1000.0
                    else:
                        msg_ts = parse_msg_ts(msg.get("timestamp"))
                        if obj_ts and msg_ts:
                            d = (obj_ts - msg_ts).total_seconds()
                            if 0 <= d <= 600:
                                dur_s = d
                    if dur_s is not None:
                        model_latencies.append(dur_s)

                    stop = str(msg.get("stopReason") or "")
                    if stop and stop not in NORMAL_STOPS:
                        detail = f"stop={stop} | in={fmt_tokens(inp + cr)} out={fmt_tokens(out_v)}"
                        anomalies.append((label, "model", detail))
                    continue

                role_counts[role] += 1
                if role == "toolResult":
                    tool_calls_total += 1
                    tname = str(msg.get("toolName") or "?")
                    tool_counts[tname] += 1
                    details = msg.get("details") or {}
                    dur_ms = details.get("durationMs") if isinstance(details, dict) else None
                    if isinstance(dur_ms, (int, float)) and dur_ms >= 0:
                        tool_durations[tname].append(dur_ms / 1000.0)
                    if bool(msg.get("isError", False)):
                        tool_errors += 1
                        err_brief = _error_brief(msg.get("content"))
                        detail = f"{tname} | isError=true"
                        if err_brief:
                            detail += f" | {err_brief}"
                        anomalies.append((label, "tool", detail))
    except Exception:
        # Best-effort boundary: an unreadable file is reported, not raised.
        parse_failed = True

    return dict(
        role_counts=role_counts, first_ts=first_ts, last_ts=last_ts,
        total_input=total_input, total_output=total_output,
        total_cache_read=total_cache_read, total_cache_write=total_cache_write,
        total_cost=total_cost, model_calls=model_calls, models_seen=models_seen,
        model_latencies=model_latencies, last_call_input=last_call_input,
        per_call_inputs=per_call_inputs, tool_calls_total=tool_calls_total,
        tool_errors=tool_errors, tool_counts=tool_counts, tool_durations=tool_durations,
        anomalies=anomalies, parse_failed=parse_failed,
    )
218
+
219
+
220
def session_data_dimension(out: output.Output, sessions_base: str) -> None:
    """Report on-disk session files and per-session statistics for each agent.

    Every ``<sessions_base>/<agent>/sessions`` directory is scanned for
    ``*.jsonl`` files and ``.jsonl.reset.`` archives (``*.trajectory.jsonl``
    is excluded). A disk summary is emitted for all of them; files modified
    within the last 7 days are analysed with ``analyze_session_file`` and
    printed per agent, newest first. Structured results are stored through
    ``out.set_data`` under ``disk_summary`` and ``agents``.
    """
    recent_threshold = time.time() - 7 * 86400
    file_total = 0
    byte_total = 0
    recent_by_agent = defaultdict(list)

    for agent_dir in sorted(glob.glob(os.path.join(sessions_base, "*"))):
        sess_dir = os.path.join(agent_dir, "sessions")
        if not (os.path.isdir(agent_dir) and os.path.isdir(sess_dir)):
            continue
        for entry in os.listdir(sess_dir):
            full_path = os.path.join(sess_dir, entry)
            is_session_log = entry.endswith(".jsonl") or ".jsonl.reset." in entry
            if (
                not os.path.isfile(full_path)
                or entry.endswith(".trajectory.jsonl")
                or not is_session_log
            ):
                continue
            try:
                info = os.stat(full_path)
            except OSError:
                continue
            file_total += 1
            byte_total += info.st_size
            if info.st_mtime >= recent_threshold:
                recent_by_agent[agent_dir].append(
                    (entry, full_path, info.st_size, info.st_mtime)
                )

    recent_total = sum(len(records) for records in recent_by_agent.values())
    out.item(
        f"Session 总览: {file_total} 个文件, 总大小 {human_size(byte_total)}, "
        f"活跃(7天内) {recent_total} 个"
    )
    out.set_data("disk_summary", {
        "total_files": file_total,
        "total_size_bytes": byte_total,
        "active_count": recent_total,
    })

    agents_data: dict = {}
    stamp_format = "%Y-%m-%d %H:%M:%S"
    preferred_roles = ["user", "assistant", "toolResult", "system"]

    for agent_dir in sorted(recent_by_agent):
        agent_name = os.path.basename(agent_dir)
        key_lookup = build_id_to_key_map(agent_dir)
        # Most recently modified session first.
        recent = sorted(recent_by_agent[agent_dir], key=lambda rec: rec[3], reverse=True)

        out.line("")
        out.item(f"Agent: {agent_name} ({len(recent)} 个 session)")

        summaries = []

        for name, path, size, _ in recent:
            session_id = name.split(".jsonl")[0]
            archived = ".reset." in name
            suffix = " [reset]" if archived else ""

            stats = analyze_session_file(path)
            session_key = key_lookup.get(session_id, "")
            size_text = human_size(size)

            if stats["parse_failed"]:
                out.line(f" {session_id}{suffix}")
                lead = f" sessionKey={session_key} | " if session_key else " "
                out.line(f"{lead}size={size_text} | <解析失败>")
                summaries.append({
                    "id": session_id,
                    "key": session_key,
                    "size_bytes": size,
                    "parse_failed": True,
                })
                continue

            if stats["first_ts"] and stats["last_ts"]:
                span_s = (stats["last_ts"] - stats["first_ts"]).total_seconds()
                span_text = fmt_duration(span_s)
                began = stats["first_ts"].strftime(stamp_format)
                ended = stats["last_ts"].strftime(stamp_format)
            else:
                span_s = None
                span_text = began = ended = "?"

            summaries.append({
                "id": session_id,
                "key": session_key,
                "size_bytes": size,
                "duration_s": span_s,
                "model_calls": stats["model_calls"],
                "tool_calls": stats["tool_calls_total"],
                "tool_errors": stats["tool_errors"],
                "anomaly_count": len(stats["anomalies"]),
                "is_reset": archived,
            })

            out.line(f" {session_id}{suffix}")
            key_part = f"sessionKey={session_key} | " if session_key else ""
            out.line(f" {key_part}size={size_text} | duration={span_text}")
            out.line(f" start={began} end={ended}")

            # Well-known roles first in a fixed order, then the rest sorted.
            counts = stats["role_counts"]
            role_bits = [f"{r}={counts[r]}" for r in preferred_roles if counts.get(r, 0)]
            role_bits += [
                f"{r}={counts[r]}"
                for r in sorted(counts.keys())
                if r not in preferred_roles and counts[r]
            ]
            if role_bits:
                out.line(f" messages: {' '.join(role_bits)}")

            if stats["model_calls"]:
                token_bits = [
                    f"in={fmt_tokens(stats['total_input'])}",
                    f"out={fmt_tokens(stats['total_output'])}",
                ]
                for shown, field in (
                    ("cache_read", "total_cache_read"),
                    ("cache_write", "total_cache_write"),
                ):
                    if stats[field]:
                        token_bits.append(f"{shown}={fmt_tokens(stats[field])}")
                cost_text = ""
                if stats["total_cost"] > 0:
                    cost_text = f" | cost=${stats['total_cost']:.4f}"
                out.line(f" tokens: {' '.join(token_bits)}{cost_text}")

                call_inputs = stats["per_call_inputs"]
                if call_inputs:
                    mean_in = sum(call_inputs) / len(call_inputs)
                    latest_in = stats["last_call_input"]
                    if latest_in is None:
                        latest_in = 0
                    out.line(f" context: avg_input={fmt_tokens(int(mean_in))} "
                             f"last_input={fmt_tokens(latest_in)}(当前上下文大小)")

                seen = stats["models_seen"]
                models_text = ", ".join(sorted(seen)) if seen else "?"
                out.line(f" model: [{models_text}] calls={stats['model_calls']}")

                if stats["model_latencies"]:
                    ordered = sorted(stats["model_latencies"])
                    median = pct(ordered, 0.50)
                    tail = pct(ordered, 0.95)
                    slowest = ordered[-1]
                    busy = sum(ordered)
                    rate = (stats["total_output"] / busy) if busy > 0 else 0.0
                    out.line(f" latency: P50={median:.1f}s P95={tail:.1f}s Max={slowest:.1f}s | "
                             f"throughput={rate:.1f} tok/s")

            if stats["tool_calls_total"]:
                failure_pct = stats["tool_errors"] / stats["tool_calls_total"] * 100
                out.line(f" tools: {stats['tool_calls_total']} calls | error_rate={failure_pct:.1f}%")
                busiest = sorted(stats["tool_counts"].items(), key=lambda kv: -kv[1])[:5]
                busiest_text = ", ".join(f"{tool}:{calls}" for tool, calls in busiest)
                out.line(f" top: [{busiest_text}]")

                # Only tools that reported at least one durationMs, most-called first.
                with_timing = sorted(
                    ((tool, secs) for tool, secs in stats["tool_durations"].items() if secs),
                    key=lambda pair: stats["tool_counts"][pair[0]],
                    reverse=True,
                )
                if with_timing:
                    timing_bits = []
                    for tool, secs in with_timing[:4]:
                        ranked = sorted(secs)
                        timing_bits.append(
                            f"{tool} P50={pct(ranked, 0.50):.2f}s P95={pct(ranked, 0.95):.2f}s"
                        )
                    out.line(f" 耗时(有 durationMs 的): {' | '.join(timing_bits)}")

            found = stats["anomalies"]
            if found:
                found_total = len(found)
                out.line(f" 异常({found_total}):")
                for when, kind, detail in found[:10]:
                    out.line(f" {when} | {kind} | {detail}")
                if found_total > 10:
                    out.line(f" ... 省略 {found_total - 10} 条 ...")

        agents_data[agent_name] = {
            "session_count": len(summaries),
            "sessions": summaries,
        }

    out.set_data("agents", agents_data)
392
+
393
+
394
# Matches a "stuck session: sessionId=... sessionKey=... state=... age=...
# queueDepth=..." log message and captures the five values in that order.
_STUCK_RE = re.compile(
    r"stuck session:\s*"
    r"sessionId=(\S+)\s+"
    r"sessionKey=(\S+)\s+"
    r"state=(\S+)\s+"
    r"age=(\S+)\s+"
    r"queueDepth=(\S+)"
)


def extract_stuck_match(obj):
    """Find a "stuck session" message inside a parsed JSON log record.

    The fields ``"1"``, ``"msg"`` and ``"message"`` are tried first (the
    first truthy one wins); if that does not match, every string value of
    the record that contains ``"stuck session"`` is tried.

    Args:
        obj: Parsed log record. Anything that is not a dict yields ``None``.

    Returns:
        The ``re.Match`` for ``_STUCK_RE`` (groups: sessionId, sessionKey,
        state, age, queueDepth), or ``None`` when nothing matches.
    """
    if not isinstance(obj, dict):
        return None
    raw = obj.get("1", "") or obj.get("msg", "") or obj.get("message", "")
    if not isinstance(raw, str):
        # Nested objects, lists or numbers: search their text form instead
        # of letting re.search raise TypeError.
        raw = str(raw)
    m = _STUCK_RE.search(raw)
    if m:
        return m
    for v in obj.values():
        if isinstance(v, str) and "stuck session" in v:
            m = _STUCK_RE.search(v)
            if m:
                return m
    return None
417
+
418
+
419
def _log_mtime(path):
    """Return the mtime of a regular file, or 0 if it is not one / has vanished."""
    try:
        return os.path.getmtime(path) if os.path.isfile(path) else 0
    except OSError:
        return 0


def stuck_dimension(out: output.Output, log_dir: str) -> None:
    """Scan ``openclaw-*.log`` files for "stuck session" records and report them.

    Each log line containing ``"stuck session"`` is parsed as JSON and
    matched with ``extract_stuck_match``. Matches are grouped by session id;
    for every session the first/last timestamp, the number of occurrences
    and the values of the latest record are reported. Lines that are not
    valid JSON objects are skipped instead of aborting the scan.

    Results are stored through ``out.set_data`` under ``scanned_logs`` (file
    names) and ``stuck_sessions`` (one dict per session, latest first).

    Args:
        out: Output sink; ``line``, ``item`` and ``set_data`` are used.
        log_dir: Directory that contains the log files.
    """
    out.line("")
    out.line(" ── Session Stuck 状态探测 ──")
    out.line("")

    # Newest log first.
    log_files = sorted(
        glob.glob(os.path.join(log_dir, "openclaw-*.log")),
        key=_log_mtime,
        reverse=True,
    )
    if not log_files:
        out.item("未找到任何日志文件")
        out.set_data("stuck_sessions", [])
        out.set_data("scanned_logs", [])
        return

    all_entries = []
    files_read = []
    for lf in log_files:
        files_read.append(os.path.basename(lf))
        try:
            with open(lf, encoding="utf-8", errors="replace") as f:
                for line in f:
                    # Cheap substring filter before paying for JSON parsing.
                    if "stuck session" not in line:
                        continue
                    try:
                        obj = json.loads(line.strip())
                    except (ValueError, RecursionError):
                        continue
                    if not isinstance(obj, dict):
                        continue
                    try:
                        m = extract_stuck_match(obj)
                    except (AttributeError, TypeError):
                        # Unexpected field types in the record; skip it.
                        continue
                    if not m:
                        continue
                    sess_id, sess_key, state, age, qd = m.group(1, 2, 3, 4, 5)
                    raw_time = obj.get("time")
                    # Keep "YYYY-MM-DDTHH:MM:SS"; a missing or non-string
                    # time becomes "" rather than raising on the slice.
                    ts = raw_time[:19] if isinstance(raw_time, str) else ""
                    all_entries.append((ts, sess_id, sess_key, state, age, qd, lf))
        except OSError:
            continue

    all_entries.sort(key=lambda x: x[0])
    out.set_data("scanned_logs", files_read)
    if not all_entries:
        out.item(f"扫描: {', '.join(files_read)}")
        out.item("日志中未出现 stuck session 记录")
        out.set_data("stuck_sessions", [])
        return

    sessions = defaultdict(lambda: {
        "count": 0, "first_ts": "", "last_ts": "", "state": "",
        "age": "", "queueDepth": "", "sessionKey": "", "logfile": "",
    })
    # Entries are in ascending time order, so the last write per session wins.
    for ts, sess_id, sess_key, state, age, qd, lf in all_entries:
        s = sessions[sess_id]
        s["count"] += 1
        if not s["first_ts"]:
            s["first_ts"] = ts
        s["last_ts"] = ts
        s["state"] = state
        s["age"] = age
        s["queueDepth"] = qd
        s["sessionKey"] = sess_key
        s["logfile"] = os.path.basename(lf)

    latest_logfile = os.path.basename(all_entries[-1][6])
    out.item(f"扫描: {', '.join(files_read)}")
    out.item(f"最新条目: {all_entries[-1][0]} [来自 {latest_logfile}]")
    out.item(f"检测到 {len(sessions)} 个 stuck session(按最后出现时间排序):")
    stuck_payload = []
    for sid, s in sorted(sessions.items(), key=lambda x: x[1]["last_ts"], reverse=True):
        out.item(f" {s['sessionKey']} (sessionId={sid}) [{s['logfile']}]")
        out.item(f" state={s['state']} age={s['age']} queueDepth={s['queueDepth']}")
        out.item(f" 首次: {s['first_ts']} 最后: {s['last_ts']} 共 {s['count']} 条")
        stuck_payload.append({
            "sessionId": sid,
            "sessionKey": s["sessionKey"],
            "state": s["state"],
            "age": s["age"],
            "queueDepth": s["queueDepth"],
            "first_ts": s["first_ts"],
            "last_ts": s["last_ts"],
            "count": s["count"],
            "logfile": s["logfile"],
        })
    out.set_data("stuck_sessions", stuck_payload)
500
+
501
+
502
def main() -> int:
    """Parse the common CLI arguments, run both dimensions and finish the output.

    Returns:
        Whatever ``out.done()`` returns; the script entry point passes it to
        ``sys.exit``.
    """
    args = cli.build_common_parser(
        prog="08_sessions",
        description="模块 8:Session 数据采集 + Stuck 探测",
    ).parse_args()

    out = output.init("sessions", json_mode=args.json, no_color=args.no_color)
    out.section("模块 8:Session 数据")

    session_data_dimension(out, args.sessions_base)
    stuck_dimension(out, args.log_dir)

    return out.done()
515
+
516
+
517
# Script entry point: run main() and pass its return value to sys.exit().
if __name__ == "__main__":
    sys.exit(main())