openclaw-diag-cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +260 -0
- package/bin/ocdiag +14 -0
- package/bin/openclaw-diag.js +275 -0
- package/diag/01_sys_health.py +443 -0
- package/diag/02_environment.py +292 -0
- package/diag/03_configuration.py +131 -0
- package/diag/04_gateway.py +651 -0
- package/diag/05_recent_errors.py +246 -0
- package/diag/06_cron_jobs.py +694 -0
- package/diag/07_performance.py +687 -0
- package/diag/08_sessions.py +518 -0
- package/diag/09_plugin_diag.py +535 -0
- package/diag/10_shell_history.py +121 -0
- package/diag/__init__.py +0 -0
- package/lib/bundle.py +204 -0
- package/ocdiag/__init__.py +3 -0
- package/ocdiag/cli.py +39 -0
- package/ocdiag/dispatcher.py +137 -0
- package/ocdiag/jsonlog.py +65 -0
- package/ocdiag/output.py +131 -0
- package/ocdiag/paths.py +48 -0
- package/ocdiag/recent_logs.py +53 -0
- package/ocdiag/sensitive.py +41 -0
- package/ocdiag/timeutil.py +77 -0
- package/ocdiag/tokens.py +46 -0
- package/package.json +42 -0
- package/tools/__init__.py +0 -0
- package/tools/oc_session_extract.py +254 -0
- package/tools/oc_session_trace.py +715 -0
|
@@ -0,0 +1,715 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Trace the processing timeline of a user message in an OpenClaw session.
|
|
3
|
+
|
|
4
|
+
Channel-agnostic. Uses only universal data sources:
|
|
5
|
+
1. session.jsonl (required) — message-level timeline
|
|
6
|
+
2. trajectory.jsonl (optional) — run-level metadata
|
|
7
|
+
3. gateway log (optional) — embedded run start/prompt start/prompt end
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import glob
|
|
14
|
+
import json
|
|
15
|
+
import os
|
|
16
|
+
import re
|
|
17
|
+
import sys
|
|
18
|
+
from datetime import datetime, timezone
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
21
|
+
|
|
22
|
+
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
23
|
+
|
|
24
|
+
from ocdiag import paths
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Default search locations, taken from the shared ``ocdiag.paths`` module.
# ``main`` exposes both as overridable options (--base-dir / --log-dir).
DEFAULT_BASE_DIR, DEFAULT_LOG_DIR = paths.SESSIONS_BASE, paths.LOG_DIR
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def iso_to_epoch_ms(iso: str) -> int:
    """Convert an ISO-8601 timestamp string to epoch milliseconds.

    ``Z`` is rewritten to ``+00:00`` before parsing so UTC-suffixed values are
    accepted by ``datetime.fromisoformat``.  A timestamp without an offset is
    interpreted by ``datetime.timestamp()`` as local time.

    Returns:
        Milliseconds since the Unix epoch, or ``0`` when *iso* is not a string
        or cannot be parsed.  Callers treat ``0`` as "unknown".
    """
    # Records come from JSON, so the field may be null or numeric; treat those
    # like any other unparsable value instead of raising AttributeError.
    if not isinstance(iso, str):
        return 0
    try:
        parsed = datetime.fromisoformat(iso.replace("Z", "+00:00"))
    except ValueError:
        return 0
    return int(parsed.timestamp() * 1000)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def epoch_ms_to_iso(ms: int) -> str:
    """Render epoch milliseconds as a UTC ``YYYY-MM-DDTHH:MM:SS.mmmZ`` string."""
    stamp = datetime.fromtimestamp(ms / 1000, tz=timezone.utc)
    millis = ms % 1000
    return f"{stamp:%Y-%m-%dT%H:%M:%S}.{millis:03d}Z"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def fmt_duration(ms: float) -> str:
    """Format a millisecond duration as ``"123ms"``, ``"4.5s"`` or ``"2m3.4s"``.

    Values below one second are shown in whole milliseconds, values below one
    minute in seconds with one decimal, and anything longer as minutes plus
    seconds with one decimal.

    When rounding pushes a value up to the next unit the carry is applied, so
    the result never reads ``"1000ms"``, ``"60.0s"`` or ``"1m60.0s"``.
    """
    if ms < 1000:
        whole = f"{ms:.0f}"
        # e.g. 999.7 rounds to "1000": promote to the seconds form.
        return "1.0s" if whole == "1000" else f"{whole}ms"
    if ms < 60_000:
        secs = f"{ms / 1000:.1f}"
        # e.g. 59_980 rounds to "60.0": promote to the minutes form.
        return "1m0.0s" if secs == "60.0" else f"{secs}s"
    minutes = int(ms // 60_000)
    secs = f"{(ms % 60_000) / 1000:.1f}"
    if secs == "60.0":
        # Carry the rounded-up seconds into the minute count.
        minutes += 1
        secs = "0.0"
    return f"{minutes}m{secs}s"
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def human_size(n: int) -> str:
    """Format a byte count using binary (1024-based) units from B up to TB.

    Byte values are printed as given; every larger unit is printed with one
    decimal place.
    """
    if n < 1024:
        return f"{n} B"
    scaled = n / 1024
    for suffix in ("KB", "MB", "GB"):
        if scaled < 1024:
            return f"{scaled:.1f} {suffix}"
        scaled /= 1024
    # Anything that survived the GB step is reported in terabytes.
    return f"{scaled:.1f} TB"
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def extract_text(content: Any) -> str:
    """Flatten a message ``content`` value into a single display string.

    * a string is returned unchanged;
    * a list contributes the ``text`` of each ``{"type": "text"}`` item and a
      ``[toolCall:<name>]`` tag for each ``{"type": "toolCall"}`` item, joined
      with single spaces (other items are ignored);
    * anything else is passed through ``str()``.
    """
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return str(content)

    pieces = []
    for item in content:
        if not isinstance(item, dict):
            continue
        kind = item.get("type")
        if kind == "text":
            pieces.append(item.get("text", ""))
        elif kind == "toolCall":
            pieces.append(f"[toolCall:{item.get('name','')}]")
    return " ".join(pieces)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def find_session_file(
    session_id: str,
    base_dir: str = DEFAULT_BASE_DIR,
    agent: Optional[str] = None,
) -> Optional[str]:
    """Locate the session log for *session_id* under ``<base_dir>/<agent>/sessions``.

    When *agent* is given only that agent directory is searched; otherwise
    every entry directly under *base_dir* is tried in sorted order.

    Candidates are files whose name starts with *session_id*, excluding
    trajectory files and ``*.json`` files.  They are ranked by kind:
    the live ``<id>.jsonl`` first, then ``.jsonl.deleted.*``, ``.jsonl.reset.*``
    and ``.jsonl.bak-*`` snapshots.

    Ties within one kind are broken deterministically: earlier agent directory
    first, then the most recently modified file, then the file name.  This
    avoids depending on the arbitrary order of ``os.listdir``.

    Returns:
        The path of the best candidate, or ``None`` when nothing matches.
    """
    if agent:
        agent_dirs = [os.path.join(base_dir, agent)]
    else:
        agent_dirs = sorted(glob.glob(os.path.join(base_dir, "*")))

    # Marker substring -> kind rank for archived snapshots.  The live file is
    # matched by exact name and always ranks first (0).
    archived_kinds = (
        (".jsonl.deleted.", 1),
        (".jsonl.reset.", 2),
        (".jsonl.bak-", 3),
    )
    active_name = f"{session_id}.jsonl"

    ranked = []
    for agent_rank, agent_dir in enumerate(agent_dirs):
        sessions_dir = os.path.join(agent_dir, "sessions")
        if not os.path.isdir(sessions_dir):
            continue
        for entry in os.listdir(sessions_dir):
            if not entry.startswith(session_id):
                continue
            if ".trajectory" in entry or entry.endswith(".json"):
                continue
            full = os.path.join(sessions_dir, entry)
            if not os.path.isfile(full):
                continue
            if entry == active_name:
                kind_rank = 0
            else:
                kind_rank = next(
                    (rank for marker, rank in archived_kinds if marker in entry),
                    None,
                )
                if kind_rank is None:
                    continue
            try:
                mtime = os.path.getmtime(full)
            except OSError:
                # File vanished or is unreadable: rank it as oldest.
                mtime = 0.0
            ranked.append((kind_rank, agent_rank, -mtime, entry, full))

    if not ranked:
        return None
    return min(ranked)[-1]
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def find_trajectory_file(session_file: str) -> Optional[str]:
    """Return the ``<id>.trajectory.jsonl`` sibling of *session_file*, if present.

    The id is everything in the file name before the first ``.jsonl``, so
    archived names such as ``<id>.jsonl.deleted.*`` map to the same file.
    """
    folder, filename = os.path.split(session_file)
    stem = filename.partition(".jsonl")[0]
    candidate = os.path.join(folder, f"{stem}.trajectory.jsonl")
    if os.path.isfile(candidate):
        return candidate
    return None
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def find_gateway_logs(log_dir: str) -> List[str]:
    """List the ``openclaw-*.log`` files directly inside *log_dir*, sorted by path."""
    pattern = os.path.join(log_dir, "openclaw-*.log")
    matches = glob.glob(pattern)
    matches.sort()
    return matches
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def load_records(filepath: str) -> List[Dict]:
    """Read a JSON-lines file and return its object records in file order.

    Blank lines, lines that are not valid JSON, and JSON values that are not
    objects (lists, numbers, strings, ...) are skipped, so every returned item
    supports ``.get``.

    The file is decoded as UTF-8 regardless of the platform locale.  Undecodable
    bytes are replaced rather than aborting the whole read.

    Raises:
        OSError: if *filepath* cannot be opened.
    """
    records: List[Dict] = []
    with open(filepath, "r", encoding="utf-8", errors="replace") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                rec = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Downstream code calls rec.get(...); drop anything that is not a dict.
            if isinstance(rec, dict):
                records.append(rec)
    return records
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def find_user_messages(records: List[Dict]) -> List[Tuple[int, Dict]]:
    """Return ``(index, record)`` pairs for every user-role message record.

    A record qualifies when its ``type`` is ``"message"`` and its ``message``
    payload is a dict whose ``role`` is ``"user"``.  Records whose ``message``
    is missing, null or not a dict are ignored instead of raising.
    """
    result = []
    for i, r in enumerate(records):
        if r.get("type") != "message":
            continue
        msg = r.get("message")
        if isinstance(msg, dict) and msg.get("role") == "user":
            result.append((i, r))
    return result
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def find_first_message(records: List[Dict]) -> List[Tuple[int, Dict]]:
    """Fallback selector: every ``type == "message"`` record, whatever its role.

    Returns ``(index, record)`` pairs for records whose ``message`` payload is
    a dict.
    """
    return [
        (pos, rec)
        for pos, rec in enumerate(records)
        if rec.get("type") == "message" and isinstance(rec.get("message"), dict)
    ]
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def select_user_message(records, msg_index=None, msg_id=None, msg_match=None):
    """Pick the message record to trace and return it as ``(record_index, record)``.

    Candidates are the user-role messages.  When a session has none, all
    message records are used instead and a note is written to stderr.

    Selectors are honoured in this order: *msg_id* (exact ``id`` match),
    *msg_match* (first candidate whose extracted text contains the substring),
    *msg_index* (0-based position among the candidates).  With no selector the
    last candidate is returned.  A selector that matches nothing prints an
    error to stderr and exits with status 1.
    """
    def _die(text):
        print(text, file=sys.stderr)
        sys.exit(1)

    candidates = find_user_messages(records)
    if not candidates:
        # No user messages — fall back to scanning all message records so trace
        # still works for assistant-only streams (e.g. cron delivery sessions).
        candidates = find_first_message(records)
        if not candidates:
            _die("Error: no message records found in session")
        print(
            f"Note: no user-role messages; tracing from first message record "
            f"({len(candidates)} message(s) total)",
            file=sys.stderr,
        )

    if msg_id is not None:
        hit = next((pair for pair in candidates if pair[1].get("id") == msg_id), None)
        if hit is not None:
            return hit
        _die(f"Error: no message with id '{msg_id}'")

    if msg_match is not None:
        for pair in candidates:
            body = pair[1].get("message", {}).get("content", "")
            if msg_match in extract_text(body):
                return pair
        _die(f"Error: no message matching '{msg_match}'")

    if msg_index is None:
        return candidates[-1]
    if not 0 <= msg_index < len(candidates):
        _die(f"Error: msg-index {msg_index} out of range (0..{len(candidates)-1})")
    return candidates[msg_index]
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def extract_trace_records(records, start_idx):
    """Return the records from *start_idx* up to, but excluding, the next user message.

    The record at *start_idx* is always included.  The slice ends just before
    the first later record that is a ``message`` with role ``"user"``, or at the
    end of *records* when there is none.
    """
    stop = len(records)
    for pos in range(start_idx + 1, len(records)):
        candidate = records[pos]
        if candidate.get("type") != "message":
            continue
        if candidate.get("message", {}).get("role") == "user":
            stop = pos
            break
    return [records[pos] for pos in range(start_idx, stop)]
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def _tool_batch_duration(results, prev_epoch):
|
|
208
|
+
if not results or prev_epoch is None:
|
|
209
|
+
return 0
|
|
210
|
+
max_ts = max(r.get("message", {}).get("timestamp", 0) for r in results)
|
|
211
|
+
return max(0, max_ts - prev_epoch)
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def _flush_tool_batch(events, tool_execs, results, base_ms, prev_epoch):
    """Record one batch of tool results as a timeline event plus per-tool entries.

    Appends a single ``tool_batch`` event to *events* and one entry per result
    to *tool_execs*; does nothing when *results* is empty.  The batch starts at
    *prev_epoch* (falling back to *base_ms*) and ends at the latest result
    timestamp.  Each tool's own duration is measured from the batch start.
    """
    if not results:
        return

    started = prev_epoch or base_ms
    payloads = [rec.get("message", {}) for rec in results]
    finished = max(p.get("timestamp", 0) for p in payloads)
    elapsed = max(0, finished - started)

    # Count calls per tool (first-seen order) and tally failures.
    tally: Dict[str, int] = {}
    failed = 0
    for p in payloads:
        tool = p.get("toolName", "?")
        tally[tool] = tally.get(tool, 0) + 1
        if p.get("isError"):
            failed += 1

    labels = []
    for tool, hits in tally.items():
        suffix = f" ×{hits}" if hits > 1 else ""
        labels.append(f"{tool}" + suffix)
    tools_str = " + ".join(labels)
    status = f"{failed} error(s)" if failed != 0 else "ok"

    events.append({
        "offset_ms": max(0, (started - base_ms)),
        "type": "tool_batch",
        "detail": f"{tools_str} → {status} ({fmt_duration(elapsed)})",
        "count": len(results),
        "duration_ms": elapsed,
    })

    for p in payloads:
        ended = p.get("timestamp", 0)
        if ended and started:
            took = max(0, ended - started)
        else:
            took = 0
        tool_execs.append({
            "name": p.get("toolName", "?"),
            "duration_ms": took,
            "is_error": p.get("isError", False),
        })
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def analyze_phases(trace):
    """Build the timeline, per-call details and summary totals for one traced message.

    ``trace[0]`` is the record being traced.  Its ``message.timestamp`` (or,
    when that is missing or zero, its ISO ``timestamp`` field) becomes the time
    base; every event offset is in milliseconds relative to it.

    For each later record:

    * an assistant message yields a ``model_start``/``model_end`` event pair
      and a ``model_calls`` entry.  Its duration is the record's ISO
      ``timestamp`` minus ``message.timestamp``.
    * tool results are buffered and flushed as one ``tool_batch`` event when
      the next assistant message (or the end of the trace) is reached.
    * ``openclaw:prompt-error`` custom records yield an ``error`` event.

    Returns:
        A dict with keys ``events``, ``model_calls``, ``tool_execs``,
        ``summary`` and ``base_epoch_ms``.  ``summary["total_ms"]`` is the
        largest event offset, so a trailing event without a usable timestamp
        (offset 0) cannot reset the total.
    """
    events: List[Dict] = []
    model_calls: List[Dict] = []
    tool_execs: List[Dict] = []

    user_rec = trace[0]
    user_msg = user_rec.get("message") or {}
    base_ms = user_msg.get("timestamp", 0)
    if not base_ms:
        base_ms = iso_to_epoch_ms(user_rec.get("timestamp") or "")

    events.append({"offset_ms": 0, "type": "user", "detail": "Message received"})

    model_num = 0
    tool_num = 0
    prev_assistant_record_epoch: Optional[int] = None
    pending_tool_results: List[Dict] = []
    total_model_ms = 0
    total_tool_ms = 0
    total_input_tokens = 0
    total_output_tokens = 0
    total_cache_read = 0
    total_cache_write = 0

    def _close_tool_batch(batch, prev_epoch):
        """Emit the batch event/executions and return the batch duration in ms."""
        _flush_tool_batch(events, tool_execs, batch, base_ms, prev_epoch)
        return _tool_batch_duration(batch, prev_epoch)

    for r in trace[1:]:
        rtype = r.get("type")
        if rtype == "message":
            msg = r.get("message") or {}
            role = msg.get("role")
            if role == "assistant":
                # A new model call closes whatever tool batch preceded it.
                if pending_tool_results:
                    total_tool_ms += _close_tool_batch(
                        pending_tool_results, prev_assistant_record_epoch
                    )
                    tool_num += len(pending_tool_results)
                    pending_tool_results = []

                model_num += 1
                msg_ts = msg.get("timestamp") or 0
                record_epoch = iso_to_epoch_ms(r.get("timestamp") or "")
                duration_ms = record_epoch - msg_ts if (record_epoch and msg_ts) else 0

                # Usage fields may be absent or null in the log; count them as 0.
                usage = msg.get("usage") or {}
                out_tok = usage.get("output") or 0
                in_tok = usage.get("input") or 0
                cache_r = usage.get("cacheRead") or 0
                cache_w = usage.get("cacheWrite") or 0

                stop = msg.get("stopReason", "")
                provider = msg.get("provider", "")
                model = msg.get("model", "")
                rate = out_tok / (duration_ms / 1000) if duration_ms > 0 else 0
                start_offset = msg_ts - base_ms if msg_ts else 0
                end_offset = record_epoch - base_ms if record_epoch else 0

                events.append({
                    "offset_ms": start_offset, "type": "model_start", "num": model_num,
                    "detail": f"Call started → {provider}/{model}" if provider else "Call started",
                })
                events.append({
                    "offset_ms": end_offset, "type": "model_end", "num": model_num,
                    "detail": f"Completed (stopReason={stop})" + (" ← FINAL" if stop == "stop" else ""),
                    "duration_ms": duration_ms, "tokens_in": in_tok, "tokens_out": out_tok,
                    "cache_read": cache_r, "cache_write": cache_w, "rate": round(rate, 1),
                })

                content = msg.get("content", [])
                tool_names = []
                if isinstance(content, list):
                    tool_names = [
                        c.get("name", "?")
                        for c in content
                        if isinstance(c, dict) and c.get("type") == "toolCall"
                    ]

                model_calls.append({
                    "num": model_num, "duration_ms": duration_ms,
                    "tokens_out": out_tok, "tokens_in": in_tok,
                    "cache_read": cache_r, "cache_write": cache_w,
                    "stop_reason": stop, "tool_names": tool_names,
                    "provider": provider, "model": model, "rate": round(rate, 1),
                })
                total_model_ms += duration_ms
                total_input_tokens += in_tok
                total_output_tokens += out_tok
                total_cache_read += cache_r
                total_cache_write += cache_w
                prev_assistant_record_epoch = record_epoch
            elif role == "toolResult":
                pending_tool_results.append(r)
        elif rtype == "custom" and r.get("customType") == "openclaw:prompt-error":
            data = r.get("data") or {}
            err_ts = data.get("timestamp") or 0
            offset = err_ts - base_ms if err_ts else 0
            events.append({
                "offset_ms": offset, "type": "error",
                "detail": f"prompt-error: {data.get('error', '?')}",
                "provider": data.get("provider", ""), "model": data.get("model", ""),
            })

    # Tool results with no following assistant message still count.
    if pending_tool_results:
        total_tool_ms += _close_tool_batch(
            pending_tool_results, prev_assistant_record_epoch
        )
        tool_num += len(pending_tool_results)

    # Events are appended in record order, which is not guaranteed to be the
    # order of their offsets; take the furthest point reached.
    total_ms = max(ev["offset_ms"] for ev in events)
    return {
        "events": events, "model_calls": model_calls, "tool_execs": tool_execs,
        "summary": {
            "total_ms": total_ms, "model_count": model_num,
            "model_total_ms": total_model_ms, "tool_count": tool_num,
            "tool_total_ms": total_tool_ms, "total_input_tokens": total_input_tokens,
            "total_output_tokens": total_output_tokens,
            "total_cache_read": total_cache_read, "total_cache_write": total_cache_write,
        },
        "base_epoch_ms": base_ms,
    }
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def load_trajectory_info(traj_path, base_epoch_ms):
    """Extract run-level metadata for the run closest to *base_epoch_ms*.

    The trajectory file is read as UTF-8 JSON lines and grouped by ``runId``.
    Unparsable lines, non-object values and records without a ``runId`` are
    skipped.  The chosen run is the one whose first ``session.started`` event
    is nearest to *base_epoch_ms*, provided it is within 60 seconds.

    Returns:
        ``None`` when the file cannot be read, holds no runs, or no run is
        close enough.  Otherwise a dict with ``runId`` plus, when the matching
        events exist, ``trigger``, ``toolCount``, ``model_config``, ``status``,
        ``aborted``, ``timedOut``, ``context_compilation_ms`` and
        ``prompt_submission_ms``.
    """
    def _event_ms(event):
        # Null or non-string "ts" values are treated as unknown (0).
        raw = event.get("ts", "")
        return iso_to_epoch_ms(raw) if isinstance(raw, str) else 0

    runs: Dict[str, List[Dict]] = {}
    try:
        with open(traj_path, "r", encoding="utf-8", errors="replace") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    rec = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if not isinstance(rec, dict):
                    continue
                rid = rec.get("runId", "")
                if rid:
                    runs.setdefault(rid, []).append(rec)
    except OSError:
        return None
    if not runs:
        return None

    best_run = None
    best_delta = float("inf")
    for rid, evts in runs.items():
        # Only the first session.started event of each run is considered.
        started = next((e for e in evts if e.get("type") == "session.started"), None)
        if started is None:
            continue
        delta = abs(_event_ms(started) - base_epoch_ms)
        if delta < best_delta:
            best_delta = delta
            best_run = rid
    if best_run is None or best_delta > 60_000:
        return None

    info: Dict[str, Any] = {"runId": best_run}
    ts_map: Dict[str, int] = {}
    for e in runs[best_run]:
        etype = e.get("type", "")
        # A later event of the same type overwrites the earlier timestamp.
        ts_map[etype] = _event_ms(e)
        data = e.get("data") or {}
        if etype == "session.started":
            info["trigger"] = data.get("trigger")
            info["toolCount"] = data.get("toolCount")
        elif etype == "trace.metadata":
            model_info = data.get("model") or {}
            info["model_config"] = {
                k: model_info.get(k)
                for k in ("provider", "name", "api", "thinkLevel", "reasoningLevel")
                if model_info.get(k) is not None
            }
        elif etype == "session.ended":
            info["status"] = data.get("status")
            info["aborted"] = data.get("aborted")
            info["timedOut"] = data.get("timedOut")

    if "session.started" in ts_map and "context.compiled" in ts_map:
        info["context_compilation_ms"] = ts_map["context.compiled"] - ts_map["session.started"]
    if "context.compiled" in ts_map and "prompt.submitted" in ts_map:
        info["prompt_submission_ms"] = ts_map["prompt.submitted"] - ts_map["context.compiled"]
    return info
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
def _parse_log_ts(ts_str):
|
|
427
|
+
try:
|
|
428
|
+
dt = datetime.fromisoformat(ts_str)
|
|
429
|
+
return int(dt.timestamp() * 1000)
|
|
430
|
+
except (ValueError, TypeError):
|
|
431
|
+
return None
|
|
432
|
+
|
|
433
|
+
|
|
434
|
+
def load_gateway_timing(log_files, session_id, base_epoch_ms):
    """Collect embedded-run timing for *session_id* from the gateway logs.

    Only log files whose name contains the UTC date of *base_epoch_ms* are
    read.  Within them, lines mentioning both the session id and
    ``agent/embedded`` are parsed as JSON.  The message text is taken from key
    ``"1"`` and the time from key ``"time"``.

    Three markers are recognised, each requiring ``sessionId=<session_id>`` in
    the message:

    * ``embedded run start:`` / ``embedded run prompt start:`` are accepted
      when within 120 seconds of *base_epoch_ms*;
    * ``embedded run prompt end:`` is accepted when later than the recorded
      run start.  Its ``durationMs=<n>`` value, if any, is captured.

    When a marker occurs more than once, the last accepted occurrence wins.
    Files are decoded as UTF-8; unreadable files, unparsable lines and records
    of an unexpected shape are skipped.

    Returns:
        ``None`` when no run start was found or nothing could be derived.
        Otherwise a dict with any of ``run_to_prompt_ms``,
        ``prompt_duration_ms`` and ``reported_duration_ms``.
    """
    if not log_files:
        return None
    run_start = None
    prompt_start = None
    prompt_end = None
    duration = None
    base_date = epoch_ms_to_iso(base_epoch_ms)[:10]
    session_tag = f"sessionId={session_id}"

    for lf in log_files:
        if base_date not in os.path.basename(lf):
            continue
        try:
            with open(lf, "r", encoding="utf-8", errors="replace") as f:
                for line in f:
                    # Cheap substring pre-filter before paying for JSON parsing.
                    if session_id not in line or "agent/embedded" not in line:
                        continue
                    try:
                        rec = json.loads(line.strip())
                    except json.JSONDecodeError:
                        continue
                    if not isinstance(rec, dict):
                        continue
                    msg = rec.get("1", "")
                    if not isinstance(msg, str) or session_tag not in msg:
                        continue
                    ts_str = rec.get("time", "")
                    if "embedded run start:" in msg:
                        ts = _parse_log_ts(ts_str)
                        if ts and abs(ts - base_epoch_ms) < 120_000:
                            run_start = ts
                    elif "embedded run prompt start:" in msg:
                        ts = _parse_log_ts(ts_str)
                        if ts and abs(ts - base_epoch_ms) < 120_000:
                            prompt_start = ts
                    elif "embedded run prompt end:" in msg:
                        ts = _parse_log_ts(ts_str)
                        if run_start and ts and ts > run_start:
                            prompt_end = ts
                            m = re.search(r"durationMs=(\d+)", msg)
                            duration = int(m.group(1)) if m else None
        except OSError:
            continue

    if run_start is None:
        return None
    result: Dict[str, Any] = {}
    if run_start and prompt_start:
        result["run_to_prompt_ms"] = prompt_start - run_start
    if prompt_start and prompt_end:
        result["prompt_duration_ms"] = prompt_end - prompt_start
    if duration:
        result["reported_duration_ms"] = duration
    return result if result else None
|
|
482
|
+
|
|
483
|
+
|
|
484
|
+
# Horizontal rules for the plain-text report: SEP frames the banner, LINE
# frames the timeline section.
_RULE_WIDTH = 66
SEP = "═" * _RULE_WIDTH
LINE = "─" * _RULE_WIDTH
|
|
486
|
+
|
|
487
|
+
|
|
488
|
+
def _pct(part, total):
|
|
489
|
+
if total == 0:
|
|
490
|
+
return "0%"
|
|
491
|
+
return f"{part / total * 100:.1f}%"
|
|
492
|
+
|
|
493
|
+
|
|
494
|
+
def format_text(session_id, user_msg_index, user_msg_id, analysis,
                traj_info=None, gw_info=None):
    """Render an analysis as the human-readable plain-text report.

    The report has a banner, the event timeline, a summary with optional
    per-model and per-tool breakdowns, and — when *traj_info* / *gw_info* are
    provided and non-empty — the trajectory metadata and gateway timing
    sections.  Returns the report as one newline-joined string.
    """
    out: List[str] = [
        SEP,
        f"Message Trace: session {session_id}",
        f"User Message #{user_msg_index} (id: {user_msg_id})",
        SEP,
        "",
        "Timeline:",
        LINE,
    ]
    emit = out.append

    # ---- timeline -------------------------------------------------------
    simple_tags = {"user": "[user]", "tool_batch": "[tool]", "error": "[ERROR]"}
    for ev in analysis["events"]:
        off = ev["offset_ms"]
        kind = ev["type"]
        detail = ev.get("detail", "")
        if kind in simple_tags:
            emit(f" T+{off:<10} {simple_tags[kind]} {detail}")
        elif kind in ("model_start", "model_end"):
            emit(f" T+{off:<10} [model #{ev['num']}] {detail}")
            if kind == "model_end":
                token_line = f" ├─ tokens: in={ev.get('tokens_in',0)} out={ev.get('tokens_out',0)}"
                if ev.get("cache_read"):
                    token_line += f" cache_read={ev['cache_read']}"
                if ev.get("cache_write"):
                    token_line += f" cache_write={ev['cache_write']}"
                emit(token_line)
                emit(f" ├─ duration: {fmt_duration(ev.get('duration_ms', 0))}")
                emit(f" └─ rate: {ev.get('rate', 0)} tok/s")
    emit(LINE)
    emit("")

    # ---- summary --------------------------------------------------------
    s = analysis["summary"]
    total = s["total_ms"]
    emit("Summary:")
    emit(f" Total time: {fmt_duration(total)}")

    model_line = f" Model calls: {s['model_count']}"
    if s["model_count"]:
        model_line += f", total {fmt_duration(s['model_total_ms'])} ({_pct(s['model_total_ms'], total)})"
    emit(model_line)

    tool_line = f" Tool executions: {s['tool_count']}"
    if s["tool_count"]:
        tool_line += f", total {fmt_duration(s['tool_total_ms'])} ({_pct(s['tool_total_ms'], total)})"
    emit(tool_line)

    token_line = f" Tokens: in={s['total_input_tokens']} out={s['total_output_tokens']}"
    if s["total_cache_read"]:
        token_line += f" cache_read={s['total_cache_read']}"
    if s["total_cache_write"]:
        token_line += f" cache_write={s['total_cache_write']}"
    emit(token_line)

    if s["model_total_ms"] > 0:
        avg_rate = s["total_output_tokens"] / (s["model_total_ms"] / 1000)
    else:
        avg_rate = 0
    emit(f" Avg output rate: {avg_rate:.1f} tok/s")
    emit("")

    # ---- per-model breakdown ---------------------------------------------
    if analysis["model_calls"]:
        emit(" Model breakdown:")
        for mc in analysis["model_calls"]:
            reason = mc["stop_reason"]
            if reason == "toolUse" and mc["tool_names"]:
                called = mc["tool_names"]
                if len(called) > 3:
                    # Long tool lists are abbreviated to the first name plus a count.
                    shown = f"{called[0]}+{len(called)-1}more"
                else:
                    shown = ",".join(called)
                suffix = f" (toolUse → {shown})"
            elif reason == "stop":
                suffix = " (stop) ← final"
            elif reason:
                suffix = f" ({reason})"
            else:
                suffix = ""
            emit(
                f" #{mc['num']:<3} {fmt_duration(mc['duration_ms']):>8} "
                f"out={mc['tokens_out']:<6}{suffix}"
            )
        emit("")

    # ---- per-tool breakdown ----------------------------------------------
    if analysis["tool_execs"]:
        stats: Dict[str, Dict] = {}
        for te in analysis["tool_execs"]:
            bucket = stats.setdefault(te["name"], {"count": 0, "total_ms": 0, "errors": 0})
            bucket["count"] += 1
            bucket["total_ms"] += te["duration_ms"]
            if te["is_error"]:
                bucket["errors"] += 1
        emit(" Tool breakdown:")
        # Slowest tools first; ties keep first-seen order.
        ordered = sorted(stats.items(), key=lambda kv: kv[1]["total_ms"], reverse=True)
        for name, info in ordered:
            avg = info["total_ms"] / info["count"] if info["count"] else 0
            err_str = f" ({info['errors']} errors)" if info["errors"] else ""
            label = name + ":"
            emit(
                f" {label:<24} {info['count']} call(s), "
                f"{fmt_duration(info['total_ms'])} total, "
                f"avg {fmt_duration(avg)}{err_str}"
            )
        emit("")

    # ---- trajectory metadata -----------------------------------------------
    if traj_info:
        emit(" Run metadata (from trajectory):")
        emit(f" runId: {traj_info.get('runId', '?')}")
        if traj_info.get("trigger"):
            emit(f" trigger: {traj_info['trigger']}")
        if traj_info.get("context_compilation_ms") is not None:
            emit(f" context compilation: {fmt_duration(traj_info['context_compilation_ms'])}")
        if traj_info.get("model_config"):
            settings = [
                f"{k}={v}" for k, v in traj_info["model_config"].items() if v is not None
            ]
            emit(f" model config: {', '.join(settings)}")
        if traj_info.get("status"):
            emit(f" status: {traj_info['status']}")
        emit("")

    # ---- gateway timing ----------------------------------------------------
    if gw_info:
        emit(" Gateway timing (from log):")
        if "run_to_prompt_ms" in gw_info:
            emit(f" run_start → prompt_start: {fmt_duration(gw_info['run_to_prompt_ms'])} (context compilation)")
        if "prompt_duration_ms" in gw_info:
            emit(f" prompt_start → prompt_end: {fmt_duration(gw_info['prompt_duration_ms'])} (total embedded run)")
        emit("")

    return "\n".join(out)
|
|
617
|
+
|
|
618
|
+
|
|
619
|
+
def format_json(session_id, session_file, user_msg_index, user_msg_id, analysis,
                traj_info=None, gw_info=None):
    """Serialise an analysis as a pretty-printed JSON document.

    The ``trajectory`` and ``gateway`` keys are included only when the
    corresponding argument is non-empty.  Non-ASCII characters are written
    as-is (``ensure_ascii=False``).
    """
    payload = {
        "session_id": session_id,
        "session_file": session_file,
        "user_message_index": user_msg_index,
        "user_message_id": user_msg_id,
        "base_epoch_ms": analysis["base_epoch_ms"],
        "timeline": analysis["events"],
        "model_calls": analysis["model_calls"],
        "tool_execs": analysis["tool_execs"],
        "summary": analysis["summary"],
    }
    for key, extra in (("trajectory", traj_info), ("gateway", gw_info)):
        if extra:
            payload[key] = extra
    return json.dumps(payload, indent=2, ensure_ascii=False)
|
|
633
|
+
|
|
634
|
+
|
|
635
|
+
def main():
    """Command-line entry point: locate a session, trace one message, print the report.

    Steps: parse arguments, find and load the session file, pick the message
    to trace, analyse the records up to the next user message, optionally
    enrich with trajectory and gateway-log data, then emit text or JSON to
    stdout or to the ``--output`` file.

    Exits with status 1 when the session file is missing or has no records.
    The output file is always written as UTF-8 because the report contains
    non-ASCII characters.
    """
    parser = argparse.ArgumentParser(
        description="Trace the processing timeline of a user message in an OpenClaw session.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("session_id", help="Session UUID to trace")
    parser.add_argument("--msg-index", type=int, default=None, help="Nth user message (0-based)")
    parser.add_argument("--msg-id", default=None, help="Message by id field")
    parser.add_argument("--msg-match", default=None, help="First user message containing TEXT")
    parser.add_argument("-o", "--output", default=None, help="Write output to file")
    parser.add_argument("--base-dir", default=DEFAULT_BASE_DIR, help="Agents base directory")
    parser.add_argument("--agent", default=None, help="Limit to specific agent")
    parser.add_argument("--log-dir", default=DEFAULT_LOG_DIR, help="Gateway log directory")
    parser.add_argument("--no-trajectory", action="store_true", help="Skip trajectory enrichment")
    parser.add_argument("--no-log", action="store_true", help="Skip gateway log enrichment")
    parser.add_argument("--json", action="store_true", help="Output as structured JSON")
    args = parser.parse_args()

    session_file = find_session_file(args.session_id, args.base_dir, args.agent)
    if not session_file:
        print(f"Error: no session file found for '{args.session_id}' under {args.base_dir}",
              file=sys.stderr)
        sys.exit(1)

    records = load_records(session_file)
    if not records:
        print(f"Error: session file is empty: {session_file}", file=sys.stderr)
        sys.exit(1)

    # Same candidate list select_user_message uses, needed to report the
    # ordinal of the chosen message among the candidates.
    user_msgs = find_user_messages(records) or find_first_message(records)
    rec_idx, user_rec = select_user_message(records, args.msg_index, args.msg_id, args.msg_match)
    user_msg_ordinal = next(
        (i for i, (ri, _) in enumerate(user_msgs) if ri == rec_idx), 0
    )
    user_msg_id = user_rec.get("id", "?")

    trace = extract_trace_records(records, rec_idx)
    if len(trace) < 2:
        print("Warning: trace contains only the user message (no response found)",
              file=sys.stderr)

    analysis = analyze_phases(trace)

    traj_info = None
    if not args.no_trajectory:
        traj_path = find_trajectory_file(session_file)
        if traj_path:
            traj_info = load_trajectory_info(traj_path, analysis["base_epoch_ms"])

    gw_info = None
    if not args.no_log:
        log_files = find_gateway_logs(args.log_dir)
        if log_files:
            gw_info = load_gateway_timing(log_files, args.session_id, analysis["base_epoch_ms"])

    if args.json:
        out_str = format_json(args.session_id, session_file, user_msg_ordinal,
                              user_msg_id, analysis, traj_info, gw_info)
    else:
        out_str = format_text(args.session_id, user_msg_ordinal, user_msg_id,
                              analysis, traj_info, gw_info)

    if args.output:
        # Explicit UTF-8: the report contains box-drawing characters and arrows
        # that a non-UTF-8 locale encoding cannot represent.
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(out_str + "\n")
        print(f"Trace written to {args.output}", file=sys.stderr)
    else:
        try:
            print(out_str)
        except BrokenPipeError:
            # Reader (e.g. `head`) closed the pipe early; nothing left to do.
            pass
|
|
707
|
+
|
|
708
|
+
|
|
709
|
+
if __name__ == "__main__":
    # Exit quietly on Ctrl-C (status 130) and on a closed output pipe (status 0).
    try:
        main()
    except (KeyboardInterrupt, BrokenPipeError) as exc:
        sys.exit(130 if isinstance(exc, KeyboardInterrupt) else 0)
|