openclaw-diag-cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,246 @@
1
+ #!/usr/bin/env python3
2
+ """模块 5:近期错误日志(应用日志 + journalctl + 工具调用错误)。"""
3
+
4
+ from __future__ import annotations
5
+
6
+ import datetime
7
+ import glob
8
+ import json
9
+ import os
10
+ import re
11
+ import subprocess
12
+ import sys
13
+ from collections import Counter
14
+ from pathlib import Path
15
+ from typing import List
16
+
17
+ sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
18
+
19
+ from ocdiag import cli, output, recent_logs
20
+
21
+
22
# Matches a raw log line whose "logLevelName" field is ERROR or FATAL.
_ERR_RE = re.compile(r'"logLevelName"\s*:\s*"(ERROR|FATAL)"')
# Matches a raw log line that carries an ERROR/WARN level under either the
# "logLevelName" key or the "level" key (the lower-case spellings are listed
# explicitly because this pattern is compiled without IGNORECASE).
_LEVEL_KEY = re.compile(r'"(logLevelName|level)"\s*:\s*"(ERROR|WARN|error|warn)"')
# HTTP-failure markers: a literal "HTTP 4xx/5xx", a JSON "status" field with a
# 4xx/5xx value, or the phrases "rate?limit" / "quota?exceeded" where "?" is
# any single character (the dot is an unescaped regex wildcard).
_HTTP_ERR_RE = re.compile(
    r"HTTP [45][0-9][0-9]|\"status\":\s*(?:4[0-9][0-9]|5[0-9][0-9])|"
    r"rate.limit|quota.exceeded",
    re.IGNORECASE,
)
# Lines to exclude from the API-error list: records whose "subsystem" is
# "tools" or "agent/embedded", or that contain the text "allowlist contains".
_API_EXCLUDE_SUB_RE = re.compile(
    r'"subsystem":\s*"(tools|agent/embedded)"|allowlist contains',
    re.IGNORECASE,
)
# Further free-text exclusions applied to the API-error list.
_API_EXCLUDE_TXT_RE = re.compile(r"embedded run agent|agent end|agent start", re.IGNORECASE)
# Plain-text log line of the form "[<ISO-8601 timestamp>...] <rest>";
# group 1 is the bracketed timestamp text, group 2 is the remainder.
_TS_RE = re.compile(r"\[(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}[^\]]*)\]\s*(.*)")
# A "[{... "subsystem" ...}]" fragment (plus surrounding whitespace) embedded
# in a plain-text line; used to strip that fragment before display.
_SUBSYSTEM_STRIP_RE = re.compile(r'\s*\[\{[^}]*"subsystem"[^}]*\}\]\s*')
36
+
37
+
38
def extract_msg(obj: dict) -> "str | None":
    """Build a one-line message from the text fields of a parsed log record.

    The keys "0", "1", "2", "msg" and "message" are read in that order.
    Non-string and empty values are ignored.  A value that is itself a JSON
    object is flattened to ``key=value`` pairs with the "subsystem" key
    removed; an embedded object that holds nothing but "subsystem" adds
    nothing to the message.

    Args:
        obj: Parsed log record (a mapping).

    Returns:
        The collected parts joined by single spaces, or ``None`` when no
        field contributed any text.
    """
    parts = []
    for key in ("0", "1", "2", "msg", "message"):
        value = obj.get(key, "")
        if not value or not isinstance(value, str):
            continue
        if value.startswith("{"):
            # Only the parse itself is guarded: anything that merely looks
            # like JSON but is not falls through and is kept verbatim.
            try:
                inner = json.loads(value)
            except (ValueError, RecursionError):
                inner = None
            if isinstance(inner, dict):
                meaningful = {k: v for k, v in inner.items() if k != "subsystem"}
                if meaningful:
                    parts.append(" ".join(f"{k}={v}" for k, v in meaningful.items()))
                # Embedded objects never fall through to the raw append below.
                continue
        parts.append(value)
    return " ".join(parts) if parts else None
56
+
57
+
58
def render_log_line(line: str, max_len: int = 300) -> str:
    """Turn one raw log line into a short, human-readable line.

    JSON object records are rendered as ``[<time>] <level>: <message>``.
    Anything else (plain text, or JSON that is not an object) is treated as
    text: the embedded subsystem fragment is stripped and a leading
    bracketed timestamp is shortened to 19 characters.

    Args:
        line: Raw log line; surrounding whitespace is ignored.
        max_len: Maximum length of the message (JSON records) or of the whole
            line (plain text) before it is cut and suffixed with "...".

    Returns:
        The rendered line, or "" when ``line`` is blank.
    """
    line = line.strip()
    if not line:
        return ""

    # Guard only the parse; shape problems are handled explicitly below
    # instead of being funnelled through a blanket exception handler.
    try:
        obj = json.loads(line)
    except (ValueError, RecursionError):
        obj = None

    if isinstance(obj, dict):
        raw_ts = obj.get("time", "")
        ts = (raw_ts if isinstance(raw_ts, str) else str(raw_ts))[:19]

        msg = extract_msg(obj)
        if not msg:
            # No text field: show the record itself minus the noisy keys.
            msg = str({k: v for k, v in obj.items() if k not in ("_meta", "time")})
        if len(msg) > max_len:
            msg = msg[:max_len] + "..."

        meta = obj.get("_meta")
        if not isinstance(meta, dict):
            meta = {}
        if "logLevelName" in meta:
            level = meta["logLevelName"]
        else:
            # Records may carry the level at top level under "level" (the
            # form _LEVEL_KEY also accepts); use it before assuming ERROR so
            # warnings are not mislabelled.
            top_level = obj.get("level")
            level = top_level.upper() if isinstance(top_level, str) and top_level else "ERROR"
        return f"[{ts}] {level}: {msg}"

    # Plain-text path.
    line = _SUBSYSTEM_STRIP_RE.sub(" ", line).strip()
    m = _TS_RE.match(line)
    if m:
        line = f"[{m.group(1)[:19]}] {m.group(2)}"
    if len(line) > max_len:
        line = line[:max_len] + "..."
    return line
80
+
81
+
82
def collect_error_lines(log_files: List[str]) -> List[str]:
    """Return every line in ``log_files`` that matches ``_ERR_RE``.

    Args:
        log_files: Paths of the log files to scan, in order.

    Returns:
        Matching lines in file order with the trailing newline removed.
        Files that cannot be opened or read are skipped.
    """
    out: List[str] = []
    for lf in log_files:
        try:
            # Decode as UTF-8 explicitly so the result does not depend on the
            # locale's default encoding; undecodable bytes are replaced
            # rather than aborting the scan.
            with open(lf, encoding="utf-8", errors="replace") as f:
                for ln in f:
                    if _ERR_RE.search(ln):
                        out.append(ln.rstrip("\n"))
        except OSError:
            # Best effort: an unreadable file must not stop the diagnosis.
            continue
    return out
93
+
94
+
95
def collect_api_errors(log_files: List[str]) -> List[str]:
    """Return log lines that look like HTTP/API failures.

    A line is kept when it matches both ``_LEVEL_KEY`` and ``_HTTP_ERR_RE``
    and matches neither ``_API_EXCLUDE_SUB_RE`` nor ``_API_EXCLUDE_TXT_RE``.

    Args:
        log_files: Paths of the log files to scan, in order.

    Returns:
        Matching lines in file order with the trailing newline removed.
        Files that cannot be opened or read are skipped.
    """
    out: List[str] = []
    for lf in log_files:
        try:
            # Decode as UTF-8 explicitly so the result does not depend on the
            # locale's default encoding; undecodable bytes are replaced
            # rather than aborting the scan.
            with open(lf, encoding="utf-8", errors="replace") as f:
                for ln in f:
                    if not _LEVEL_KEY.search(ln):
                        continue
                    if not _HTTP_ERR_RE.search(ln):
                        continue
                    if _API_EXCLUDE_SUB_RE.search(ln):
                        continue
                    if _API_EXCLUDE_TXT_RE.search(ln):
                        continue
                    out.append(ln.rstrip("\n"))
        except OSError:
            # Best effort: an unreadable file must not stop the diagnosis.
            continue
    return out
113
+
114
+
115
def journalctl_errors() -> str:
    """Return today's error-priority journal output for the gateway unit.

    Runs ``journalctl --user -u openclaw-gateway --since today --priority err
    --no-pager`` with a 10 second timeout.

    Returns:
        The command's standard output, or "" when journalctl is missing,
        cannot be started, or times out.  The exit status is not checked.
    """
    try:
        r = subprocess.run(
            ["journalctl", "--user", "-u", "openclaw-gateway",
             "--since", "today", "--priority", "err", "--no-pager"],
            capture_output=True,
            text=True,
            # Journal entries may contain bytes that are invalid in the
            # locale encoding; replace them instead of letting a
            # UnicodeDecodeError escape and abort the whole module.
            errors="replace",
            timeout=10,
            check=False,
        )
    except (subprocess.TimeoutExpired, OSError):
        # OSError also covers FileNotFoundError (journalctl not installed).
        return ""
    return r.stdout or ""
125
+
126
+
127
def find_recent_session(sessions_base: str):
    """Locate the most recently modified ``*.jsonl`` file under a sessions root.

    Only files that sit at least one directory below ``sessions_base`` are
    considered (the glob is ``<base>/*/**/*.jsonl``, searched recursively).

    Returns:
        The path with the greatest modification time, or ``None`` when
        ``sessions_base`` is not a directory or no candidate could be stat'ed.
    """
    if not os.path.isdir(sessions_base):
        return None

    pattern = os.path.join(sessions_base, "*", "**", "*.jsonl")
    newest_path, newest_stamp = None, -1.0
    for candidate in glob.glob(pattern, recursive=True):
        try:
            stamp = os.path.getmtime(candidate)
        except OSError:
            # File vanished or is unreadable between listing and stat.
            continue
        if stamp <= newest_stamp:
            continue
        newest_path, newest_stamp = candidate, stamp
    return newest_path
141
+
142
+
143
def tool_errors_from_session(session_path: str) -> Counter:
    """Count failed tool calls in the last 500 lines of a session file.

    Each line is expected to be a JSON object.  A line counts as a tool
    error when its "message" value is an object with a truthy "isError";
    it is tallied under that object's "toolName" ("unknown" when absent).
    Lines that are not valid JSON or do not have this shape are ignored.

    Args:
        session_path: Path of the ``.jsonl`` session file.

    Returns:
        A ``Counter`` mapping tool name to error count; empty when the file
        cannot be read.
    """
    from collections import deque  # local import: only needed here

    counts = Counter()
    try:
        # A bounded deque keeps just the trailing 500 lines while streaming,
        # so a large session file is never held in memory in full.
        with open(session_path, encoding="utf-8", errors="replace") as f:
            tail = deque(f, maxlen=500)
    except OSError:
        return counts

    for line in tail:
        try:
            obj = json.loads(line)
        except (ValueError, RecursionError):
            continue
        if not isinstance(obj, dict):
            continue
        msg = obj.get("message") or {}
        if not isinstance(msg, dict) or not msg.get("isError"):
            continue
        try:
            counts[msg.get("toolName", "unknown")] += 1
        except TypeError:
            # Unhashable toolName (e.g. a list): skip the record.
            continue
    return counts
160
+
161
+
162
def main() -> int:
    """Run module 5: collect recent errors and report them through ``out``.

    Four sources are reported in order: the log files returned by
    ``recent_logs.discover_recent_logs``, ERROR/FATAL lines in those files,
    HTTP/API error lines in those files, ``journalctl`` output, and tool-call
    errors in the most recently modified session file.

    Returns:
        Whatever ``out.done()`` returns; the script entry point passes it to
        ``sys.exit``.
    """
    parser = cli.build_common_parser(
        description="模块 5:采集近期错误日志",
        prog="05_recent_errors",
    )
    args = parser.parse_args()

    out = output.init("recent_errors", json_mode=args.json, no_color=args.no_color)
    out.section("模块 5:近期日志")

    # --- 1. Which log files are being scanned ---------------------------
    logs = recent_logs.discover_recent_logs(args.log_dir)
    out.set_data("scanned_logs", [os.path.basename(p) for p in logs])

    if logs:
        out.item(f"今日有更新的日志文件 ({len(logs)} 个):")
        for lf in logs:
            try:
                ts = os.path.getmtime(lf)
                ts_str = datetime.datetime.fromtimestamp(ts).strftime("%H:%M:%S")
            except OSError:
                # File disappeared or is unreadable: still list it.
                ts_str = "?"
            out.item(f"  {os.path.basename(lf)} (mtime: {ts_str})")
    else:
        out.item("今日无更新的日志文件")

    out.line("")

    if logs:
        # --- 2. ERROR/FATAL lines from the application logs -------------
        err_lines = collect_error_lines(logs)
        out.set_data("app_error_count", len(err_lines))
        if err_lines:
            out.item(f"应用日志 ERROR 级别: {len(err_lines)} 条 — Gateway 运行时报错,包括工具失败、模型异常等")
            rendered = []
            # Only the first 100 lines are rendered as evidence.
            for ln in err_lines[:100]:
                r = render_log_line(ln, 300)
                if r:
                    rendered.append(r)
            if len(err_lines) > 100:
                # Trailer showing the full count when the list was cut.
                rendered.append(f"... 共 {len(err_lines)} 条")
            out.evidence("近期日志", "\n".join(rendered))
        else:
            out.item("应用日志 ERROR 级别: 0 条 — Gateway 运行时报错")

        # --- 3. HTTP/API error lines from the same logs -----------------
        api_lines = collect_api_errors(logs)
        out.set_data("api_error_count", len(api_lines))
        if api_lines:
            out.item(f"模型 API HTTP 错误: {len(api_lines)} 条 ")
            rendered = []
            # First 100 lines only, with a longer per-line limit (500).
            for ln in api_lines[:100]:
                r = render_log_line(ln, 500)
                if r:
                    rendered.append(r)
            out.evidence("近期日志", "\n".join(rendered))
    else:
        out.item("应用日志未找到(今日无更新的日志文件)")

    # --- 4. journalctl --------------------------------------------------
    journal_out = journalctl_errors()
    # Output containing "No entries"/"no entries" is treated as empty.
    if journal_out and "No entries" not in journal_out and "no entries" not in journal_out:
        lines = journal_out.splitlines()[:50]
        if lines:
            out.item("Journalctl ERROR 级别:")
            out.evidence("journalctl --priority err", "\n".join(lines))
            # NOTE: this is the number of lines kept (at most 50), not the
            # total number of journal lines.
            out.set_data("journalctl_errors", len(lines))
    else:
        out.item("Journalctl ERROR: 0 条 — 系统级进程错误")
        out.set_data("journalctl_errors", 0)

    # --- 5. Tool-call errors in the latest session file -----------------
    recent_session = find_recent_session(args.sessions_base)
    if recent_session:
        counts = tool_errors_from_session(recent_session)
        total = sum(counts.values())
        out.item(f"最近 Session 的工具调用错误: {total} — 工具返回 error 的次数,过多说明某个工具持续异常")
        out.set_data("session_tool_error_count", total)
        if total > 0:
            # Evidence lists the ten most frequent failing tools as name:count.
            detail = "; ".join(f"{n}:{c}" for n, c in counts.most_common(10))
            out.evidence(os.path.basename(recent_session), detail)
            out.set_data("session_tool_errors", dict(counts))
    else:
        out.item("未找到 Session 文件,跳过工具调用检查")

    return out.done()
243
+
244
+
245
# Script entry point: main()'s return value is handed to sys.exit.
if __name__ == "__main__":
    sys.exit(main())