openclaw-agent-dashboard 1.0.43 → 1.0.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,9 +3,12 @@
3
3
  支持按分钟查看调用详情,便于分析调用瓶颈
4
4
  """
5
5
  from fastapi import APIRouter
6
- from typing import List, Dict, Any, Optional
6
+ from typing import List, Dict, Any, Optional, Tuple
7
+ import copy
7
8
  import json
8
9
  import re
10
+ import asyncio
11
+ import time
9
12
  from pathlib import Path
10
13
  from datetime import datetime, timedelta, timezone
11
14
  from zoneinfo import ZoneInfo
@@ -19,6 +22,31 @@ TZ_DISPLAY = ZoneInfo('Asia/Shanghai')
19
22
 
20
23
  router = APIRouter()
21
24
 
25
+ # 聚合统计多次并发请求(WS + 轮询 + 多标签)共用;TTL 短以保证大致实时
26
+ _perf_stats_cache: Dict[str, Tuple[float, Dict[str, Any]]] = {}
27
+ _PERF_STATS_CACHE_TTL_SEC = 12.0
28
+
29
+ # 柱体钻取:多次点击 / 并发标签共用短缓存
30
+ _perf_details_cache: Dict[str, Tuple[float, Dict[str, Any]]] = {}
31
+ _PERF_DETAILS_CACHE_TTL_SEC = 12.0
32
+
33
# Cheap pre-parse of the ISO timestamp in a JSONL line so that lines clearly
# older than the query window can be skipped without json.loads + schema checks.
_QUICK_ENV_TS_RE = re.compile(r'"timestamp"\s*:\s*"([^"]+)"')


def _quick_envelope_timestamp_utc(line: str) -> Optional[datetime]:
    """Extract a timezone-aware timestamp from a raw JSONL line, if possible.

    Only the first ``"timestamp": "..."`` pair found by the regex is examined;
    the line is not JSON-decoded.

    Args:
        line: One raw line of a session ``.jsonl`` file.

    Returns:
        The parsed timezone-aware ``datetime``, or ``None`` when the line has
        no timestamp string, the value is not ISO-8601 parseable, or the value
        carries no UTC offset. Returning ``None`` means "cannot decide
        cheaply", so the caller should fall back to the full parse.
    """
    m = _QUICK_ENV_TS_RE.search(line)
    if not m:
        return None
    try:
        # A trailing "Z" is not accepted by fromisoformat on older Pythons.
        parsed = datetime.fromisoformat(m.group(1).replace("Z", "+00:00"))
    except ValueError:
        return None
    # A naive datetime cannot be ordered against the aware window boundary
    # (it would raise TypeError), so treat it as "unknown".
    if parsed.tzinfo is None or parsed.utcoffset() is None:
        return None
    return parsed
45
+
46
+
47
def _perf_cache_key(range_minutes: int, range_hours: int, granularity: str) -> str:
    """Build the stats-cache key from the three query parameters, joined by ':'."""
    parts = (range_minutes, range_hours, granularity)
    return ":".join(str(part) for part in parts)
49
+
22
50
 
23
51
  def _extract_trigger_text(msg: Dict) -> str:
24
52
  """从消息中提取触发内容(完整展示)"""
@@ -177,11 +205,25 @@ def parse_session_file(session_path: Path, range_hours: int = 1) -> List[Dict]:
177
205
  range_hours: 时间范围(小时),0 表示不限制
178
206
  """
179
207
  messages = []
208
+ now = datetime.now(timezone.utc)
209
+ time_ago = now - timedelta(hours=range_hours) if range_hours > 0 else None
210
+
211
+ # 启发式:窗口内若有 assistant usage,文件通常在窗口内有过写入;过久未修改则可跳过整文件
212
+ if time_ago is not None:
213
+ try:
214
+ if session_path.stat().st_mtime < time_ago.timestamp():
215
+ return []
216
+ except OSError:
217
+ return []
180
218
 
181
219
  try:
182
220
  with open(session_path, 'r', encoding='utf-8') as f:
183
221
  for line in f:
184
222
  try:
223
+ if time_ago is not None:
224
+ qt = _quick_envelope_timestamp_utc(line)
225
+ if qt is not None and qt < time_ago:
226
+ continue
185
227
  envelope, msg = parse_session_jsonl_line(line)
186
228
  if (
187
229
  not envelope
@@ -200,11 +242,8 @@ def parse_session_file(session_path: Path, range_hours: int = 1) -> List[Dict]:
200
242
  str(envelope['timestamp']).replace('Z', '+00:00')
201
243
  )
202
244
 
203
- if range_hours > 0:
204
- now = datetime.now(timezone.utc)
205
- time_ago = now - timedelta(hours=range_hours)
206
- if timestamp < time_ago:
207
- continue
245
+ if time_ago is not None and timestamp < time_ago:
246
+ continue
208
247
 
209
248
  messages.append({
210
249
  'timestamp': timestamp,
@@ -240,14 +279,8 @@ async def get_performance_stats(range: str = "20m"):
240
279
  return stats
241
280
 
242
281
 
243
- async def get_real_stats(range_minutes: int = 20, range_hours: int = 1, granularity: str = "minute") -> Dict:
244
- """获取真实的 TPM/RPM 统计
245
-
246
- Args:
247
- range_minutes: 时间范围(分钟)
248
- range_hours: 用于解析 session 的时间范围(小时)
249
- granularity: 聚合粒度 (minute, hour)
250
- """
282
+ def _compute_real_stats_sync(range_minutes: int = 20, range_hours: int = 1, granularity: str = "minute") -> Dict:
283
+ """同步聚合 TPM/RPM(在线程池中运行,避免阻塞事件循环)。"""
251
284
  stats = {
252
285
  'current': {
253
286
  'tpm': 0,
@@ -381,24 +414,38 @@ async def get_real_stats(range_minutes: int = 20, range_hours: int = 1, granular
381
414
  return stats
382
415
 
383
416
 
384
- async def get_minute_details(
417
async def get_real_stats(range_minutes: int = 20, range_hours: int = 1, granularity: str = "minute") -> Dict:
    """Return TPM/RPM statistics, computed in a worker thread and cached briefly.

    Args:
        range_minutes: Time range in minutes.
        range_hours: Time range in hours, passed through to the aggregation.
        granularity: Aggregation granularity, passed through to the aggregation.

    Returns:
        The statistics dict produced by ``_compute_real_stats_sync``. A cached
        result is served while it is younger than ``_PERF_STATS_CACHE_TTL_SEC``.
        Callers always receive their own copy, so mutating the result cannot
        corrupt what later requests read from the cache.
    """
    key = _perf_cache_key(range_minutes, range_hours, granularity)
    now = time.monotonic()
    hit = _perf_stats_cache.get(key)
    if hit is not None and (now - hit[0]) < _PERF_STATS_CACHE_TTL_SEC:
        # Hand out a copy: the cached dict is shared by all requests.
        return copy.deepcopy(hit[1])
    data = await asyncio.to_thread(_compute_real_stats_sync, range_minutes, range_hours, granularity)
    # Store a private copy so the object returned below stays caller-owned.
    _perf_stats_cache[key] = (now, copy.deepcopy(data))
    return data
427
+
428
+
429
def _perf_details_cache_key(
    timestamp_ms: int,
    granularity: str,
    agent: str,
    search: str,
    sort: str,
    limit: int,
) -> str:
    """Build an unambiguous cache key for a drill-down query.

    The key is the ``repr`` of the argument tuple rather than a ':'-joined
    string: ``agent`` and ``search`` are free text and may themselves contain
    ':', which would let two different queries map to the same key.

    Returns:
        A string that is equal for equal arguments and distinct otherwise.
    """
    return repr((timestamp_ms, granularity, agent, search, sort, limit))
438
+
439
+
440
+ def _compute_minute_details_sync(
385
441
  timestamp_ms: int,
386
442
  granularity: str = "minute",
387
443
  agent: Optional[str] = None,
388
444
  search: Optional[str] = None,
389
445
  sort: str = "tokens_desc",
390
- limit: int = 50
446
+ limit: int = 50,
391
447
  ) -> Dict[str, Any]:
392
- """获取指定时间窗口的调用详情,用于柱体点击钻取。时间展示使用 Asia/Shanghai 时区
393
-
394
- Args:
395
- timestamp_ms: Unix 毫秒时间戳
396
- granularity: 粒度 (minute, hour)
397
- agent: 筛选指定 Agent
398
- search: 搜索触发内容
399
- sort: 排序方式 (tokens_desc, tokens_asc, time_asc, time_desc)
400
- limit: 返回数量限制
401
- """
448
+ """同步聚合柱体钻取数据(线程池 + 短 TTL 缓存)。"""
402
449
  try:
403
450
  ts = datetime.fromtimestamp(timestamp_ms / 1000, tz=timezone.utc)
404
451
  ts_local = ts.astimezone(TZ_DISPLAY)
@@ -419,6 +466,7 @@ async def get_minute_details(
419
466
 
420
467
  all_calls = []
421
468
  agent_set = set()
469
+ window_start_ts = time_start.timestamp()
422
470
 
423
471
  for agent_dir in agents_path.iterdir():
424
472
  if not agent_dir.is_dir():
@@ -437,6 +485,12 @@ async def get_minute_details(
437
485
  for session_file in sessions_path.glob('*.jsonl'):
438
486
  if 'lock' in session_file.name or 'deleted' in session_file.name:
439
487
  continue
488
+ try:
489
+ # 与 parse_session_file 相同启发式:窗口开始后未修改的文件不可能含该窗内的 assistant 记录
490
+ if session_file.stat().st_mtime < window_start_ts:
491
+ continue
492
+ except OSError:
493
+ continue
440
494
  records = parse_session_file_with_details(session_file, agent_id)
441
495
  for r in records:
442
496
  if time_start <= r['timestamp'] < time_end:
@@ -503,6 +557,44 @@ async def get_minute_details(
503
557
  return {'timeWindow': '', 'calls': [], 'totalCalls': 0, 'totalTokens': 0, 'summary': {'avgTokens': 0}, 'agents': [], 'pagination': {'total': 0, 'limit': limit, 'hasMore': False}}
504
558
 
505
559
 
560
async def get_minute_details(
    timestamp_ms: int,
    granularity: str = "minute",
    agent: Optional[str] = None,
    search: Optional[str] = None,
    sort: str = "tokens_desc",
    limit: int = 50
) -> Dict[str, Any]:
    """Return call details for one time window (chart-bar drill-down).

    The aggregation itself runs in a worker thread via
    ``_compute_minute_details_sync``; results are cached for
    ``_PERF_DETAILS_CACHE_TTL_SEC`` seconds per distinct query.

    Args:
        timestamp_ms: Unix timestamp in milliseconds.
        granularity: Bucket granularity (minute, hour).
        agent: Restrict results to this agent; ``None`` and ``""`` share a cache entry.
        search: Filter on trigger content; ``None`` and ``""`` share a cache entry.
        sort: Sort order (tokens_desc, tokens_asc, time_asc, time_desc).
        limit: Maximum number of calls to return.

    Returns:
        The details dict from ``_compute_minute_details_sync``. Callers always
        get their own copy, never the cached object.
    """
    ag = agent or ""
    sr = search or ""
    key = _perf_details_cache_key(timestamp_ms, granularity, ag, sr, sort, limit)
    now = time.monotonic()
    hit = _perf_details_cache.get(key)
    if hit is not None and (now - hit[0]) < _PERF_DETAILS_CACHE_TTL_SEC:
        return copy.deepcopy(hit[1])
    data = await asyncio.to_thread(
        _compute_minute_details_sync,
        timestamp_ms,
        granularity,
        agent,
        search,
        sort,
        limit,
    )
    # Keys include the clicked timestamp and free-text search, so the key space
    # is unbounded; drop expired entries on insert to keep the cache from
    # growing for the lifetime of the process.
    expired = [
        cached_key
        for cached_key, (stored_at, _) in _perf_details_cache.items()
        if (now - stored_at) >= _PERF_DETAILS_CACHE_TTL_SEC
    ]
    for cached_key in expired:
        _perf_details_cache.pop(cached_key, None)
    _perf_details_cache[key] = (now, copy.deepcopy(data))
    return data
596
+
597
+
506
598
  @router.get("/performance/details")
507
599
  async def get_performance_details(
508
600
  timestamp: int,
@@ -1,11 +1,13 @@
1
1
  """
2
2
  Timeline API 路由 - 实时执行时序图
3
3
  """
4
+ import asyncio
5
+ import copy
4
6
  import logging
5
7
  import time
6
8
  from fastapi import APIRouter, Query, HTTPException
7
9
  from pydantic import BaseModel
8
- from typing import Optional, List, Dict, Any
10
+ from typing import Optional, List, Dict, Any, Tuple
9
11
  import sys
10
12
  from pathlib import Path
11
13
 
@@ -20,6 +22,14 @@ from data.config_reader import get_agent_config
20
22
 
21
23
  router = APIRouter()
22
24
 
25
+ # 切换 agent / 轮询重复命中时减轻重复读盘解析(短时 stale 可接受)
26
+ _timeline_cache: Dict[str, Tuple[float, Dict[str, Any]]] = {}
27
+ _TIMELINE_CACHE_TTL_SEC = 5.0
28
+
29
+
30
def _timeline_cache_key(agent_id: str, session_key: Optional[str], limit: int) -> str:
    """Build the timeline-cache key: the three fields joined by NUL bytes.

    A missing ``session_key`` is represented by an empty string.
    """
    fields = (agent_id, session_key or '', limit)
    return "\x00".join(map(str, fields))
32
+
23
33
 
24
34
  class TimelineStats(BaseModel):
25
35
  totalDuration: int
@@ -80,11 +90,18 @@ async def get_timeline(
80
90
  raise HTTPException(status_code=404, detail=f"Agent {agent_id} not found")
81
91
 
82
92
  t0 = time.perf_counter()
83
- try:
84
- result = get_timeline_steps(agent_id, session_key, limit)
85
- except Exception as e:
86
- record_error("unknown", str(e), "api:timeline:get", exc=e)
87
- raise HTTPException(status_code=500, detail=safe_api_error_detail(e)) from e
93
+ cache_key = _timeline_cache_key(agent_id, session_key, limit)
94
+ now_mono = time.monotonic()
95
+ hit = _timeline_cache.get(cache_key)
96
+ if hit is not None and (now_mono - hit[0]) < _TIMELINE_CACHE_TTL_SEC:
97
+ result = copy.deepcopy(hit[1])
98
+ else:
99
+ try:
100
+ result = await asyncio.to_thread(get_timeline_steps, agent_id, session_key, limit)
101
+ except Exception as e:
102
+ record_error("unknown", str(e), "api:timeline:get", exc=e)
103
+ raise HTTPException(status_code=500, detail=safe_api_error_detail(e)) from e
104
+ _timeline_cache[cache_key] = (now_mono, copy.deepcopy(result))
88
105
  elapsed_ms = (time.perf_counter() - t0) * 1000
89
106
  steps_count = len(result.get("steps", []))
90
107
  if elapsed_ms >= 100.0:
@@ -127,7 +144,7 @@ async def get_timeline_steps_only(
127
144
  raise HTTPException(status_code=404, detail=f"Agent {agent_id} not found")
128
145
 
129
146
  try:
130
- result = get_timeline_steps(agent_id, session_key, limit)
147
+ result = await asyncio.to_thread(get_timeline_steps, agent_id, session_key, limit)
131
148
  except Exception as e:
132
149
  record_error("unknown", str(e), "api:timeline:steps", exc=e)
133
150
  raise HTTPException(status_code=500, detail=safe_api_error_detail(e)) from e
@@ -153,7 +170,7 @@ async def get_timeline_summary(agent_id: str, session_key: Optional[str] = Query
153
170
  raise HTTPException(status_code=404, detail=f"Agent {agent_id} not found")
154
171
 
155
172
  try:
156
- result = get_timeline_steps(agent_id, session_key, limit=10) # 只需基本信息
173
+ result = await asyncio.to_thread(get_timeline_steps, agent_id, session_key, 10) # 只需基本信息
157
174
  except Exception as e:
158
175
  record_error("unknown", str(e), "api:timeline:summary", exc=e)
159
176
  raise HTTPException(status_code=500, detail=safe_api_error_detail(e)) from e
@@ -7,6 +7,7 @@ from typing import Set, List, Dict, Any
7
7
  import json
8
8
  import asyncio
9
9
  import sys
10
+ import time
10
11
  from pathlib import Path
11
12
 
12
13
  sys.path.append(str(Path(__file__).parent.parent))
@@ -18,30 +19,46 @@ router = APIRouter()
18
19
  # 活跃的 WebSocket 连接
19
20
  active_connections: Set[WebSocket] = set()
20
21
 
21
- # 周期性推送间隔(秒)- 优化:从 3 秒缩短到 1 秒
22
- BROADCAST_INTERVAL_SEC = 1
22
+ # 周期性增量检查基准间隔(秒);空闲时会自动退避拉长(见 _periodic_broadcast_loop)
23
+ BROADCAST_INTERVAL_SEC = 5
23
24
  _broadcast_task: asyncio.Task | None = None
25
+ _broadcast_sleep_sec: float = float(BROADCAST_INTERVAL_SEC)
26
+ _broadcast_idle_streak: int = 0
27
+
28
+ # 文件监听等高频触发下合并 full_state,降低前端解析与重绘压力
29
+ FULL_STATE_MIN_INTERVAL_SEC = 2.0
30
+ _last_full_state_monotonic: float = 0.0
24
31
 
25
32
 
26
33
async def _periodic_broadcast_loop():
    """Periodically push incremental agent-state updates to connected clients.

    Each cycle sleeps for ``_broadcast_sleep_sec``. When changed agents are
    found, the interval is reset to ``BROADCAST_INTERVAL_SEC`` and the changes
    are broadcast. From the third consecutive cycle without changes onward,
    the interval doubles per idle cycle, capped at 30 seconds.
    """
    global _broadcast_sleep_sec, _broadcast_idle_streak
    while True:
        await asyncio.sleep(_broadcast_sleep_sec)
        if not active_connections:
            # Nobody is listening; skip the check entirely this cycle.
            continue
        try:
            from status.status_calculator import get_changed_agents
            changed_agents = await get_changed_agents()
            if not changed_agents:
                _broadcast_idle_streak += 1
                if _broadcast_idle_streak >= 3:
                    _broadcast_sleep_sec = min(_broadcast_sleep_sec * 2.0, 30.0)
            else:
                _broadcast_idle_streak = 0
                _broadcast_sleep_sec = float(BROADCAST_INTERVAL_SEC)
                await broadcast_state_update(changed_agents)
        except asyncio.CancelledError:
            # Let task cancellation propagate instead of being logged as an error.
            raise
        except Exception as e:
            record_error("unknown", str(e), "websocket:periodic_broadcast", exc=e)
39
54
 
40
55
 
41
56
def _ensure_broadcast_task():
    """Start the periodic broadcast task if there are connections and none is running.

    Starting a new task also resets the back-off state (sleep interval and
    idle streak) to their initial values.
    """
    global _broadcast_task, _broadcast_sleep_sec, _broadcast_idle_streak
    if not active_connections:
        return
    task_running = _broadcast_task is not None and not _broadcast_task.done()
    if task_running:
        return
    _broadcast_sleep_sec = float(BROADCAST_INTERVAL_SEC)
    _broadcast_idle_streak = 0
    _broadcast_task = asyncio.create_task(_periodic_broadcast_loop())
46
63
 
47
64
 
@@ -210,9 +227,15 @@ async def broadcast_full_state():
210
227
  优化点:
211
228
  1. 使用 get_collaboration_dynamic() 代替 get_collaboration()
212
229
  2. 只推送动态数据,减少数据量
230
+ 3. 短时间重复调用节流,避免监听线程连震时频繁全量推送
213
231
  """
232
+ global _last_full_state_monotonic
214
233
  if not active_connections:
215
234
  return
235
+ now = time.monotonic()
236
+ if now - _last_full_state_monotonic < FULL_STATE_MIN_INTERVAL_SEC:
237
+ return
238
+ _last_full_state_monotonic = now
216
239
  try:
217
240
  from .agents import get_agents as get_agents_list
218
241
  from .subagents import get_subagents
@@ -25,19 +25,20 @@ class SchemaValidator:
25
25
  self.schema = schema
26
26
  self.strict = strict
27
27
  self._validator = Draft202012Validator(schema)
28
- self._last_errors: List[str] = []
29
28
 
30
29
  def validate(self, data: Any) -> ValidationResult:
31
- self._last_errors = []
30
+ """线程安全:校验结果仅通过返回值给出,实例上不保留最后一次错误(避免并发覆盖)。"""
31
+ errors: List[str] = []
32
32
  if not isinstance(data, (dict, list)) and self.schema.get("type") == "object":
33
- self._last_errors.append("expected object")
34
- return ValidationResult(False, list(self._last_errors))
33
+ errors.append("expected object")
34
+ return ValidationResult(False, errors)
35
35
  try:
36
36
  self._validator.validate(data)
37
37
  return ValidationResult(True, [])
38
38
  except jsonschema.ValidationError as e:
39
- self._last_errors.append(e.message)
40
- return ValidationResult(False, list(self._last_errors))
39
+ errors.append(e.message)
40
+ return ValidationResult(False, errors)
41
41
 
42
42
  def get_error_details(self) -> Dict[str, Any]:
43
- return {"errors": list(self._last_errors)}
43
+ """兼容旧接口;共享校验器实例时不代表「最后一次校验」。请使用 validate() 的返回值。"""
44
+ return {"errors": []}
@@ -152,22 +152,32 @@ def get_models_configured_by_agents() -> List[str]:
152
152
  """
153
153
  从配置中收集「各 Agent 实际配置使用」的模型 ID(仅 primary + fallbacks)。
154
154
  用于协作流程右侧模型面板:只显示有 Agent 配置的模型,不含白名单中未使用的。
155
+
156
+ 策略:仅包含作为 primary 使用、或被某 agent 配置过的模型。
157
+ 不包含 defaults.model.fallbacks 中没有任何 agent 当 primary 使用的模型。
155
158
  """
156
159
  agents = get_agents_list()
157
- model_ids = set()
158
- defaults = get_default_config()
159
- default_model = defaults.get('model', {})
160
- if default_model.get('primary'):
161
- model_ids.add(default_model['primary'])
162
- for fb in default_model.get('fallbacks') or []:
163
- model_ids.add(fb)
160
+ # Step 1: 收集所有 primary 模型(用于判断 fallback 是否被实际使用)
161
+ primaries: List[str] = []
164
162
  for agent in agents:
165
163
  cfg = get_agent_models(agent.get('id', ''))
166
164
  if cfg.get('primary'):
167
- model_ids.add(cfg['primary'])
165
+ primaries.append(cfg['primary'])
166
+ primary_set = set(primaries)
167
+
168
+ # Step 2: 收集所有 primary
169
+ model_ids: List[str] = list(dict.fromkeys(primaries)) # 保持顺序去重
170
+
171
+ # Step 3: 只添加被某 agent 实际配置过的 fallback(不被 primary_set 包含的不添加)
172
+ seen = set(primary_set)
173
+ for agent in agents:
174
+ cfg = get_agent_models(agent.get('id', ''))
168
175
  for fb in cfg.get('fallbacks', []):
169
- model_ids.add(fb)
170
- return sorted(model_ids)
176
+ if fb and fb not in seen:
177
+ model_ids.append(fb)
178
+ seen.add(fb)
179
+
180
+ return model_ids
171
181
 
172
182
 
173
183
  def get_all_models_from_agents() -> List[str]:
@@ -187,6 +197,23 @@ def get_all_models_from_agents() -> List[str]:
187
197
  return sorted(model_ids)
188
198
 
189
199
 
200
def get_default_models_from_defaults() -> List[str]:
    """Return the models named by the default config's ``model`` section.

    The result is ``model.primary`` followed by ``model.fallbacks``, in
    configuration order, without duplicates. Empty or missing entries are
    skipped, and a missing or null ``model`` section yields an empty list.
    """
    defaults = get_default_config()
    # `or {}` also covers an explicit `model: null`, not just a missing key.
    default_model = defaults.get('model') or {}
    candidates = [default_model.get('primary'), *(default_model.get('fallbacks') or [])]
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(model_id for model_id in candidates if model_id))
215
+
216
+
190
217
  def get_model_display_name(model_id: str) -> str:
191
218
  """获取模型显示名。展示策略:使用 id 不用别名(与 OpenClaw 白名单逻辑一致)"""
192
219
  if not model_id:
@@ -269,9 +269,12 @@ def has_recent_errors(agent_id: str, minutes: int = 5) -> bool:
269
269
 
270
270
 
271
271
  def get_last_error(agent_id: str) -> Optional[Dict[str, Any]]:
272
- """获取最近的错误信息"""
272
+ """
273
+ 获取最近的错误信息,优先从 session stopReason=error 获取,
274
+ 若无则从 runs.json 中最近结束的 error run 兜底。
275
+ """
273
276
  messages = get_recent_messages(agent_id, limit=100)
274
-
277
+
275
278
  for msg in reversed(messages):
276
279
  if msg.get('stopReason') == 'error':
277
280
  return {
@@ -279,7 +282,39 @@ def get_last_error(agent_id: str) -> Optional[Dict[str, Any]]:
279
282
  'message': msg.get('errorMessage', ''),
280
283
  'timestamp': msg.get('timestamp', 0)
281
284
  }
282
-
285
+
286
+ # 兜底:检查 runs.json 中最近结束的 error run
287
+ run_error = _get_last_run_error(agent_id)
288
+ if run_error:
289
+ return run_error
290
+
291
+ return None
292
+
293
+
294
def _get_last_run_error(agent_id: str, minutes: int = 5) -> Optional[Dict[str, Any]]:
    """Return error info for the most recently ended error run of an agent.

    This is the fallback for when no session message carries
    ``stopReason == 'error'``: it looks at the agent's recent runs instead.

    Args:
        agent_id: Agent whose runs are inspected (up to 20 are requested).
        minutes: Only runs that ended within this many minutes are considered.

    Returns:
        A dict with ``type``, ``message``, ``timestamp`` (the run's
        ``endedAt``) and ``source == 'run'``, or ``None`` if no run in the
        window ended with ``outcome.status == 'error'``.
    """
    import time
    from data.subagent_reader import get_agent_runs

    runs = get_agent_runs(agent_id, limit=20)
    cutoff = int(time.time() * 1000) - minutes * 60 * 1000

    latest_ended = None
    latest_outcome = None
    for run in runs:
        ended = run.get('endedAt')
        # Skip unfinished runs and malformed (non-numeric) end times.
        if not ended or not isinstance(ended, (int, float)) or ended < cutoff:
            continue
        outcome = run.get('outcome')
        if not isinstance(outcome, dict) or outcome.get('status') != 'error':
            continue
        # Do not rely on list order: keep the run that ended last.
        if latest_ended is None or ended > latest_ended:
            latest_ended = ended
            latest_outcome = outcome

    if latest_outcome is None:
        return None

    error_msg = latest_outcome.get('error', '') or ''
    return {
        'type': detect_error_type(error_msg),
        'message': error_msg,
        'timestamp': latest_ended,
        'source': 'run'  # marks where the error came from, for debugging
    }
284
319
 
285
320
 
@@ -666,22 +666,9 @@ def resolve_agent_session_jsonl(
666
666
  if isinstance(index_map.get(k), dict) and str(k).startswith(prefix)
667
667
  ]
668
668
 
669
- # 1) 与当前子任务最一致:runs.json 中该 agent 最近一次 run 的 childSessionKey
670
- runs = get_subagent_runs().get(state_id, [])
671
- if runs:
672
- runs.sort(key=lambda x: x.get('startedAt', 0), reverse=True)
673
- preferred_key = runs[0].get('childSessionKey')
674
- if preferred_key and preferred_key in index_map:
675
- ent = index_map[preferred_key]
676
- if isinstance(ent, dict):
677
- p = resolve_session_jsonl_path(sessions_path, ent)
678
- if p and p.is_file():
679
- sid = ent.get('sessionId') or preferred_key
680
- return p, sid, preferred_key
681
-
682
- # 2) 按 sessions.json 的 updatedAt/lastMessageAt 选最近会话(在 glob mtime 之前)
683
- # OpenClaw 在任务结束后可能从 runs.json 移除 run,此处仍可定位「最近活跃」子会话 jsonl。
684
- # 多文件时比仅凭 *.jsonl 的 mtime 更稳,且与 4/24 当晚最晚更新 session 一致。
669
+ # 直接按 sessions.json updatedAt 选最新会话。
670
+ # runs.json 中的 run 即使已结束也仍保留在列表中,用它优先会错误选中旧 session;
671
+ # 而 updatedAt 由 OpenClaw 维护,能准确反映会话的实际最后活跃时间。
685
672
  if agent_keys:
686
673
  agent_keys.sort(
687
674
  key=lambda k: (index_map[k].get('updatedAt') or index_map[k].get('lastMessageAt') or 0),
@@ -30,6 +30,28 @@ MAIN_AGENT_SOLO_STREAM_GRACE_SEC = 20
30
30
 
31
31
  AgentStatus = Literal['idle', 'working', 'down']
32
32
 
33
+ # 最近多久内的 error run 应视为 down 状态(分钟)
34
+ _RECENT_ERROR_RUN_WINDOW_MINUTES = 5
35
+
36
+
37
def _has_recent_error_run(agent_id: str, minutes: int = _RECENT_ERROR_RUN_WINDOW_MINUTES) -> bool:
    """Tell whether any of the agent's recent runs ended in error within the window.

    A run counts when its ``endedAt`` is set and not older than ``minutes``
    minutes, and its ``outcome`` is a dict with ``status == 'error'``.
    This complements the session-level ``stopReason=error`` check.
    """
    import time
    recent_runs = get_agent_runs(agent_id, limit=20)
    oldest_allowed_ms = int(time.time() * 1000) - minutes * 60 * 1000

    def _ended_with_error_in_window(run) -> bool:
        ended_at = run.get('endedAt')
        if not ended_at or ended_at < oldest_allowed_ms:
            return False
        result = run.get('outcome')
        return isinstance(result, dict) and result.get('status') == 'error'

    return any(_ended_with_error_in_window(run) for run in recent_runs)
54
+
33
55
 
34
56
  def _main_agent_solo_processing(agent_id: str) -> bool:
35
57
  """
@@ -58,16 +80,16 @@ def _main_agent_solo_processing(agent_id: str) -> bool:
58
80
  def calculate_agent_status(agent_id: str, use_cache: bool = True) -> AgentStatus:
59
81
  """
60
82
  计算 Agent 状态(基于 runs.json + sessions.json)
61
-
83
+
62
84
  优先级:
63
- 1. 异常 (down) - 最近5分钟有 stopReason=error
85
+ 1. 异常 (down) - 最近5分钟有 stopReason=error,或有最近结束的 error run
64
86
  2. 工作中 (working) - 有活跃 subagent run;或主 Agent 且无 run 时 thinking / 未完成工具 / 短窗内会话写入
65
87
  3. 空闲 (idle) - 其余情况(子 Agent 无 run 即空闲,与协作图 activePath 一致)
66
-
88
+
67
89
  Args:
68
90
  agent_id: Agent ID
69
91
  use_cache: 是否使用缓存(默认 True)
70
-
92
+
71
93
  Returns:
72
94
  Agent 状态
73
95
  """
@@ -82,6 +104,8 @@ def calculate_agent_status(agent_id: str, use_cache: bool = True) -> AgentStatus
82
104
  # 重新计算
83
105
  if has_recent_errors(agent_id, minutes=5):
84
106
  status = 'down'
107
+ elif _has_recent_error_run(agent_id, minutes=5):
108
+ status = 'down'
85
109
  elif is_agent_working(agent_id):
86
110
  status = 'working'
87
111
  elif _main_agent_solo_processing(agent_id):
@@ -124,7 +148,8 @@ def get_agents_with_status() -> list:
124
148
  try:
125
149
  status = calculate_agent_status(agent_id)
126
150
  current_task = get_current_task(agent_id)
127
- if status == 'idle':
151
+ # idle 且无已结束 run 任务时才清空 currentTask
152
+ if status == 'idle' and not current_task:
128
153
  current_task = ''
129
154
  last_active = get_last_active_time(agent_id)
130
155
  last_error = get_last_error(agent_id) if status == 'down' else None
@@ -155,16 +180,32 @@ def get_agents_with_status() -> list:
155
180
def get_current_task(agent_id: str) -> str:
    """Return a short description of the agent's current (or last) task.

    Priority:
    1. The first run without ``endedAt`` (still executing): its task text.
    2. Otherwise the first ended run that has a task text, prefixed with
       ``'[失败] '`` when its outcome status is ``'error'`` and ``'[已结束] '``
       otherwise, so interrupted or failed tasks remain visible.

    Task text longer than 60 characters is cut to 57 characters plus ``'...'``
    in both cases; shorter text is returned unchanged. Returns ``''`` when no
    run qualifies.
    """
    def _shorten(text: str) -> str:
        # One rule for both branches; truncation never lengthens the text.
        return text[:57] + '...' if len(text) > 60 else text

    runs = get_agent_runs(agent_id, limit=40)

    # Priority 1: a run that has not ended yet.
    for run in runs:
        if run.get('endedAt') is None:
            return _shorten(run.get('task', '') or '')

    # Priority 2: the first ended run that carries a task description.
    for run in runs:
        if run.get('endedAt') is None:
            continue
        task = run.get('task', '') or ''
        if not task:
            continue
        outcome = run.get('outcome', {})
        status = outcome.get('status') if isinstance(outcome, dict) else None
        prefix = '[失败] ' if status == 'error' else '[已结束] '
        return prefix + _shorten(task)

    return ''
170
211