openclaw-agent-dashboard 1.0.43 → 1.0.45
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dashboard/api/performance.py +118 -26
- package/dashboard/api/timeline.py +25 -8
- package/dashboard/api/websocket.py +29 -6
- package/dashboard/core/schemas/base.py +8 -7
- package/dashboard/data/config_reader.py +37 -10
- package/dashboard/data/session_reader.py +38 -3
- package/dashboard/data/timeline_reader.py +3 -16
- package/dashboard/status/status_calculator.py +52 -11
- package/dashboard/utils/data_repair.py +13 -14
- package/dashboard/watchers/file_watcher.py +3 -3
- package/frontend-dist/assets/{index-CrwySDZq.css → index-BNC0j5Qz.css} +1 -1
- package/frontend-dist/assets/index-DhvK9bbq.js +24 -0
- package/frontend-dist/index.html +2 -2
- package/openclaw.plugin.json +4 -1
- package/package.json +1 -1
- package/frontend-dist/assets/index-BtN_FdUX.js +0 -24
|
@@ -3,9 +3,12 @@
|
|
|
3
3
|
支持按分钟查看调用详情,便于分析调用瓶颈
|
|
4
4
|
"""
|
|
5
5
|
from fastapi import APIRouter
|
|
6
|
-
from typing import List, Dict, Any, Optional
|
|
6
|
+
from typing import List, Dict, Any, Optional, Tuple
|
|
7
|
+
import copy
|
|
7
8
|
import json
|
|
8
9
|
import re
|
|
10
|
+
import asyncio
|
|
11
|
+
import time
|
|
9
12
|
from pathlib import Path
|
|
10
13
|
from datetime import datetime, timedelta, timezone
|
|
11
14
|
from zoneinfo import ZoneInfo
|
|
@@ -19,6 +22,31 @@ TZ_DISPLAY = ZoneInfo('Asia/Shanghai')
|
|
|
19
22
|
|
|
20
23
|
router = APIRouter()
|
|
21
24
|
|
|
25
|
+
# 聚合统计多次并发请求(WS + 轮询 + 多标签)共用;TTL 短以保证大致实时
|
|
26
|
+
_perf_stats_cache: Dict[str, Tuple[float, Dict[str, Any]]] = {}
|
|
27
|
+
_PERF_STATS_CACHE_TTL_SEC = 12.0
|
|
28
|
+
|
|
29
|
+
# 柱体钻取:多次点击 / 并发标签共用短缓存
|
|
30
|
+
_perf_details_cache: Dict[str, Tuple[float, Dict[str, Any]]] = {}
|
|
31
|
+
_PERF_DETAILS_CACHE_TTL_SEC = 12.0
|
|
32
|
+
|
|
33
|
+
# Cheap pre-filter: pull the ISO timestamp out of a raw JSONL line with a regex
# so rows that are clearly older than the query window can be skipped without
# paying for json.loads + schema validation.
_QUICK_ENV_TS_RE = re.compile(r'"timestamp"\s*:\s*"([^"]+)"')


def _quick_envelope_timestamp_utc(line: str) -> Optional[datetime]:
    """Return the first ``"timestamp"`` value found in *line* as an aware datetime.

    Args:
        line: One raw line of a session ``.jsonl`` file.

    Returns:
        A timezone-aware ``datetime`` parsed from the first
        ``"timestamp": "<iso string>"`` pair in the line (a trailing ``Z`` is
        accepted as UTC), or ``None`` when no such pair exists, the value is
        not valid ISO-8601, or the value carries no UTC offset.

    A ``None`` result means "could not decide cheaply": the caller is expected
    to fall through to the full parse instead of skipping the line.
    """
    m = _QUICK_ENV_TS_RE.search(line)
    if not m:
        return None
    try:
        parsed = datetime.fromisoformat(m.group(1).replace("Z", "+00:00"))
    except ValueError:
        return None
    # A naive datetime cannot be ordered against the timezone-aware cutoff
    # (Python raises TypeError), so treat it as "unknown" rather than guessing
    # which zone the writer meant.
    if parsed.tzinfo is None or parsed.utcoffset() is None:
        return None
    return parsed
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _perf_cache_key(range_minutes: int, range_hours: int, granularity: str) -> str:
    """Build the stats-cache key: the three arguments joined with ``:``."""
    parts = (range_minutes, range_hours, granularity)
    return ":".join(str(part) for part in parts)
|
|
49
|
+
|
|
22
50
|
|
|
23
51
|
def _extract_trigger_text(msg: Dict) -> str:
|
|
24
52
|
"""从消息中提取触发内容(完整展示)"""
|
|
@@ -177,11 +205,25 @@ def parse_session_file(session_path: Path, range_hours: int = 1) -> List[Dict]:
|
|
|
177
205
|
range_hours: 时间范围(小时),0 表示不限制
|
|
178
206
|
"""
|
|
179
207
|
messages = []
|
|
208
|
+
now = datetime.now(timezone.utc)
|
|
209
|
+
time_ago = now - timedelta(hours=range_hours) if range_hours > 0 else None
|
|
210
|
+
|
|
211
|
+
# 启发式:窗口内若有 assistant usage,文件通常在窗口内有过写入;过久未修改则可跳过整文件
|
|
212
|
+
if time_ago is not None:
|
|
213
|
+
try:
|
|
214
|
+
if session_path.stat().st_mtime < time_ago.timestamp():
|
|
215
|
+
return []
|
|
216
|
+
except OSError:
|
|
217
|
+
return []
|
|
180
218
|
|
|
181
219
|
try:
|
|
182
220
|
with open(session_path, 'r', encoding='utf-8') as f:
|
|
183
221
|
for line in f:
|
|
184
222
|
try:
|
|
223
|
+
if time_ago is not None:
|
|
224
|
+
qt = _quick_envelope_timestamp_utc(line)
|
|
225
|
+
if qt is not None and qt < time_ago:
|
|
226
|
+
continue
|
|
185
227
|
envelope, msg = parse_session_jsonl_line(line)
|
|
186
228
|
if (
|
|
187
229
|
not envelope
|
|
@@ -200,11 +242,8 @@ def parse_session_file(session_path: Path, range_hours: int = 1) -> List[Dict]:
|
|
|
200
242
|
str(envelope['timestamp']).replace('Z', '+00:00')
|
|
201
243
|
)
|
|
202
244
|
|
|
203
|
-
if
|
|
204
|
-
|
|
205
|
-
time_ago = now - timedelta(hours=range_hours)
|
|
206
|
-
if timestamp < time_ago:
|
|
207
|
-
continue
|
|
245
|
+
if time_ago is not None and timestamp < time_ago:
|
|
246
|
+
continue
|
|
208
247
|
|
|
209
248
|
messages.append({
|
|
210
249
|
'timestamp': timestamp,
|
|
@@ -240,14 +279,8 @@ async def get_performance_stats(range: str = "20m"):
|
|
|
240
279
|
return stats
|
|
241
280
|
|
|
242
281
|
|
|
243
|
-
|
|
244
|
-
"""
|
|
245
|
-
|
|
246
|
-
Args:
|
|
247
|
-
range_minutes: 时间范围(分钟)
|
|
248
|
-
range_hours: 用于解析 session 的时间范围(小时)
|
|
249
|
-
granularity: 聚合粒度 (minute, hour)
|
|
250
|
-
"""
|
|
282
|
+
def _compute_real_stats_sync(range_minutes: int = 20, range_hours: int = 1, granularity: str = "minute") -> Dict:
|
|
283
|
+
"""同步聚合 TPM/RPM(在线程池中运行,避免阻塞事件循环)。"""
|
|
251
284
|
stats = {
|
|
252
285
|
'current': {
|
|
253
286
|
'tpm': 0,
|
|
@@ -381,24 +414,38 @@ async def get_real_stats(range_minutes: int = 20, range_hours: int = 1, granular
|
|
|
381
414
|
return stats
|
|
382
415
|
|
|
383
416
|
|
|
384
|
-
async def
|
|
417
|
+
async def get_real_stats(range_minutes: int = 20, range_hours: int = 1, granularity: str = "minute") -> Dict:
    """Return TPM/RPM statistics, computed off the event loop and cached briefly.

    The aggregation itself runs in ``_compute_real_stats_sync`` on a worker
    thread via ``asyncio.to_thread`` so the event loop is not blocked. Results
    are kept in ``_perf_stats_cache`` for ``_PERF_STATS_CACHE_TTL_SEC`` seconds,
    keyed by the three arguments.

    Args:
        range_minutes: Forwarded unchanged to ``_compute_real_stats_sync``.
        range_hours: Forwarded unchanged to ``_compute_real_stats_sync``.
        granularity: Forwarded unchanged to ``_compute_real_stats_sync``.

    Returns:
        The statistics dict. Every caller receives its own copy, so mutating
        the result never alters what later cache hits observe (the same
        contract the details cache in this module already provides).
    """
    key = _perf_cache_key(range_minutes, range_hours, granularity)
    now = time.monotonic()
    hit = _perf_stats_cache.get(key)
    if hit is not None and (now - hit[0]) < _PERF_STATS_CACHE_TTL_SEC:
        # Hand out a copy: the cached dict is shared by every request that
        # hits this key within the TTL.
        return copy.deepcopy(hit[1])
    data = await asyncio.to_thread(_compute_real_stats_sync, range_minutes, range_hours, granularity)
    # Store a private snapshot so the object returned below can be mutated
    # freely by this caller.
    _perf_stats_cache[key] = (now, copy.deepcopy(data))
    return data
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
def _perf_details_cache_key(
    timestamp_ms: int,
    granularity: str,
    agent: str,
    search: str,
    sort: str,
    limit: int,
) -> str:
    """Build an unambiguous cache key for one drill-down query.

    The fields are encoded as a JSON array rather than joined with a plain
    delimiter: ``agent`` and ``search`` are free text, so with a bare ``:``
    separator ``agent="a:b", search="c"`` and ``agent="a", search="b:c"``
    would produce the same key and one query could be served the other's
    cached result.

    Returns:
        A string that is equal for two calls only when all six arguments are
        equal.
    """
    return json.dumps(
        [timestamp_ms, granularity, agent, search, sort, limit],
        ensure_ascii=False,
    )
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
def _compute_minute_details_sync(
|
|
385
441
|
timestamp_ms: int,
|
|
386
442
|
granularity: str = "minute",
|
|
387
443
|
agent: Optional[str] = None,
|
|
388
444
|
search: Optional[str] = None,
|
|
389
445
|
sort: str = "tokens_desc",
|
|
390
|
-
limit: int = 50
|
|
446
|
+
limit: int = 50,
|
|
391
447
|
) -> Dict[str, Any]:
|
|
392
|
-
"""
|
|
393
|
-
|
|
394
|
-
Args:
|
|
395
|
-
timestamp_ms: Unix 毫秒时间戳
|
|
396
|
-
granularity: 粒度 (minute, hour)
|
|
397
|
-
agent: 筛选指定 Agent
|
|
398
|
-
search: 搜索触发内容
|
|
399
|
-
sort: 排序方式 (tokens_desc, tokens_asc, time_asc, time_desc)
|
|
400
|
-
limit: 返回数量限制
|
|
401
|
-
"""
|
|
448
|
+
"""同步聚合柱体钻取数据(线程池 + 短 TTL 缓存)。"""
|
|
402
449
|
try:
|
|
403
450
|
ts = datetime.fromtimestamp(timestamp_ms / 1000, tz=timezone.utc)
|
|
404
451
|
ts_local = ts.astimezone(TZ_DISPLAY)
|
|
@@ -419,6 +466,7 @@ async def get_minute_details(
|
|
|
419
466
|
|
|
420
467
|
all_calls = []
|
|
421
468
|
agent_set = set()
|
|
469
|
+
window_start_ts = time_start.timestamp()
|
|
422
470
|
|
|
423
471
|
for agent_dir in agents_path.iterdir():
|
|
424
472
|
if not agent_dir.is_dir():
|
|
@@ -437,6 +485,12 @@ async def get_minute_details(
|
|
|
437
485
|
for session_file in sessions_path.glob('*.jsonl'):
|
|
438
486
|
if 'lock' in session_file.name or 'deleted' in session_file.name:
|
|
439
487
|
continue
|
|
488
|
+
try:
|
|
489
|
+
# 与 parse_session_file 相同启发式:窗口开始后未修改的文件不可能含该窗内的 assistant 记录
|
|
490
|
+
if session_file.stat().st_mtime < window_start_ts:
|
|
491
|
+
continue
|
|
492
|
+
except OSError:
|
|
493
|
+
continue
|
|
440
494
|
records = parse_session_file_with_details(session_file, agent_id)
|
|
441
495
|
for r in records:
|
|
442
496
|
if time_start <= r['timestamp'] < time_end:
|
|
@@ -503,6 +557,44 @@ async def get_minute_details(
|
|
|
503
557
|
return {'timeWindow': '', 'calls': [], 'totalCalls': 0, 'totalTokens': 0, 'summary': {'avgTokens': 0}, 'agents': [], 'pagination': {'total': 0, 'limit': limit, 'hasMore': False}}
|
|
504
558
|
|
|
505
559
|
|
|
560
|
+
async def get_minute_details(
    timestamp_ms: int,
    granularity: str = "minute",
    agent: Optional[str] = None,
    search: Optional[str] = None,
    sort: str = "tokens_desc",
    limit: int = 50
) -> Dict[str, Any]:
    """Return call details for one time window (used when a chart bar is clicked).

    The scan runs in ``_compute_minute_details_sync`` on a worker thread and the
    result is cached for ``_PERF_DETAILS_CACHE_TTL_SEC`` seconds so repeated
    clicks and parallel tabs share one scan.

    Args:
        timestamp_ms: Unix timestamp in milliseconds identifying the window.
        granularity: Window size, ``minute`` or ``hour``.
        agent: Restrict the result to this agent; ``None`` and ``""`` share a
            cache entry.
        search: Filter on trigger content; ``None`` and ``""`` share a cache
            entry.
        sort: One of ``tokens_desc``, ``tokens_asc``, ``time_asc``,
            ``time_desc``.
        limit: Maximum number of calls to return.

    Returns:
        The details dict produced by ``_compute_minute_details_sync``. Callers
        always get an object that is independent of the cached copy.
    """
    ag = agent or ""
    sr = search or ""
    key = _perf_details_cache_key(timestamp_ms, granularity, ag, sr, sort, limit)
    now = time.monotonic()
    hit = _perf_details_cache.get(key)
    if hit is not None and (now - hit[0]) < _PERF_DETAILS_CACHE_TTL_SEC:
        return copy.deepcopy(hit[1])
    data = await asyncio.to_thread(
        _compute_minute_details_sync,
        timestamp_ms,
        granularity,
        agent,
        search,
        sort,
        limit,
    )
    # The key contains the clicked timestamp and free-text search, so the key
    # space is unbounded. Drop everything that has already expired before
    # inserting, otherwise the dict only ever grows. The key list is
    # materialized first so the dict is not mutated while being iterated.
    expired = [
        cached_key
        for cached_key, (stamp, _payload) in _perf_details_cache.items()
        if (now - stamp) >= _PERF_DETAILS_CACHE_TTL_SEC
    ]
    for cached_key in expired:
        _perf_details_cache.pop(cached_key, None)
    _perf_details_cache[key] = (now, copy.deepcopy(data))
    return data
|
|
596
|
+
|
|
597
|
+
|
|
506
598
|
@router.get("/performance/details")
|
|
507
599
|
async def get_performance_details(
|
|
508
600
|
timestamp: int,
|
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Timeline API 路由 - 实时执行时序图
|
|
3
3
|
"""
|
|
4
|
+
import asyncio
|
|
5
|
+
import copy
|
|
4
6
|
import logging
|
|
5
7
|
import time
|
|
6
8
|
from fastapi import APIRouter, Query, HTTPException
|
|
7
9
|
from pydantic import BaseModel
|
|
8
|
-
from typing import Optional, List, Dict, Any
|
|
10
|
+
from typing import Optional, List, Dict, Any, Tuple
|
|
9
11
|
import sys
|
|
10
12
|
from pathlib import Path
|
|
11
13
|
|
|
@@ -20,6 +22,14 @@ from data.config_reader import get_agent_config
|
|
|
20
22
|
|
|
21
23
|
router = APIRouter()
|
|
22
24
|
|
|
25
|
+
# 切换 agent / 轮询重复命中时减轻重复读盘解析(短时 stale 可接受)
|
|
26
|
+
_timeline_cache: Dict[str, Tuple[float, Dict[str, Any]]] = {}
|
|
27
|
+
_TIMELINE_CACHE_TTL_SEC = 5.0
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _timeline_cache_key(agent_id: str, session_key: Optional[str], limit: int) -> str:
    """Build the timeline cache key: agent id, session key and limit joined by NUL.

    A missing (``None``) or empty session key contributes an empty field.
    """
    fields = (agent_id, session_key or '', limit)
    return "\x00".join(str(field) for field in fields)
|
|
32
|
+
|
|
23
33
|
|
|
24
34
|
class TimelineStats(BaseModel):
|
|
25
35
|
totalDuration: int
|
|
@@ -80,11 +90,18 @@ async def get_timeline(
|
|
|
80
90
|
raise HTTPException(status_code=404, detail=f"Agent {agent_id} not found")
|
|
81
91
|
|
|
82
92
|
t0 = time.perf_counter()
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
93
|
+
cache_key = _timeline_cache_key(agent_id, session_key, limit)
|
|
94
|
+
now_mono = time.monotonic()
|
|
95
|
+
hit = _timeline_cache.get(cache_key)
|
|
96
|
+
if hit is not None and (now_mono - hit[0]) < _TIMELINE_CACHE_TTL_SEC:
|
|
97
|
+
result = copy.deepcopy(hit[1])
|
|
98
|
+
else:
|
|
99
|
+
try:
|
|
100
|
+
result = await asyncio.to_thread(get_timeline_steps, agent_id, session_key, limit)
|
|
101
|
+
except Exception as e:
|
|
102
|
+
record_error("unknown", str(e), "api:timeline:get", exc=e)
|
|
103
|
+
raise HTTPException(status_code=500, detail=safe_api_error_detail(e)) from e
|
|
104
|
+
_timeline_cache[cache_key] = (now_mono, copy.deepcopy(result))
|
|
88
105
|
elapsed_ms = (time.perf_counter() - t0) * 1000
|
|
89
106
|
steps_count = len(result.get("steps", []))
|
|
90
107
|
if elapsed_ms >= 100.0:
|
|
@@ -127,7 +144,7 @@ async def get_timeline_steps_only(
|
|
|
127
144
|
raise HTTPException(status_code=404, detail=f"Agent {agent_id} not found")
|
|
128
145
|
|
|
129
146
|
try:
|
|
130
|
-
result = get_timeline_steps
|
|
147
|
+
result = await asyncio.to_thread(get_timeline_steps, agent_id, session_key, limit)
|
|
131
148
|
except Exception as e:
|
|
132
149
|
record_error("unknown", str(e), "api:timeline:steps", exc=e)
|
|
133
150
|
raise HTTPException(status_code=500, detail=safe_api_error_detail(e)) from e
|
|
@@ -153,7 +170,7 @@ async def get_timeline_summary(agent_id: str, session_key: Optional[str] = Query
|
|
|
153
170
|
raise HTTPException(status_code=404, detail=f"Agent {agent_id} not found")
|
|
154
171
|
|
|
155
172
|
try:
|
|
156
|
-
result = get_timeline_steps
|
|
173
|
+
result = await asyncio.to_thread(get_timeline_steps, agent_id, session_key, 10) # 只需基本信息
|
|
157
174
|
except Exception as e:
|
|
158
175
|
record_error("unknown", str(e), "api:timeline:summary", exc=e)
|
|
159
176
|
raise HTTPException(status_code=500, detail=safe_api_error_detail(e)) from e
|
|
@@ -7,6 +7,7 @@ from typing import Set, List, Dict, Any
|
|
|
7
7
|
import json
|
|
8
8
|
import asyncio
|
|
9
9
|
import sys
|
|
10
|
+
import time
|
|
10
11
|
from pathlib import Path
|
|
11
12
|
|
|
12
13
|
sys.path.append(str(Path(__file__).parent.parent))
|
|
@@ -18,30 +19,46 @@ router = APIRouter()
|
|
|
18
19
|
# 活跃的 WebSocket 连接
|
|
19
20
|
active_connections: Set[WebSocket] = set()
|
|
20
21
|
|
|
21
|
-
#
|
|
22
|
-
BROADCAST_INTERVAL_SEC =
|
|
22
|
+
# 周期性增量检查基准间隔(秒);空闲时会自动退避拉长(见 _periodic_broadcast_loop)
|
|
23
|
+
BROADCAST_INTERVAL_SEC = 5
|
|
23
24
|
_broadcast_task: asyncio.Task | None = None
|
|
25
|
+
_broadcast_sleep_sec: float = float(BROADCAST_INTERVAL_SEC)
|
|
26
|
+
_broadcast_idle_streak: int = 0
|
|
27
|
+
|
|
28
|
+
# 文件监听等高频触发下合并 full_state,降低前端解析与重绘压力
|
|
29
|
+
FULL_STATE_MIN_INTERVAL_SEC = 2.0
|
|
30
|
+
_last_full_state_monotonic: float = 0.0
|
|
24
31
|
|
|
25
32
|
|
|
26
33
|
async def _periodic_broadcast_loop():
    """Periodically push incremental agent-state updates to connected clients.

    Each round sleeps ``_broadcast_sleep_sec`` seconds. When a round finds
    changed agents the interval is reset to ``BROADCAST_INTERVAL_SEC`` and the
    changes are broadcast; from the third consecutive round without changes
    onward the interval is doubled each round, capped at 30 seconds. Rounds
    with no active connections do nothing and leave the back-off state as is.

    Cancellation is propagated; any other exception is reported through
    ``record_error`` and the loop keeps running.
    """
    global _broadcast_sleep_sec, _broadcast_idle_streak
    while True:
        await asyncio.sleep(_broadcast_sleep_sec)
        if not active_connections:
            continue
        try:
            from status.status_calculator import get_changed_agents
            changed = await get_changed_agents()
            if not changed:
                # Quiet round: count it and start backing off after three in a row.
                _broadcast_idle_streak += 1
                if _broadcast_idle_streak >= 3:
                    _broadcast_sleep_sec = min(_broadcast_sleep_sec * 2.0, 30.0)
            else:
                # Activity seen: return to the base cadence before pushing.
                _broadcast_idle_streak = 0
                _broadcast_sleep_sec = float(BROADCAST_INTERVAL_SEC)
                await broadcast_state_update(changed)
        except asyncio.CancelledError:
            raise
        except Exception as err:
            record_error("unknown", str(err), "websocket:periodic_broadcast", exc=err)
|
|
39
54
|
|
|
40
55
|
|
|
41
56
|
def _ensure_broadcast_task():
    """Start the periodic broadcast task when clients are connected and none is running.

    Does nothing when there are no active connections or when the existing
    task is still alive. Otherwise the back-off state is reset to its base
    values and a new ``_periodic_broadcast_loop`` task is created.
    """
    global _broadcast_task, _broadcast_sleep_sec, _broadcast_idle_streak
    if not active_connections:
        return
    if _broadcast_task is not None and not _broadcast_task.done():
        return
    _broadcast_sleep_sec = float(BROADCAST_INTERVAL_SEC)
    _broadcast_idle_streak = 0
    _broadcast_task = asyncio.create_task(_periodic_broadcast_loop())
|
|
46
63
|
|
|
47
64
|
|
|
@@ -210,9 +227,15 @@ async def broadcast_full_state():
|
|
|
210
227
|
优化点:
|
|
211
228
|
1. 使用 get_collaboration_dynamic() 代替 get_collaboration()
|
|
212
229
|
2. 只推送动态数据,减少数据量
|
|
230
|
+
3. 短时间重复调用节流,避免监听线程连震时频繁全量推送
|
|
213
231
|
"""
|
|
232
|
+
global _last_full_state_monotonic
|
|
214
233
|
if not active_connections:
|
|
215
234
|
return
|
|
235
|
+
now = time.monotonic()
|
|
236
|
+
if now - _last_full_state_monotonic < FULL_STATE_MIN_INTERVAL_SEC:
|
|
237
|
+
return
|
|
238
|
+
_last_full_state_monotonic = now
|
|
216
239
|
try:
|
|
217
240
|
from .agents import get_agents as get_agents_list
|
|
218
241
|
from .subagents import get_subagents
|
|
@@ -25,19 +25,20 @@ class SchemaValidator:
|
|
|
25
25
|
self.schema = schema
|
|
26
26
|
self.strict = strict
|
|
27
27
|
self._validator = Draft202012Validator(schema)
|
|
28
|
-
self._last_errors: List[str] = []
|
|
29
28
|
|
|
30
29
|
def validate(self, data: Any) -> ValidationResult:
    """Validate *data* against the schema and report the outcome by return value.

    Nothing about the outcome is stored on the instance, so one validator can
    be shared by concurrent callers without their results overwriting each
    other.

    Returns:
        ``ValidationResult(True, [])`` on success; otherwise
        ``ValidationResult(False, [message])`` where the message is either
        ``"expected object"`` (schema type is ``object`` but *data* is neither
        a dict nor a list) or the ``jsonschema.ValidationError`` message.
    """
    if not isinstance(data, (dict, list)):
        # Short-circuit before running the full validator on an obviously wrong shape.
        if self.schema.get("type") == "object":
            return ValidationResult(False, ["expected object"])
    try:
        self._validator.validate(data)
    except jsonschema.ValidationError as exc:
        return ValidationResult(False, [exc.message])
    return ValidationResult(True, [])
|
|
41
41
|
|
|
42
42
|
def get_error_details(self) -> Dict[str, Any]:
    """Legacy accessor kept for compatibility; always reports an empty error list.

    It does not reflect the most recent validation (the validator may be
    shared between callers). Use the value returned by ``validate()`` instead.
    """
    details: Dict[str, Any] = {"errors": []}
    return details
|
|
@@ -152,22 +152,32 @@ def get_models_configured_by_agents() -> List[str]:
|
|
|
152
152
|
"""
|
|
153
153
|
从配置中收集「各 Agent 实际配置使用」的模型 ID(仅 primary + fallbacks)。
|
|
154
154
|
用于协作流程右侧模型面板:只显示有 Agent 配置的模型,不含白名单中未使用的。
|
|
155
|
+
|
|
156
|
+
策略:仅包含作为 primary 使用、或被某 agent 配置过的模型。
|
|
157
|
+
不包含 defaults.model.fallbacks 中没有任何 agent 当 primary 使用的模型。
|
|
155
158
|
"""
|
|
156
159
|
agents = get_agents_list()
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
default_model = defaults.get('model', {})
|
|
160
|
-
if default_model.get('primary'):
|
|
161
|
-
model_ids.add(default_model['primary'])
|
|
162
|
-
for fb in default_model.get('fallbacks') or []:
|
|
163
|
-
model_ids.add(fb)
|
|
160
|
+
# Step 1: 收集所有 primary 模型(用于判断 fallback 是否被实际使用)
|
|
161
|
+
primaries: List[str] = []
|
|
164
162
|
for agent in agents:
|
|
165
163
|
cfg = get_agent_models(agent.get('id', ''))
|
|
166
164
|
if cfg.get('primary'):
|
|
167
|
-
|
|
165
|
+
primaries.append(cfg['primary'])
|
|
166
|
+
primary_set = set(primaries)
|
|
167
|
+
|
|
168
|
+
# Step 2: 收集所有 primary
|
|
169
|
+
model_ids: List[str] = list(dict.fromkeys(primaries)) # 保持顺序去重
|
|
170
|
+
|
|
171
|
+
# Step 3: 只添加被某 agent 实际配置过的 fallback(不被 primary_set 包含的不添加)
|
|
172
|
+
seen = set(primary_set)
|
|
173
|
+
for agent in agents:
|
|
174
|
+
cfg = get_agent_models(agent.get('id', ''))
|
|
168
175
|
for fb in cfg.get('fallbacks', []):
|
|
169
|
-
|
|
170
|
-
|
|
176
|
+
if fb and fb not in seen:
|
|
177
|
+
model_ids.append(fb)
|
|
178
|
+
seen.add(fb)
|
|
179
|
+
|
|
180
|
+
return model_ids
|
|
171
181
|
|
|
172
182
|
|
|
173
183
|
def get_all_models_from_agents() -> List[str]:
|
|
@@ -187,6 +197,23 @@ def get_all_models_from_agents() -> List[str]:
|
|
|
187
197
|
return sorted(model_ids)
|
|
188
198
|
|
|
189
199
|
|
|
200
|
+
def get_default_models_from_defaults() -> List[str]:
    """Return the model IDs named by the default model config, primary first.

    Reads the ``model`` section of ``get_default_config()`` and returns its
    ``primary`` entry followed by its ``fallbacks``, with duplicates removed
    and the configured order preserved.

    Returns:
        A list of model IDs. Empty when the ``model`` section is missing or
        null, or when it names no models. Empty/null entries are skipped.
    """
    defaults = get_default_config()
    # ``or {}`` also covers an explicit ``model: null`` in the config, which
    # ``.get('model', {})`` would pass through as None.
    default_model = defaults.get('model') or {}
    result: List[str] = []
    candidates = [default_model.get('primary'), *(default_model.get('fallbacks') or [])]
    for model_id in candidates:
        # Skip blank entries, matching how per-agent fallbacks are filtered
        # elsewhere in this module.
        if model_id and model_id not in result:
            result.append(model_id)
    return result
|
|
215
|
+
|
|
216
|
+
|
|
190
217
|
def get_model_display_name(model_id: str) -> str:
|
|
191
218
|
"""获取模型显示名。展示策略:使用 id 不用别名(与 OpenClaw 白名单逻辑一致)"""
|
|
192
219
|
if not model_id:
|
|
@@ -269,9 +269,12 @@ def has_recent_errors(agent_id: str, minutes: int = 5) -> bool:
|
|
|
269
269
|
|
|
270
270
|
|
|
271
271
|
def get_last_error(agent_id: str) -> Optional[Dict[str, Any]]:
|
|
272
|
-
"""
|
|
272
|
+
"""
|
|
273
|
+
获取最近的错误信息,优先从 session stopReason=error 获取,
|
|
274
|
+
若无则从 runs.json 中最近结束的 error run 兜底。
|
|
275
|
+
"""
|
|
273
276
|
messages = get_recent_messages(agent_id, limit=100)
|
|
274
|
-
|
|
277
|
+
|
|
275
278
|
for msg in reversed(messages):
|
|
276
279
|
if msg.get('stopReason') == 'error':
|
|
277
280
|
return {
|
|
@@ -279,7 +282,39 @@ def get_last_error(agent_id: str) -> Optional[Dict[str, Any]]:
|
|
|
279
282
|
'message': msg.get('errorMessage', ''),
|
|
280
283
|
'timestamp': msg.get('timestamp', 0)
|
|
281
284
|
}
|
|
282
|
-
|
|
285
|
+
|
|
286
|
+
# 兜底:检查 runs.json 中最近结束的 error run
|
|
287
|
+
run_error = _get_last_run_error(agent_id)
|
|
288
|
+
if run_error:
|
|
289
|
+
return run_error
|
|
290
|
+
|
|
291
|
+
return None
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def _get_last_run_error(agent_id: str, minutes: int = 5) -> Optional[Dict[str, Any]]:
    """Return error info for the most recently ended failed run of an agent.

    Complements the session-based lookup for failures that are recorded on
    the run but never reach the session as ``stopReason=error``.

    Args:
        agent_id: Agent whose runs are inspected (the 20 runs returned by
            ``get_agent_runs``).
        minutes: Only runs whose ``endedAt`` falls within this many minutes
            before now are considered. Defaults to 5.

    Returns:
        ``{'type', 'message', 'timestamp', 'source': 'run'}`` for the
        qualifying run with the greatest ``endedAt``, or ``None`` when no run
        in the window has ``outcome.status == 'error'``.
    """
    import time
    from data.subagent_reader import get_agent_runs

    runs = get_agent_runs(agent_id, limit=20)
    # endedAt is compared against a millisecond epoch value.
    cutoff = int(time.time() * 1000) - minutes * 60 * 1000

    latest_run = None
    latest_ended = None
    for run in runs:
        ended = run.get('endedAt')
        if not ended or ended < cutoff:
            continue
        outcome = run.get('outcome')
        if not isinstance(outcome, dict) or outcome.get('status') != 'error':
            continue
        # Do not rely on the order get_agent_runs returns: keep whichever
        # failed run ended last (first one wins on a tie).
        if latest_ended is None or ended > latest_ended:
            latest_run, latest_ended = run, ended

    if latest_run is None:
        return None

    error_msg = latest_run['outcome'].get('error', '') or ''
    return {
        'type': detect_error_type(error_msg),
        'message': error_msg,
        'timestamp': latest_ended,
        'source': 'run'  # marks where the error came from, for debugging
    }
|
|
284
319
|
|
|
285
320
|
|
|
@@ -666,22 +666,9 @@ def resolve_agent_session_jsonl(
|
|
|
666
666
|
if isinstance(index_map.get(k), dict) and str(k).startswith(prefix)
|
|
667
667
|
]
|
|
668
668
|
|
|
669
|
-
#
|
|
670
|
-
runs
|
|
671
|
-
|
|
672
|
-
runs.sort(key=lambda x: x.get('startedAt', 0), reverse=True)
|
|
673
|
-
preferred_key = runs[0].get('childSessionKey')
|
|
674
|
-
if preferred_key and preferred_key in index_map:
|
|
675
|
-
ent = index_map[preferred_key]
|
|
676
|
-
if isinstance(ent, dict):
|
|
677
|
-
p = resolve_session_jsonl_path(sessions_path, ent)
|
|
678
|
-
if p and p.is_file():
|
|
679
|
-
sid = ent.get('sessionId') or preferred_key
|
|
680
|
-
return p, sid, preferred_key
|
|
681
|
-
|
|
682
|
-
# 2) 按 sessions.json 的 updatedAt/lastMessageAt 选最近会话(在 glob mtime 之前)
|
|
683
|
-
# OpenClaw 在任务结束后可能从 runs.json 移除 run,此处仍可定位「最近活跃」子会话 jsonl。
|
|
684
|
-
# 多文件时比仅凭 *.jsonl 的 mtime 更稳,且与 4/24 当晚最晚更新 session 一致。
|
|
669
|
+
# 直接按 sessions.json 的 updatedAt 选最新会话。
|
|
670
|
+
# runs.json 中的 run 即使已结束也仍保留在列表中,用它优先会错误选中旧 session;
|
|
671
|
+
# 而 updatedAt 由 OpenClaw 维护,能准确反映会话的实际最后活跃时间。
|
|
685
672
|
if agent_keys:
|
|
686
673
|
agent_keys.sort(
|
|
687
674
|
key=lambda k: (index_map[k].get('updatedAt') or index_map[k].get('lastMessageAt') or 0),
|
|
@@ -30,6 +30,28 @@ MAIN_AGENT_SOLO_STREAM_GRACE_SEC = 20
|
|
|
30
30
|
|
|
31
31
|
AgentStatus = Literal['idle', 'working', 'down']
|
|
32
32
|
|
|
33
|
+
# How recently (in minutes) a failed run must have ended to count towards 'down'.
_RECENT_ERROR_RUN_WINDOW_MINUTES = 5


def _has_recent_error_run(agent_id: str, minutes: int = _RECENT_ERROR_RUN_WINDOW_MINUTES) -> bool:
    """Tell whether any of the agent's runs ended with an error within the window.

    Looks at the 20 runs returned by ``get_agent_runs`` and reports ``True``
    as soon as one has an ``endedAt`` no older than *minutes* minutes and an
    ``outcome`` dict whose ``status`` is ``'error'``. Complements the
    session-level ``stopReason=error`` check for failures that are only
    recorded on the run.
    """
    import time

    def _ended_recently_with_error(run) -> bool:
        ended_at = run.get('endedAt')
        if not ended_at or ended_at < oldest_allowed_ms:
            return False
        result = run.get('outcome')
        return isinstance(result, dict) and result.get('status') == 'error'

    candidate_runs = get_agent_runs(agent_id, limit=20)
    # endedAt is compared against a millisecond epoch value.
    oldest_allowed_ms = int(time.time() * 1000) - minutes * 60 * 1000
    return any(_ended_recently_with_error(run) for run in candidate_runs)
|
|
54
|
+
|
|
33
55
|
|
|
34
56
|
def _main_agent_solo_processing(agent_id: str) -> bool:
|
|
35
57
|
"""
|
|
@@ -58,16 +80,16 @@ def _main_agent_solo_processing(agent_id: str) -> bool:
|
|
|
58
80
|
def calculate_agent_status(agent_id: str, use_cache: bool = True) -> AgentStatus:
|
|
59
81
|
"""
|
|
60
82
|
计算 Agent 状态(基于 runs.json + sessions.json)
|
|
61
|
-
|
|
83
|
+
|
|
62
84
|
优先级:
|
|
63
|
-
1. 异常 (down) - 最近5分钟有 stopReason=error
|
|
85
|
+
1. 异常 (down) - 最近5分钟有 stopReason=error,或有最近结束的 error run
|
|
64
86
|
2. 工作中 (working) - 有活跃 subagent run;或主 Agent 且无 run 时 thinking / 未完成工具 / 短窗内会话写入
|
|
65
87
|
3. 空闲 (idle) - 其余情况(子 Agent 无 run 即空闲,与协作图 activePath 一致)
|
|
66
|
-
|
|
88
|
+
|
|
67
89
|
Args:
|
|
68
90
|
agent_id: Agent ID
|
|
69
91
|
use_cache: 是否使用缓存(默认 True)
|
|
70
|
-
|
|
92
|
+
|
|
71
93
|
Returns:
|
|
72
94
|
Agent 状态
|
|
73
95
|
"""
|
|
@@ -82,6 +104,8 @@ def calculate_agent_status(agent_id: str, use_cache: bool = True) -> AgentStatus
|
|
|
82
104
|
# 重新计算
|
|
83
105
|
if has_recent_errors(agent_id, minutes=5):
|
|
84
106
|
status = 'down'
|
|
107
|
+
elif _has_recent_error_run(agent_id, minutes=5):
|
|
108
|
+
status = 'down'
|
|
85
109
|
elif is_agent_working(agent_id):
|
|
86
110
|
status = 'working'
|
|
87
111
|
elif _main_agent_solo_processing(agent_id):
|
|
@@ -124,7 +148,8 @@ def get_agents_with_status() -> list:
|
|
|
124
148
|
try:
|
|
125
149
|
status = calculate_agent_status(agent_id)
|
|
126
150
|
current_task = get_current_task(agent_id)
|
|
127
|
-
|
|
151
|
+
# idle 且无已结束 run 任务时才清空 currentTask
|
|
152
|
+
if status == 'idle' and not current_task:
|
|
128
153
|
current_task = ''
|
|
129
154
|
last_active = get_last_active_time(agent_id)
|
|
130
155
|
last_error = get_last_error(agent_id) if status == 'down' else None
|
|
@@ -155,16 +180,32 @@ def get_agents_with_status() -> list:
|
|
|
155
180
|
def _truncate_task(task: str, max_len: int = 60) -> str:
    """Shorten *task* to at most *max_len* characters, ending in ``...`` when cut."""
    if len(task) > max_len:
        return task[:max_len - 3] + '...'
    return task


def get_current_task(agent_id: str) -> str:
    """Return a short description of the agent's current (or last) task.

    Priority:
        1. The first run without ``endedAt`` (still executing): its task text.
        2. Otherwise the first ended run that has task text, prefixed with
           ``[失败] `` when its outcome status is ``error`` and ``[已结束] ``
           otherwise, so an interrupted task stays visible.

    Task text longer than 60 characters is cut to 57 characters plus ``...``
    in both cases; the prefix is added after truncation.

    Returns:
        The description, or ``''`` when no run provides one.
    """
    runs = get_agent_runs(agent_id, limit=40)

    # Priority 1: a run that has not ended yet.
    for run in runs:
        if run.get('endedAt') is None:
            return _truncate_task(run.get('task', '') or '')

    # Priority 2: the first ended run that carries task text.
    for run in runs:
        if run.get('endedAt') is None:
            continue
        task = run.get('task', '') or ''
        if not task:
            continue
        outcome = run.get('outcome', {})
        status = outcome.get('status') if isinstance(outcome, dict) else None
        prefix = '[失败] ' if status == 'error' else '[已结束] '
        # Same length rule as the active branch: a 58-60 character task is
        # returned whole instead of being cut to 57 + '...' (which saved nothing).
        return prefix + _truncate_task(task)

    return ''
|
|
170
211
|
|