openclaw-agent-dashboard 1.0.39 → 1.0.40

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. package/dashboard/api/agent_config_api.py +28 -7
  2. package/dashboard/api/agents.py +48 -10
  3. package/dashboard/api/agents_config.py +5 -1
  4. package/dashboard/api/chains.py +25 -5
  5. package/dashboard/api/collaboration.py +10 -9
  6. package/dashboard/api/debug_paths.py +5 -1
  7. package/dashboard/api/error_analysis.py +29 -11
  8. package/dashboard/api/errors.py +27 -11
  9. package/dashboard/api/fortify_routes.py +80 -0
  10. package/dashboard/api/input_safety.py +60 -0
  11. package/dashboard/api/performance.py +73 -53
  12. package/dashboard/api/subagents.py +95 -99
  13. package/dashboard/api/timeline.py +24 -3
  14. package/dashboard/api/version.py +2 -0
  15. package/dashboard/api/websocket.py +9 -7
  16. package/dashboard/core/__init__.py +1 -0
  17. package/dashboard/core/config_fortify.py +112 -0
  18. package/dashboard/core/error_handler.py +339 -0
  19. package/dashboard/core/fallback_manager.py +70 -0
  20. package/dashboard/core/safe_api_error.py +76 -0
  21. package/dashboard/core/schemas/__init__.py +16 -0
  22. package/dashboard/core/schemas/base.py +43 -0
  23. package/dashboard/core/schemas/session_schema.py +40 -0
  24. package/dashboard/core/schemas/subagent_schema.py +23 -0
  25. package/dashboard/data/agent_config_manager.py +6 -4
  26. package/dashboard/data/chain_reader.py +16 -12
  27. package/dashboard/data/error_analyzer.py +15 -11
  28. package/dashboard/data/session_reader.py +268 -46
  29. package/dashboard/data/subagent_reader.py +74 -49
  30. package/dashboard/data/timeline_reader.py +35 -49
  31. package/dashboard/main.py +24 -2
  32. package/dashboard/mechanism_reader.py +4 -5
  33. package/dashboard/mechanisms.py +2 -2
  34. package/dashboard/pytest.ini +3 -0
  35. package/dashboard/requirements.txt +5 -0
  36. package/dashboard/status/cache_fp_probe.py +40 -0
  37. package/dashboard/status/status_cache.py +199 -72
  38. package/dashboard/status/status_calculator.py +50 -30
  39. package/dashboard/tests/conftest.py +84 -0
  40. package/dashboard/tests/test_api_contracts.py +372 -0
  41. package/dashboard/tests/test_bench_fortify.py +176 -0
  42. package/dashboard/tests/test_fortify.py +741 -0
  43. package/dashboard/utils/__init__.py +1 -0
  44. package/dashboard/utils/data_repair.py +210 -0
  45. package/dashboard/watchers/file_watcher.py +367 -77
  46. package/openclaw.plugin.json +1 -1
  47. package/package.json +1 -1
  48. package/dashboard/agents.py +0 -74
  49. package/dashboard/collaboration.py +0 -407
  50. package/dashboard/errors.py +0 -63
  51. package/dashboard/performance.py +0 -474
  52. package/dashboard/session_reader.py +0 -240
  53. package/dashboard/status_calculator.py +0 -121
  54. package/dashboard/subagent_reader.py +0 -232
@@ -9,6 +9,7 @@ from pydantic import BaseModel
9
9
  from typing import Optional
10
10
 
11
11
  # 导入版本信息读取器
12
+ from core.error_handler import record_error
12
13
  from data.version_info_reader import get_version_reader
13
14
 
14
15
  logger = logging.getLogger(__name__)
@@ -38,6 +39,7 @@ async def get_version_info() -> VersionInfo:
38
39
  version_data = reader.read_version_info()
39
40
  return VersionInfo(**version_data)
40
41
  except Exception as e:
42
+ record_error("unknown", str(e), "api:version", exc=e)
41
43
  logger.exception("get_version_info 异常,返回降级数据: %s", e)
42
44
  return VersionInfo(
43
45
  version="unknown",
@@ -11,6 +11,8 @@ from pathlib import Path
11
11
 
12
12
  sys.path.append(str(Path(__file__).parent.parent))
13
13
 
14
+ from core.error_handler import record_error
15
+
14
16
  router = APIRouter()
15
17
 
16
18
  # 活跃的 WebSocket 连接
@@ -33,7 +35,7 @@ async def _periodic_broadcast_loop():
33
35
  if changed_agents:
34
36
  await broadcast_state_update(changed_agents)
35
37
  except Exception as e:
36
- print(f"[WebSocket] 周期性推送失败: {e}")
38
+ record_error("unknown", str(e), "websocket:periodic_broadcast", exc=e)
37
39
 
38
40
 
39
41
  def _ensure_broadcast_task():
@@ -110,26 +112,26 @@ async def send_initial_state(websocket: WebSocket):
110
112
  collab = await get_collaboration()
111
113
  data['collaboration'] = collab.model_dump() if hasattr(collab, "model_dump") else collab
112
114
  except Exception as e:
113
- print(f"[WebSocket] collaboration 获取失败: {e}")
115
+ record_error("unknown", str(e), "websocket:initial_collaboration", exc=e)
114
116
  try:
115
117
  tasks_result = await get_tasks()
116
118
  data['tasks'] = tasks_result.get("tasks", []) if isinstance(tasks_result, dict) else []
117
119
  except Exception as e:
118
- print(f"[WebSocket] tasks 获取失败: {e}")
120
+ record_error("unknown", str(e), "websocket:initial_tasks", exc=e)
119
121
  try:
120
122
  from .performance import get_real_stats
121
123
  data['performance'] = await get_real_stats()
122
124
  except Exception as e:
123
- print(f"[WebSocket] performance 获取失败: {e}")
125
+ record_error("unknown", str(e), "websocket:initial_performance", exc=e)
124
126
  try:
125
127
  from .workflow import list_workflows
126
128
  data['workflows'] = await list_workflows()
127
129
  except Exception as e:
128
- print(f"[WebSocket] workflows 获取失败: {e}")
130
+ record_error("unknown", str(e), "websocket:initial_workflows", exc=e)
129
131
 
130
132
  await websocket.send_json({'type': 'full_state', 'data': data})
131
133
  except Exception as e:
132
- print(f"发送初始状态失败: {e}")
134
+ record_error("unknown", str(e), "websocket:send_initial_state", exc=e)
133
135
 
134
136
 
135
137
  async def broadcast_agent_update(agent_id: str, status: str):
@@ -250,7 +252,7 @@ async def broadcast_full_state():
250
252
  },
251
253
  })
252
254
  except Exception as e:
253
- print(f"[WebSocket] broadcast_full_state 失败: {e}")
255
+ record_error("unknown", str(e), "websocket:broadcast_full_state", exc=e)
254
256
 
255
257
 
256
258
  async def broadcast_state_update(changed_agents: List[Dict[str, Any]]) -> None:
@@ -0,0 +1 @@
1
+ """Core fortify modules (error handling, config, schemas)."""
@@ -0,0 +1,112 @@
1
+ """
2
+ TECHDEBT_FORTIFY: centralized environment configuration.
3
+
4
+ OPENCLAW_CACHE_MAX_SIZE = max cache memory in MB (PRD).
5
+ OPENCLAW_CACHE_MAX_ENTRIES = max number of cache entries (distinct from memory cap).
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import os
10
+ from dataclasses import dataclass
11
+ from functools import lru_cache
12
+
13
+
14
def _env_int(key: str, default: int, min_v: int | None = None, max_v: int | None = None) -> int:
    """Read an integer environment variable, clamped to an optional range.

    Args:
        key: Name of the environment variable.
        default: Value used when the variable is unset, empty, or not an integer.
        min_v: Optional lower bound applied to the result (including the default).
        max_v: Optional upper bound applied to the result (including the default).

    Returns:
        The parsed (or default) value after clamping.
    """
    text = os.environ.get(key)
    value = default
    if text:  # unset and empty both keep the default
        try:
            value = int(text)
        except ValueError:
            value = default
    # Clamp after parsing so that an out-of-range default is bounded as well.
    if min_v is not None and value < min_v:
        value = min_v
    if max_v is not None and value > max_v:
        value = max_v
    return value
28
+
29
+
30
def _env_float(key: str, default: float) -> float:
    """Read a finite float from an environment variable.

    Args:
        key: Name of the environment variable.
        default: Value returned when the variable is unset, empty, not a
            number, or not finite.

    Returns:
        The parsed value, or ``default``.
    """
    import math

    raw = os.environ.get(key)
    if not raw:
        return default
    try:
        value = float(raw)
    except ValueError:
        return default
    # float() happily parses "nan" and "inf"; neither is a usable delay or
    # interval, so treat them like any other malformed value.
    return value if math.isfinite(value) else default
38
+
39
+
40
def _env_bool(key: str, default: bool) -> bool:
    """Read a boolean flag from an environment variable.

    The values "1", "true", "yes" and "on" (case-insensitive) are true; any
    other non-empty value is false. Unset or empty yields ``default``.
    """
    text = os.environ.get(key)
    if not text:
        return default
    truthy = ("1", "true", "yes", "on")
    return text.lower() in truthy
45
+
46
+
47
def _env_str(key: str, default: str) -> str:
    """Return the environment variable ``key``, or ``default`` if unset or empty."""
    # Environment values are always strings, so the only falsy cases are
    # "missing" (None) and the empty string.
    return os.environ.get(key) or default
52
+
53
+
54
@dataclass(frozen=True)
class FortifyConfig:
    """Immutable snapshot of the fortify settings.

    Instances are built by ``get_fortify_config`` from ``OPENCLAW_*``
    environment variables. Field order is part of the positional
    constructor signature and must not change.
    """

    # --- cache ---
    cache_ttl_seconds: int
    cache_max_entries: int  # number of cache entries, distinct from the memory cap
    cache_max_memory_mb: int  # cache memory cap in MB
    cache_preload: bool
    cache_double_check: bool
    cache_fp_probe_interval_sec: float

    # --- retry / fallback / error reporting ---
    max_retry: int
    retry_base_delay: float  # base of the exponential backoff, in seconds
    retry_budget_per_minute: int  # 0 disables the retry budget
    enable_fallback: bool
    fallback_cache_on_io: bool
    error_log_level: str  # upper-cased logging level name
    sanitize_api_errors: bool

    # --- JSON parsing / repair ---
    json_strict: bool
    auto_repair_json: bool
    auto_repair_write_back: bool
    repair_backup_path: str | None
    max_repair_attempts: int

    # --- file watcher ---
    watcher_max_retries: int
    watcher_poll_interval_sec: float
    watcher_failure_window_sec: float
80
+
81
+
82
@lru_cache(maxsize=1)
def get_fortify_config() -> FortifyConfig:
    """Build the fortify configuration from ``OPENCLAW_*`` environment variables.

    The result is memoized; call ``refresh_fortify_config_cache`` to re-read
    the environment. Integer settings are clamped to the ranges given below,
    and malformed values fall back to the listed defaults.

    Returns:
        The process-wide ``FortifyConfig`` snapshot.
    """
    # The backoff delay is passed to time.sleep(), which raises ValueError on a
    # negative argument; a negative setting therefore means "no delay".
    retry_base_delay = max(0.0, _env_float("OPENCLAW_RETRY_BASE_DELAY", 1.0))
    repair_backup = os.environ.get("OPENCLAW_REPAIR_BACKUP") or None

    return FortifyConfig(
        # cache
        cache_ttl_seconds=_env_int("OPENCLAW_CACHE_TTL", 1, min_v=1, max_v=60),
        cache_max_entries=_env_int("OPENCLAW_CACHE_MAX_ENTRIES", 100, min_v=1, max_v=10_000),
        cache_max_memory_mb=_env_int("OPENCLAW_CACHE_MAX_SIZE", 100, min_v=1, max_v=4096),
        cache_preload=_env_bool("OPENCLAW_CACHE_PRELOAD", True),
        cache_double_check=_env_bool("OPENCLAW_CACHE_DOUBLE_CHECK", True),
        cache_fp_probe_interval_sec=_env_float("OPENCLAW_CACHE_FP_PROBE_INTERVAL", 0.0),
        # retry / fallback / error reporting
        max_retry=_env_int("OPENCLAW_MAX_RETRY", 3, min_v=0, max_v=20),
        retry_base_delay=retry_base_delay,
        retry_budget_per_minute=_env_int("OPENCLAW_RETRY_BUDGET_PER_MINUTE", 300, min_v=0, max_v=100_000),
        enable_fallback=_env_bool("OPENCLAW_ENABLE_FALLBACK", True),
        fallback_cache_on_io=_env_bool("OPENCLAW_FALLBACK_CACHE_ON_IO", True),
        error_log_level=_env_str("OPENCLAW_ERROR_LOG_LEVEL", "INFO").upper(),
        sanitize_api_errors=_env_bool("OPENCLAW_API_ERROR_SANITIZE", True),
        # JSON parsing / repair
        json_strict=_env_bool("OPENCLAW_JSON_STRICT", True),
        auto_repair_json=_env_bool("OPENCLAW_AUTO_REPAIR_JSON", True),
        auto_repair_write_back=_env_bool("OPENCLAW_AUTO_REPAIR_WB", False),
        repair_backup_path=repair_backup,
        max_repair_attempts=_env_int("OPENCLAW_MAX_REPAIR_ATTEMPTS", 3, min_v=1, max_v=10),
        # file watcher
        watcher_max_retries=_env_int("OPENCLAW_WATCHER_MAX_RETRIES", 3, min_v=1, max_v=10),
        watcher_poll_interval_sec=_env_float("OPENCLAW_WATCHER_POLL_INTERVAL", 5.0),
        watcher_failure_window_sec=_env_float("OPENCLAW_WATCHER_FAILURE_WINDOW", 30.0),
    )
108
+
109
+
110
def refresh_fortify_config_cache() -> FortifyConfig:
    """Drop the memoized configuration and rebuild it from the current environment."""
    loader = get_fortify_config
    loader.cache_clear()
    return loader()
@@ -0,0 +1,339 @@
1
+ """
2
+ Unified error handling: classification, exponential backoff retry, in-process stats, structured logging.
3
+
4
+ 降级策略集中注册见 core.fallback_manager(REQ_003-SPEC-04);IO 失败读缓存见 status_cache.get_stale_fallback(REQ_003-AC-003)。
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import logging
9
+ import threading
10
+ import time
11
+ import functools
12
+ from collections import defaultdict, deque
13
+ from dataclasses import dataclass, field
14
+ from datetime import datetime, timezone
15
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Type
16
+
17
+ from core.config_fortify import get_fortify_config
18
+
19
+ _LOG = logging.getLogger("openclaw.fortify")
20
+
21
+
22
def _ensure_fortify_logging() -> None:
    """Configure the ``openclaw.fortify`` logger once per process.

    Sets the logger level from ``FortifyConfig.error_log_level`` and attaches
    a stream handler if the logger has none. Later calls are no-ops.
    """
    if getattr(_ensure_fortify_logging, "_done", False):
        return
    cfg = get_fortify_config()
    level = getattr(logging, cfg.error_log_level, logging.INFO)
    # The lookup can hit a non-level attribute of the logging module (for
    # example BASIC_FORMAT); setLevel would then raise on every error report.
    if not isinstance(level, int) or isinstance(level, bool):
        level = logging.INFO
    _LOG.setLevel(level)
    if not _LOG.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter(
                fmt="%(asctime)s | %(levelname)s | fortify | %(message)s",
                datefmt="%Y-%m-%dT%H:%M:%S%z",
            )
        )
        _LOG.addHandler(handler)
    # Function attribute used as the "already configured" marker.
    _ensure_fortify_logging._done = True  # type: ignore[attr-defined]
38
+
39
+
40
@dataclass
class ErrorHandlerStats:
    """In-process aggregate of recorded errors."""

    # Number of errors recorded so far.
    total_count: int = 0
    # Error counts keyed by error type.
    by_type: Dict[str, int] = field(default_factory=dict)
    # Error counts keyed by affected scope (only non-empty scopes are counted).
    by_scope: Dict[str, int] = field(default_factory=dict)
    # Rows of the form {"hour": ..., "count": ...}.
    hourly_trend: List[Dict[str, Any]] = field(default_factory=list)
    # Summary of the most recent error, or None if nothing was recorded yet.
    last_error: Optional[Dict[str, Any]] = None
    # ISO-8601 timestamp of the most recent update.
    last_update_iso: Optional[str] = None
48
+
49
+
50
# Global error statistics; _stats_lock guards every mutation made by record_error.
_stats_lock = threading.Lock()
_stats = ErrorHandlerStats()
# Per-operation counters incremented by record_retry.
_retry_totals: Dict[str, int] = defaultdict(int)

# Sliding-window retry budget: one deque of time.monotonic() stamps per operation.
_retry_budget_lock = threading.Lock()
_retry_budget_deques: Dict[str, deque] = {}
# How many times a retry was refused because the budget was exhausted.
_retry_budget_blocks = 0
57
+
58
+
59
def _consume_retry_budget(operation: str) -> bool:
    """Try to take one retry slot for ``operation`` from its 60-second window.

    Caps the number of backoff retries per operation inside a sliding 60s
    window to dampen retry storms (RISK-005). Setting
    OPENCLAW_RETRY_BUDGET_PER_MINUTE=0 disables the limit.

    Returns:
        True if the retry may proceed, False if the budget is exhausted.
    """
    global _retry_budget_blocks
    per_minute_cap = get_fortify_config().retry_budget_per_minute
    if per_minute_cap <= 0:
        return True
    # Blank or missing operation names share the "default" bucket.
    bucket = (operation or "default").strip() or "default"
    now = time.monotonic()
    with _retry_budget_lock:
        window = _retry_budget_deques.setdefault(bucket, deque())
        # Discard stamps that have aged out of the window.
        while window and now - window[0] > 60.0:
            window.popleft()
        if len(window) < per_minute_cap:
            window.append(now)
            return True
        _retry_budget_blocks += 1
        return False
80
+
81
+
82
def classify_exception(exc: BaseException) -> str:
    """Map an exception to a PRD-style error category.

    Args:
        exc: The exception instance to classify.

    Returns:
        One of "timeout", "permission-error", "io-error", "network",
        "parsing-error", "compute-error", "validation-error" or "unknown".
    """
    import json as _json

    try:
        import ssl as _ssl
    except ImportError:  # interpreter built without SSL support
        _ssl = None

    # Order matters: several of these types are subclasses of OSError or
    # ValueError, so the specific checks must precede the generic ones.
    if isinstance(exc, TimeoutError):
        return "timeout"
    if isinstance(exc, PermissionError):
        return "permission-error"
    if isinstance(exc, FileNotFoundError):
        return "io-error"
    # ssl.SSLError derives from OSError, so it has to be tested before the
    # generic OSError branch or it can never be reached.
    if _ssl is not None and isinstance(exc, _ssl.SSLError):
        return "network"
    # Every ConnectionError subclass is a network failure regardless of the
    # wording (or absence) of its message.
    if isinstance(exc, ConnectionError):
        return "network"
    if isinstance(exc, OSError):
        msg = str(exc).lower()
        if "network" in msg or "connection" in msg or "broken pipe" in msg:
            return "network"
        return "io-error"
    if isinstance(exc, (_json.JSONDecodeError, UnicodeDecodeError)):
        return "parsing-error"
    if isinstance(exc, (MemoryError, RecursionError)):
        return "compute-error"
    if isinstance(exc, (KeyError, TypeError, AttributeError, ValueError)):
        return "validation-error"
    return "unknown"
119
+
120
+
121
class ErrorHandler:
    """Per-use-case handler; global stats still aggregate via record_error."""

    def __init__(
        self,
        max_retry: Optional[int] = None,
        base_delay: Optional[float] = None,
        enable_fallback: Optional[bool] = None,
    ):
        """Create a handler; any argument left as None is taken from the fortify config."""
        _ensure_fortify_logging()
        defaults = get_fortify_config()
        self.max_retry = max_retry if max_retry is not None else defaults.max_retry
        self.base_delay = base_delay if base_delay is not None else defaults.retry_base_delay
        self.enable_fallback = (
            enable_fallback if enable_fallback is not None else defaults.enable_fallback
        )

    def log_error(
        self,
        error_type: str,
        error_detail: str,
        affected_scope: str = "",
        exc: Optional[BaseException] = None,
    ) -> None:
        """Forward one error to the module-level ``record_error``."""
        record_error(error_type, error_detail, affected_scope, exc)

    def get_stats(self) -> Dict[str, Any]:
        """Return the module-level aggregated error statistics."""
        return get_framework_error_stats()

    def run_with_retry(
        self,
        fn: Callable[[], Any],
        *,
        operation: str = "operation",
        error_type: str = "unknown",
        fallback: Optional[Callable[[], Any]] = None,
        retryable: Optional[Tuple[Type[BaseException], ...]] = None,
    ) -> Any:
        """Call ``fn`` with exponential-backoff retries.

        Args:
            fn: Zero-argument callable to run.
            operation: Label used for logging, stats scope and the retry budget.
            error_type: Error type recorded when every attempt fails.
            fallback: Called for the result when retries are exhausted or the
                retry budget refuses another attempt, provided fallback is enabled.
            retryable: Exception types that trigger a retry; defaults to the
                OSError family. Other exceptions propagate immediately.

        Returns:
            The result of ``fn``, or of ``fallback`` after failure.

        Raises:
            The last retryable exception when no fallback applies.
        """
        catchable = (
            (OSError, IOError, TimeoutError, ConnectionError) if retryable is None else retryable
        )
        total_tries = max(1, self.max_retry + 1)
        failure: Optional[BaseException] = None

        def _fallback_or_raise() -> Any:
            # Shared exit for "budget refused" and "attempts exhausted".
            if self.enable_fallback and fallback is not None:
                return fallback()
            if failure:
                raise failure
            raise RuntimeError(operation)

        for try_index in range(total_tries):
            try:
                return fn()
            except catchable as err:
                failure = err
                if try_index + 1 >= total_tries:
                    break
                if not _consume_retry_budget(operation):
                    record_error(
                        "compute-error",
                        f"retry budget exceeded (60s window) op={operation}",
                        f"retry_budget:{operation}",
                    )
                    return _fallback_or_raise()
                pause = self.base_delay * (2**try_index)
                _LOG.warning(
                    "retry operation=%s attempt=%s/%s delay=%.2fs err=%s",
                    operation,
                    try_index + 1,
                    total_tries,
                    pause,
                    err,
                )
                time.sleep(pause)

        # Every attempt failed with a retryable error.
        record_error(error_type, str(failure) if failure else "failed", operation, failure)
        return _fallback_or_raise()
195
+
196
+
197
def record_error(
    error_type: str,
    error_detail: str,
    affected_scope: str = "",
    exc: Optional[BaseException] = None,
) -> None:
    """Log one error and fold it into the in-process statistics.

    Args:
        error_type: Error category. When ``exc`` is given and this is "" or
            "unknown", the category is derived with ``classify_exception``.
        error_detail: Free-text detail; truncated to 2000 characters.
        affected_scope: Scope label; empty scopes are not counted per scope.
        exc: The exception being reported, if any.
    """
    _ensure_fortify_logging()
    if exc is not None and error_type in ("", "unknown"):
        error_type = classify_exception(exc)
    detail = (error_detail or "")[:2000]
    scope = affected_scope or ""
    exc_type_name = type(exc).__name__ if exc is not None else ""
    exc_module = type(exc).__module__ if exc is not None else ""
    # Pass the exception's own triple: exc_info=True would read sys.exc_info(),
    # which is empty whenever this is called outside an ``except`` block.
    exc_info = (type(exc), exc, exc.__traceback__) if exc is not None else None
    _LOG.error(
        "fortify_event error_type=%s scope=%s exc_type=%s exc_module=%s detail=%s",
        error_type,
        scope,
        exc_type_name,
        exc_module,
        detail,
        exc_info=exc_info,
    )
    now = datetime.now(timezone.utc)
    hour_key = now.strftime("%Y-%m-%d %H:00")
    with _stats_lock:
        _stats.total_count += 1
        _stats.by_type[error_type] = _stats.by_type.get(error_type, 0) + 1
        if scope:
            _stats.by_scope[scope] = _stats.by_scope.get(scope, 0) + 1
        _stats.last_error = {
            "type": error_type,
            "detail": detail,
            "scope": scope,
            "time": now.isoformat(),
            "exc_type": exc_type_name or None,
            "exc_module": exc_module or None,
        }
        _stats.last_update_iso = now.isoformat()
        # Rolling hourly buckets: bump the bucket for this hour, or append a
        # new one; only the 24 most recent buckets are kept.
        for row in _stats.hourly_trend:
            if row.get("hour") == hour_key:
                row["count"] = row.get("count", 0) + 1
                break
        else:
            _stats.hourly_trend.append({"hour": hour_key, "count": 1})
        if len(_stats.hourly_trend) > 24:
            _stats.hourly_trend = _stats.hourly_trend[-24:]
246
+
247
+
248
def record_retry(operation: str) -> None:
    """Increment the retry counter for ``operation``."""
    # get_framework_error_stats copies _retry_totals while holding
    # _stats_lock, so mutate it under the same lock.
    with _stats_lock:
        _retry_totals[operation] += 1
250
+
251
+
252
def get_framework_error_stats_for_client() -> Dict[str, Any]:
    """Return the error statistics for HTTP responses, with ``last_error`` redacted when sanitizing is enabled."""
    # Imported lazily, as in the rest of this module's cross-module helpers.
    from core.safe_api_error import redact_framework_stats_for_client as _redact

    raw_stats = get_framework_error_stats()
    return _redact(raw_stats)
257
+
258
+
259
def get_framework_error_stats() -> Dict[str, Any]:
    """Return a snapshot of the aggregated error statistics.

    The snapshot is assembled while holding ``_stats_lock``. Each ``by_type``
    entry carries a display label and colour; ``by_agent`` is keyed by the
    scope with any ``agent_id:`` prefix removed.
    """
    # Display label and colour per error type; unknown types fall back to
    # their own name and the grey used for "unknown".
    display = {
        "network": ("网络错误", "#f59e0b"),
        "timeout": ("超时错误", "#ef4444"),
        "parsing": ("解析错误", "#8b5cf6"),
        "parsing-error": ("解析错误", "#8b5cf6"),
        "io-error": ("IO 错误", "#64748b"),
        "permission-error": ("权限错误", "#b45309"),
        "compute-error": ("计算错误", "#dc2626"),
        "validation-error": ("校验错误", "#ca8a04"),
        "unknown": ("未知错误", "#6b7280"),
    }
    with _stats_lock:
        by_type_out: Dict[str, Any] = {}
        for kind, hits in _stats.by_type.items():
            label, color = display.get(kind, (kind, "#6b7280"))
            by_type_out[kind] = {"count": hits, "label": label, "color": color}

        by_agent: Dict[str, Any] = {}
        for scope, hits in _stats.by_scope.items():
            agent = scope.split(":", 1)[-1] if scope.startswith("agent_id:") else scope
            by_agent[agent] = {"count": hits, "agentId": agent}

        sum_by_type = sum(entry["count"] for entry in by_type_out.values())

        # The 50 busiest scopes, highest count first.
        ranked = sorted(_stats.by_scope.items(), key=lambda kv: -kv[1])
        top_scopes = [{"scope": name, "count": hits} for name, hits in ranked[:50]]

        return {
            "total_count": _stats.total_count,
            "by_type": by_type_out,
            "by_agent": by_agent,
            "by_scope_top": top_scopes,
            "sum_by_type": sum_by_type,
            "totals_consistent": sum_by_type == _stats.total_count,
            "hourly_trend": list(_stats.hourly_trend),
            "last_update": _stats.last_update_iso,
            "last_error": _stats.last_error,
            "retry_by_operation": dict(_retry_totals),
            "retry_budget_blocks": _retry_budget_blocks,
        }
301
+
302
+
303
def execute_with_retry(
    max_attempts: Optional[int] = 3,
    delay_base: Optional[float] = 1.0,
    exceptions: Tuple[Type[BaseException], ...] = (Exception,),
) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
    """Decorator factory that retries the wrapped callable with exponential backoff.

    Args:
        max_attempts: Total number of calls allowed; ``None`` means the
            configured ``max_retry + 1``. Values below 1 are treated as 1 so
            the wrapped function always runs at least once.
        delay_base: Base delay in seconds (doubled per attempt); ``None``
            means the configured ``retry_base_delay``.
        exceptions: Exception types that trigger a retry.

    Returns:
        A decorator. The wrapped function returns the first successful result
        and re-raises the last caught exception once attempts or the retry
        budget are exhausted.
    """

    def deco(fn: Callable[..., Any]) -> Callable[..., Any]:
        # Not every callable has __name__ (functools.partial, for instance).
        op_name = getattr(fn, "__name__", None) or "operation"

        @functools.wraps(fn)
        def wrapped(*args: Any, **kwargs: Any) -> Any:
            attempts = max_attempts
            base = delay_base
            if attempts is None or base is None:
                # Only consult the config when a default was left open.
                cfg = get_fortify_config()
                if attempts is None:
                    attempts = cfg.max_retry + 1
                if base is None:
                    base = cfg.retry_base_delay
            # A non-positive count would otherwise skip the call entirely.
            attempts = max(1, attempts)
            for attempt in range(attempts):
                try:
                    return fn(*args, **kwargs)
                except exceptions:
                    record_retry(op_name)
                    if attempt + 1 >= attempts:
                        raise
                    if not _consume_retry_budget(op_name):
                        record_error(
                            "compute-error",
                            f"retry budget exceeded (60s window) op={op_name}",
                            f"retry_budget:{op_name}",
                        )
                        raise
                    time.sleep(base * (2**attempt))
            # Unreachable: the loop body always returns or raises.
            raise RuntimeError(op_name)

        return wrapped

    return deco
@@ -0,0 +1,70 @@
1
+ """
2
+ REQ_003-SPEC-04:按错误类型注册集中降级策略;供状态计算、列表聚合等路径调用。
3
+
4
+ REQ_003-AC-003:网络/IO 等错误在重试仍失败或未覆盖时,可读 StatusCache 中的最近状态(见 status_cache.get_stale_fallback)。
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import threading
9
+ from typing import Any, Callable, Dict, Optional
10
+
11
# A fallback handler; called as handler(agent_id=..., **kwargs).
_Handler = Callable[..., Any]

# Registry of fallback handlers keyed by error category; guarded by _lock.
_lock = threading.Lock()
_handlers: Dict[str, _Handler] = {}
# Set once the built-in handlers have been installed.
_defaults_registered = False
16
+
17
+
18
def register_fallback(error_category: str, handler: _Handler) -> None:
    """Register the fallback for one ``classify_exception`` category.

    The handler is invoked as ``handler(agent_id=..., **kwargs)``; a later
    registration for the same category replaces the earlier one.
    """
    with _lock:
        _handlers.update({error_category: handler})
22
+
23
+
24
def run_fallback(error_category: str, *, agent_id: Optional[str] = None, **kwargs: Any) -> Any:
    """Run the fallback registered for ``error_category``; return None when there is none."""
    _ensure_default_fallbacks()
    # Only the lookup happens under the lock; the handler runs outside it.
    with _lock:
        handler = _handlers.get(error_category)
    return None if handler is None else handler(agent_id=agent_id, **kwargs)
32
+
33
+
34
def _stale_agent_status_handler(agent_id: Optional[str] = None, **_: Any) -> Optional[str]:
    """Return the agent's last cached status, if fallback-to-cache is enabled.

    Returns:
        "idle", "working" or "down" taken from the status cache's stale
        entry, or None when no agent is given, the feature is disabled, or no
        usable entry exists.
    """
    if agent_id:
        # Project imports are deferred until they are actually needed.
        from core.config_fortify import get_fortify_config as _load_config

        if _load_config().fallback_cache_on_io:
            from status.status_cache import get_cache as _load_cache

            cached = _load_cache().get_stale_fallback(agent_id)
            status = cached.get("status") if cached else None
            if status in ("idle", "working", "down"):
                return str(status)
    return None
50
+
51
+
52
def _ensure_default_fallbacks() -> None:
    """Install the built-in stale-status fallback for the default categories, once."""
    global _defaults_registered
    if not _defaults_registered:
        with _lock:
            # Re-check under the lock: another thread may have finished first.
            if not _defaults_registered:
                for category in ("network", "io-error", "timeout", "permission-error"):
                    # Handlers registered explicitly are left untouched.
                    _handlers.setdefault(category, _stale_agent_status_handler)
                _defaults_registered = True
63
+
64
+
65
def reset_fallback_handlers_for_tests() -> None:
    """Test isolation helper: empty the registry and allow the defaults to be installed again."""
    global _defaults_registered
    with _lock:
        # Cleared in place, so existing references to the registry stay valid.
        _handlers.clear()
        _defaults_registered = False
@@ -0,0 +1,76 @@
1
+ """
2
+ 面向浏览器/API 客户端的错误文案脱敏(NFR-S-001)。
3
+ Server-side logs are still written by record_error, which keeps the raw (unsanitized) detail but truncates it to 2000 characters.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import re
8
+ from typing import Any, Dict
9
+
10
+
11
def sanitize_client_error_text(raw: str, max_len: int = 1200) -> str:
    """Redact secrets, paths and e-mail addresses from client-facing error text.

    Args:
        raw: The original error text.
        max_len: Maximum length of the returned text; longer results are cut
            and suffixed with an ellipsis.

    Returns:
        "internal error" for empty input, a fixed placeholder when the text
        contains a Python traceback, otherwise the single-line redacted text.
    """
    if not raw:
        return "internal error"
    # Tracebacks are replaced wholesale. Frame lines are indented, so accept
    # any amount of whitespace before ``File "``.
    if "Traceback (most recent call last)" in raw or re.search(r'\n\s+File "', raw):
        return "Internal error (details redacted; see server logs)"

    s = raw.replace("\r", " ").replace("\n", " ")
    # Bound the amount of text the patterns below have to scan.
    if len(s) > max_len * 2:
        s = s[: max_len * 2]

    redactions = (
        # API keys of the form sk-...; the body may contain "-" and "_"
        # (e.g. sk-proj-..., sk-ant-...), not only alphanumerics.
        (r"\bsk-[a-zA-Z0-9_-]{12,}", "sk-[REDACTED]", re.I),
        (r"\bxox[baprs]-[a-zA-Z0-9-]{10,}\b", "[slack-token]", 0),
        (r"Bearer\s+[a-zA-Z0-9._=+\/-]{12,}", "Bearer [REDACTED]", re.I),
        (r"\bAKIA[0-9A-Z]{16}\b", "AKIA[REDACTED]", 0),
        (r"(?i)password\s*[=:]\s*[^\s,}\"]{2,}", "password=[REDACTED]", 0),
        (r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", "[email]", 0),
        # Home-directory paths, Windows drive paths, then .openclaw paths.
        (r"(?:/home/|/Users/)[^\s:]{1,80}/[^\s:]{0,200}", "[path]", 0),
        (r"[A-Za-z]:\\(?:[^\\\s]+\\){0,8}[^\s\\]{0,120}", "[path]", 0),
        (r"/[^\s:]{0,16}\.openclaw(?:/[^\s:]{0,160})?", "[openclaw-path]", 0),
    )
    for pattern, replacement, flags in redactions:
        s = re.sub(pattern, replacement, s, flags=flags)

    if len(s) > max_len:
        s = s[:max_len] + "…"
    return s
42
+
43
+
44
def safe_client_string(message: str) -> str:
    """Sanitize a string field (such as ``error``) destined for a JSON response body.

    With sanitizing disabled in the fortify config, the text is only cut to
    4000 characters.
    """
    from core.config_fortify import get_fortify_config

    text = message if message else ""
    if get_fortify_config().sanitize_api_errors:
        return sanitize_client_error_text(text)
    return text[:4000]
52
+
53
+
54
def safe_api_error_detail(exc: BaseException) -> str:
    """Build the ``detail`` string returned to clients for HTTP 500-style errors.

    Uses the exception message, or the exception class name when the message
    is blank. With sanitizing disabled, the text is only cut to 4000 characters.
    """
    from core.config_fortify import get_fortify_config

    text = str(exc).strip()
    if not text:
        text = type(exc).__name__
    if get_fortify_config().sanitize_api_errors:
        return sanitize_client_error_text(text)
    return text[:4000]
62
+
63
+
64
def redact_framework_stats_for_client(data: Dict[str, Any]) -> Dict[str, Any]:
    """Sanitize ``last_error.detail`` in a stats payload (e.g. for /api/errors/stats).

    Returns ``data`` itself when sanitizing is disabled; otherwise a shallow
    copy whose ``last_error`` (if it is a dict with a non-empty detail) is
    replaced by a redacted copy. The input is never mutated.
    """
    from core.config_fortify import get_fortify_config

    if not get_fortify_config().sanitize_api_errors:
        return data
    redacted = {**data}
    last = redacted.get("last_error")
    if isinstance(last, dict) and last.get("detail"):
        redacted["last_error"] = {
            **last,
            "detail": sanitize_client_error_text(str(last["detail"])),
        }
    return redacted