openclaw-agent-dashboard 1.0.39 → 1.0.41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. package/dashboard/api/agent_config_api.py +28 -7
  2. package/dashboard/api/agents.py +48 -10
  3. package/dashboard/api/agents_config.py +5 -1
  4. package/dashboard/api/chains.py +25 -5
  5. package/dashboard/api/collaboration.py +10 -9
  6. package/dashboard/api/debug_paths.py +5 -1
  7. package/dashboard/api/error_analysis.py +29 -11
  8. package/dashboard/api/errors.py +37 -11
  9. package/dashboard/api/fortify_routes.py +108 -0
  10. package/dashboard/api/input_safety.py +60 -0
  11. package/dashboard/api/performance.py +73 -53
  12. package/dashboard/api/subagents.py +95 -99
  13. package/dashboard/api/timeline.py +24 -3
  14. package/dashboard/api/version.py +2 -0
  15. package/dashboard/api/websocket.py +9 -7
  16. package/dashboard/core/__init__.py +1 -0
  17. package/dashboard/core/config_fortify.py +125 -0
  18. package/dashboard/core/error_handler.py +488 -0
  19. package/dashboard/core/fallback_manager.py +81 -0
  20. package/dashboard/core/logging_config.py +217 -0
  21. package/dashboard/core/safe_api_error.py +76 -0
  22. package/dashboard/core/schemas/__init__.py +16 -0
  23. package/dashboard/core/schemas/base.py +43 -0
  24. package/dashboard/core/schemas/session_schema.py +40 -0
  25. package/dashboard/core/schemas/subagent_schema.py +23 -0
  26. package/dashboard/data/agent_config_manager.py +6 -4
  27. package/dashboard/data/chain_reader.py +16 -12
  28. package/dashboard/data/error_analyzer.py +15 -11
  29. package/dashboard/data/session_reader.py +268 -46
  30. package/dashboard/data/subagent_reader.py +74 -49
  31. package/dashboard/data/timeline_reader.py +35 -49
  32. package/dashboard/main.py +24 -2
  33. package/dashboard/mechanism_reader.py +4 -5
  34. package/dashboard/mechanisms.py +2 -2
  35. package/dashboard/pytest.ini +3 -0
  36. package/dashboard/requirements.txt +5 -0
  37. package/dashboard/status/cache_fp_probe.py +40 -0
  38. package/dashboard/status/status_cache.py +199 -72
  39. package/dashboard/status/status_calculator.py +50 -30
  40. package/dashboard/tests/conftest.py +87 -0
  41. package/dashboard/tests/test_api_contracts.py +372 -0
  42. package/dashboard/tests/test_bench_fortify.py +176 -0
  43. package/dashboard/tests/test_fortify.py +952 -0
  44. package/dashboard/utils/__init__.py +1 -0
  45. package/dashboard/utils/data_repair.py +210 -0
  46. package/dashboard/watchers/file_watcher.py +380 -77
  47. package/frontend-dist/assets/{index-cYIOn3Wq.css → index-BIZ2xHfw.css} +1 -1
  48. package/frontend-dist/assets/{index-DyRXGevD.js → index-Cnr0b02R.js} +1 -1
  49. package/frontend-dist/index.html +2 -2
  50. package/openclaw.plugin.json +1 -1
  51. package/package.json +1 -1
  52. package/dashboard/agents.py +0 -74
  53. package/dashboard/collaboration.py +0 -407
  54. package/dashboard/errors.py +0 -63
  55. package/dashboard/performance.py +0 -474
  56. package/dashboard/session_reader.py +0 -240
  57. package/dashboard/status_calculator.py +0 -121
  58. package/dashboard/subagent_reader.py +0 -232
@@ -9,6 +9,7 @@ from pydantic import BaseModel
9
9
  from typing import Optional
10
10
 
11
11
  # 导入版本信息读取器
12
+ from core.error_handler import record_error
12
13
  from data.version_info_reader import get_version_reader
13
14
 
14
15
  logger = logging.getLogger(__name__)
@@ -38,6 +39,7 @@ async def get_version_info() -> VersionInfo:
38
39
  version_data = reader.read_version_info()
39
40
  return VersionInfo(**version_data)
40
41
  except Exception as e:
42
+ record_error("unknown", str(e), "api:version", exc=e)
41
43
  logger.exception("get_version_info 异常,返回降级数据: %s", e)
42
44
  return VersionInfo(
43
45
  version="unknown",
@@ -11,6 +11,8 @@ from pathlib import Path
11
11
 
12
12
  sys.path.append(str(Path(__file__).parent.parent))
13
13
 
14
+ from core.error_handler import record_error
15
+
14
16
  router = APIRouter()
15
17
 
16
18
  # 活跃的 WebSocket 连接
@@ -33,7 +35,7 @@ async def _periodic_broadcast_loop():
33
35
  if changed_agents:
34
36
  await broadcast_state_update(changed_agents)
35
37
  except Exception as e:
36
- print(f"[WebSocket] 周期性推送失败: {e}")
38
+ record_error("unknown", str(e), "websocket:periodic_broadcast", exc=e)
37
39
 
38
40
 
39
41
  def _ensure_broadcast_task():
@@ -110,26 +112,26 @@ async def send_initial_state(websocket: WebSocket):
110
112
  collab = await get_collaboration()
111
113
  data['collaboration'] = collab.model_dump() if hasattr(collab, "model_dump") else collab
112
114
  except Exception as e:
113
- print(f"[WebSocket] collaboration 获取失败: {e}")
115
+ record_error("unknown", str(e), "websocket:initial_collaboration", exc=e)
114
116
  try:
115
117
  tasks_result = await get_tasks()
116
118
  data['tasks'] = tasks_result.get("tasks", []) if isinstance(tasks_result, dict) else []
117
119
  except Exception as e:
118
- print(f"[WebSocket] tasks 获取失败: {e}")
120
+ record_error("unknown", str(e), "websocket:initial_tasks", exc=e)
119
121
  try:
120
122
  from .performance import get_real_stats
121
123
  data['performance'] = await get_real_stats()
122
124
  except Exception as e:
123
- print(f"[WebSocket] performance 获取失败: {e}")
125
+ record_error("unknown", str(e), "websocket:initial_performance", exc=e)
124
126
  try:
125
127
  from .workflow import list_workflows
126
128
  data['workflows'] = await list_workflows()
127
129
  except Exception as e:
128
- print(f"[WebSocket] workflows 获取失败: {e}")
130
+ record_error("unknown", str(e), "websocket:initial_workflows", exc=e)
129
131
 
130
132
  await websocket.send_json({'type': 'full_state', 'data': data})
131
133
  except Exception as e:
132
- print(f"发送初始状态失败: {e}")
134
+ record_error("unknown", str(e), "websocket:send_initial_state", exc=e)
133
135
 
134
136
 
135
137
  async def broadcast_agent_update(agent_id: str, status: str):
@@ -250,7 +252,7 @@ async def broadcast_full_state():
250
252
  },
251
253
  })
252
254
  except Exception as e:
253
- print(f"[WebSocket] broadcast_full_state 失败: {e}")
255
+ record_error("unknown", str(e), "websocket:broadcast_full_state", exc=e)
254
256
 
255
257
 
256
258
  async def broadcast_state_update(changed_agents: List[Dict[str, Any]]) -> None:
@@ -0,0 +1 @@
1
+ """Core fortify modules (error handling, config, schemas)."""
@@ -0,0 +1,125 @@
1
+ """
2
+ TECHDEBT_FORTIFY: centralized environment configuration.
3
+
4
+ OPENCLAW_CACHE_MAX_SIZE = max cache memory in MB (PRD).
5
+ OPENCLAW_CACHE_MAX_ENTRIES = max number of cache entries (distinct from memory cap).
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import os
10
+ from dataclasses import dataclass
11
+ from functools import lru_cache
12
+
13
+
14
def _env_int(key: str, default: int, min_v: int | None = None, max_v: int | None = None) -> int:
    """Read an integer setting from the environment and clamp it.

    Args:
        key: Name of the environment variable.
        default: Value used when the variable is unset, empty, or not a
            valid integer literal.
        min_v: Optional inclusive lower bound.
        max_v: Optional inclusive upper bound.

    Returns:
        The parsed (or default) value after both bounds have been applied.
        The bounds are applied to the default as well.
    """
    text = os.environ.get(key)
    try:
        value = default if text is None or text == "" else int(text)
    except ValueError:
        value = default
    if min_v is not None and value < min_v:
        value = min_v
    if max_v is not None and value > max_v:
        value = max_v
    return value
28
+
29
+
30
def _env_float(
    key: str,
    default: float,
    min_v: float | None = None,
    max_v: float | None = None,
) -> float:
    """Read a float setting from the environment.

    Args:
        key: Name of the environment variable.
        default: Value used when the variable is unset, empty, unparsable,
            or not a finite number.
        min_v: Optional inclusive lower bound (mirrors ``_env_int``).
        max_v: Optional inclusive upper bound (mirrors ``_env_int``).

    Returns:
        The parsed (or default) value, clamped to the given bounds when any
        are supplied.
    """
    import math

    raw = os.environ.get(key)
    if raw is None or raw == "":
        value = default
    else:
        try:
            value = float(raw)
        except ValueError:
            value = default
        else:
            # float() happily parses "nan" / "inf"; such values are useless as
            # delays or intervals, so treat them like any other invalid input.
            if not math.isfinite(value):
                value = default
    if min_v is not None:
        value = max(min_v, value)
    if max_v is not None:
        value = min(max_v, value)
    return value
38
+
39
+
40
def _env_bool(key: str, default: bool) -> bool:
    """Read a boolean flag from the environment.

    Args:
        key: Name of the environment variable.
        default: Value used when the variable is unset, empty, or contains
            only whitespace.

    Returns:
        True when the trimmed, case-folded value is one of ``1``, ``true``,
        ``yes`` or ``on``; False for any other non-empty value.
    """
    raw = os.environ.get(key)
    if raw is None:
        return default
    # Trim first: values copied from .env files or shells often carry stray
    # whitespace, which previously turned an intended "true" into False.
    text = raw.strip().lower()
    if not text:
        return default
    return text in ("1", "true", "yes", "on")
45
+
46
+
47
def _env_str(key: str, default: str) -> str:
    """Return the environment variable ``key``, or ``default`` when it is unset or empty."""
    # A non-empty string is always truthy, so ``or`` covers both the missing
    # and the empty-string case.
    return os.environ.get(key) or default
52
+
53
+
54
@dataclass(frozen=True)
class FortifyConfig:
    """Immutable snapshot of the fortify settings.

    Instances are produced by ``get_fortify_config()``, which fills every
    field from an ``OPENCLAW_*`` environment variable or its default.
    """

    # --- cache ---
    cache_ttl_seconds: int
    cache_max_entries: int  # number of entries, distinct from the memory cap
    cache_max_memory_mb: int  # memory cap in MB
    cache_preload: bool
    cache_double_check: bool
    cache_fp_probe_interval_sec: float

    # --- retry / fallback / error reporting ---
    max_retry: int
    retry_base_delay: float
    retry_budget_per_minute: int
    enable_fallback: bool
    fallback_cache_on_io: bool
    error_log_level: str
    sanitize_api_errors: bool

    # --- JSON parsing and repair ---
    json_strict: bool
    auto_repair_json: bool
    auto_repair_write_back: bool
    repair_backup_path: str | None
    max_repair_attempts: int

    # --- file watcher ---
    watcher_max_retries: int
    watcher_poll_interval_sec: float
    watcher_failure_window_sec: float

    # --- NFR-S-003: logging storage security ---
    log_retention_days: int
    log_max_size_mb: int
    log_backup_count: int
    log_file_path: str | None
    log_compression: bool
87
+
88
+
89
def _env_non_negative_float(key: str, default: float) -> float:
    """Read a duration-like float; negative or non-finite values fall back to ``default``."""
    import math

    value = _env_float(key, default)
    if math.isfinite(value) and value >= 0.0:
        return value
    return default


@lru_cache(maxsize=1)
def get_fortify_config() -> FortifyConfig:
    """Build the fortify configuration from ``OPENCLAW_*`` environment variables.

    The result is memoized; call ``refresh_fortify_config_cache()`` to pick up
    changes made to the environment afterwards.

    Integer settings are clamped to the bounds given below. Duration settings
    (delays, intervals, windows) must be finite and non-negative, otherwise
    the default is used: the retry helpers pass ``retry_base_delay`` to
    ``time.sleep()``, which raises on a negative value.

    Returns:
        A frozen ``FortifyConfig`` instance.
    """
    ttl = _env_int("OPENCLAW_CACHE_TTL", 1, min_v=1, max_v=60)
    return FortifyConfig(
        cache_ttl_seconds=ttl,
        cache_max_entries=_env_int("OPENCLAW_CACHE_MAX_ENTRIES", 100, min_v=1, max_v=10_000),
        cache_max_memory_mb=_env_int("OPENCLAW_CACHE_MAX_SIZE", 100, min_v=1, max_v=4096),
        cache_preload=_env_bool("OPENCLAW_CACHE_PRELOAD", True),
        cache_double_check=_env_bool("OPENCLAW_CACHE_DOUBLE_CHECK", True),
        cache_fp_probe_interval_sec=_env_non_negative_float("OPENCLAW_CACHE_FP_PROBE_INTERVAL", 0.0),
        max_retry=_env_int("OPENCLAW_MAX_RETRY", 3, min_v=0, max_v=20),
        retry_base_delay=_env_non_negative_float("OPENCLAW_RETRY_BASE_DELAY", 1.0),
        retry_budget_per_minute=_env_int("OPENCLAW_RETRY_BUDGET_PER_MINUTE", 300, min_v=0, max_v=100_000),
        enable_fallback=_env_bool("OPENCLAW_ENABLE_FALLBACK", True),
        fallback_cache_on_io=_env_bool("OPENCLAW_FALLBACK_CACHE_ON_IO", True),
        error_log_level=_env_str("OPENCLAW_ERROR_LOG_LEVEL", "INFO").upper(),
        sanitize_api_errors=_env_bool("OPENCLAW_API_ERROR_SANITIZE", True),
        json_strict=_env_bool("OPENCLAW_JSON_STRICT", True),
        auto_repair_json=_env_bool("OPENCLAW_AUTO_REPAIR_JSON", True),
        auto_repair_write_back=_env_bool("OPENCLAW_AUTO_REPAIR_WB", False),
        repair_backup_path=os.environ.get("OPENCLAW_REPAIR_BACKUP") or None,
        max_repair_attempts=_env_int("OPENCLAW_MAX_REPAIR_ATTEMPTS", 3, min_v=1, max_v=10),
        watcher_max_retries=_env_int("OPENCLAW_WATCHER_MAX_RETRIES", 3, min_v=1, max_v=10),
        watcher_poll_interval_sec=_env_non_negative_float("OPENCLAW_WATCHER_POLL_INTERVAL", 5.0),
        watcher_failure_window_sec=_env_non_negative_float("OPENCLAW_WATCHER_FAILURE_WINDOW", 30.0),
        # NFR-S-003: Logging storage security
        log_retention_days=_env_int("OPENCLAW_LOG_RETENTION_DAYS", 30, min_v=1, max_v=365),
        log_max_size_mb=_env_int("OPENCLAW_LOG_MAX_SIZE_MB", 100, min_v=1, max_v=1024),
        log_backup_count=_env_int("OPENCLAW_LOG_BACKUP_COUNT", 5, min_v=1, max_v=50),
        log_file_path=os.environ.get("OPENCLAW_LOG_FILE_PATH") or None,
        log_compression=_env_bool("OPENCLAW_LOG_COMPRESSION", True),
    )
121
+
122
+
123
def refresh_fortify_config_cache() -> FortifyConfig:
    """Discard the memoized configuration and rebuild it from the current environment.

    Returns:
        The freshly built ``FortifyConfig``.
    """
    get_fortify_config.cache_clear()
    rebuilt = get_fortify_config()
    return rebuilt
@@ -0,0 +1,488 @@
1
+ """
2
+ Unified error handling: classification, exponential backoff retry, in-process stats, structured logging.
3
+
4
+ 降级策略集中注册见 core.fallback_manager(REQ_003-SPEC-04);IO 失败读缓存见 status_cache.get_stale_fallback(REQ_003-AC-003)。
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import logging
9
+ import threading
10
+ import time
11
+ import functools
12
+ from collections import defaultdict, deque
13
+ from dataclasses import dataclass, field
14
+ from datetime import datetime, timezone
15
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Type
16
+
17
+ from core.config_fortify import get_fortify_config
18
+
19
+ _LOG = logging.getLogger("openclaw.fortify")
20
+
21
+
22
+ def _ensure_fortify_logging() -> None:
23
+ if getattr(_ensure_fortify_logging, "_done", False):
24
+ return
25
+ cfg = get_fortify_config()
26
+ level = getattr(logging, cfg.error_log_level, logging.INFO)
27
+ _LOG.setLevel(level)
28
+ if not _LOG.handlers:
29
+ # Try to use secure file-based logging if configured
30
+ try:
31
+ from core.logging_config import setup_secure_logging, get_log_file_path
32
+ log_path = get_log_file_path()
33
+ if log_path is not None:
34
+ # Secure logging is configured, skip console-only handler
35
+ # setup_secure_logging() already added file handlers
36
+ _ensure_fortify_logging._done = True # type: ignore[attr-defined]
37
+ return
38
+ except ImportError:
39
+ pass # Fall back to console handler
40
+
41
+ h = logging.StreamHandler()
42
+ h.setFormatter(
43
+ logging.Formatter(
44
+ fmt="%(asctime)s | %(levelname)s | fortify | %(message)s",
45
+ datefmt="%Y-%m-%dT%H:%M:%S%z",
46
+ )
47
+ )
48
+ _LOG.addHandler(h)
49
+ _ensure_fortify_logging._done = True # type: ignore[attr-defined]
50
+
51
+
52
@dataclass
class ErrorHandlerStats:
    """In-process accumulator for the errors reported through ``record_error``."""

    # Number of errors recorded so far.
    total_count: int = 0
    # Error count keyed by error type.
    by_type: Dict[str, int] = field(default_factory=dict)
    # Error count keyed by affected scope (only non-empty scopes are counted).
    by_scope: Dict[str, int] = field(default_factory=dict)
    # Rows of the form {"hour": "YYYY-MM-DD HH:00", "count": n}.
    hourly_trend: List[Dict[str, Any]] = field(default_factory=list)
    # Details of the most recently recorded error, if any.
    last_error: Optional[Dict[str, Any]] = None
    # ISO-8601 timestamp of the most recent update, if any.
    last_update_iso: Optional[str] = None
60
+
61
+
62
# --- Aggregated error statistics -------------------------------------------
_stats_lock = threading.Lock()  # guards _stats
_stats = ErrorHandlerStats()
_retry_totals = defaultdict(int)  # retry count per operation name

# --- Sliding-window retry budget --------------------------------------------
_retry_budget_lock = threading.Lock()  # guards the two names below
_retry_budget_deques: Dict[str, deque] = {}  # per-operation retry timestamps
_retry_budget_blocks = 0  # number of retries refused by the budget

# --- NFR-R: reliability metrics (all guarded by _reliability_lock) -----------
_reliability_lock = threading.Lock()

# Error recovery tracking
_error_recovery_times: deque = deque(maxlen=100)  # last 100 recovery times in seconds
_last_error_timestamp: Optional[float] = None
_last_recovery_timestamp: Optional[float] = None

# Graceful degradation tracking
_fallback_total_attempts = 0
_fallback_success_count = 0

# Watcher availability tracking
_watcher_uptime_start: Optional[float] = None  # start of the current uptime span
_watcher_total_uptime_seconds = 0.0
_watcher_total_downtime_seconds = 0.0
_watchdog_last_failure_time: Optional[float] = None  # start of the current outage
86
+
87
+
88
def record_fallback_attempt(success: bool) -> None:
    """Count one graceful-degradation attempt and whether it succeeded (NFR-R-005).

    Args:
        success: Truthy when the fallback produced a usable result.
    """
    global _fallback_total_attempts, _fallback_success_count
    with _reliability_lock:
        _fallback_total_attempts += 1
        _fallback_success_count += 1 if success else 0
95
+
96
+
97
def record_error_recovery(duration_seconds: float) -> None:
    """Append one recovery duration to the bounded recovery history (NFR-R-003).

    Args:
        duration_seconds: How long the recovery took, in seconds.
    """
    _reliability_lock.acquire()
    try:
        _error_recovery_times.append(duration_seconds)
    finally:
        _reliability_lock.release()
101
+
102
+
103
def record_watcher_failure() -> None:
    """Note the start of a watcher outage and close the running uptime span.

    The failure timestamp is only set when no outage is already being
    tracked. Any running uptime span is added to the uptime total and ended.
    """
    global _watchdog_last_failure_time, _watcher_uptime_start, _watcher_total_uptime_seconds
    failed_at = time.time()
    with _reliability_lock:
        if _watchdog_last_failure_time is None:
            _watchdog_last_failure_time = failed_at
        span_start = _watcher_uptime_start
        if span_start is not None:
            _watcher_total_uptime_seconds += failed_at - span_start
            _watcher_uptime_start = None
113
+
114
+
115
def record_watcher_recovery() -> None:
    """Mark the watcher as recovered and record how long the outage lasted (NFR-R-003).

    If an outage was being tracked, its duration is added to the downtime
    total and, when positive, to the recovery-time history. A new uptime span
    starts at the time of the call.
    """
    global _watchdog_last_failure_time, _watcher_uptime_start, _watcher_total_downtime_seconds
    now = time.time()
    with _reliability_lock:
        if _watchdog_last_failure_time is not None:
            recovery_time = now - _watchdog_last_failure_time
            _watcher_total_downtime_seconds += recovery_time
            _watchdog_last_failure_time = None
            # Append while still holding the lock: every other access to the
            # history is guarded by it. Appending directly (rather than via
            # record_error_recovery) avoids re-acquiring the non-reentrant lock.
            if recovery_time > 0:
                _error_recovery_times.append(recovery_time)
        _watcher_uptime_start = now
129
+
130
+
131
def get_reliability_metrics() -> Dict[str, Any]:
    """Summarize the reliability counters (NFR-R-002/003/005).

    Returns:
        A dict with watcher uptime/downtime and availability, mean and
        95th-percentile recovery time over the retained history, and the
        graceful-degradation success rate. Rates default to 1.0 while no
        data has been collected.
    """
    import statistics as _statistics

    with _reliability_lock:
        now = time.time()
        running_span = now - _watcher_uptime_start if _watcher_uptime_start is not None else 0.0

        uptime = _watcher_total_uptime_seconds + running_span
        downtime = _watcher_total_downtime_seconds
        observed = uptime + downtime

        # NFR-R-002: share of observed time the watcher was up.
        availability = uptime / observed if observed > 0 else 1.0

        # NFR-R-003: recovery time statistics.
        samples = list(_error_recovery_times)
        if samples:
            mean_recovery = _statistics.mean(samples)
            ranked = sorted(samples)
            last_index = len(ranked) - 1
            p95_recovery = ranked[min(int(len(ranked) * 0.95), last_index)]
        else:
            mean_recovery = 0.0
            p95_recovery = 0.0

        # NFR-R-005: share of fallback attempts that succeeded.
        attempts = _fallback_total_attempts
        successes = _fallback_success_count
        degradation_rate = successes / attempts if attempts > 0 else 1.0

        return {
            # NFR-R-002: Watcher availability
            "watcher_uptime_seconds": uptime,
            "watcher_downtime_seconds": downtime,
            "watcher_availability_rate": round(availability, 4),
            "watcher_uptime_percentage": round(availability * 100, 2),
            # NFR-R-003: Error recovery time
            "avg_error_recovery_seconds": round(mean_recovery, 3),
            "p95_error_recovery_seconds": round(p95_recovery, 3),
            "error_recovery_count": len(samples),
            "last_error_recovery_time": samples[-1] if samples else None,
            # NFR-R-005: Graceful degradation
            "graceful_degradation_attempts": attempts,
            "graceful_degradation_successes": successes,
            "graceful_degradation_rate": round(degradation_rate, 4),
            "graceful_degradation_percentage": round(degradation_rate * 100, 2),
        }
182
+
183
+
184
def reset_reliability_metrics_for_tests() -> None:
    """Return every reliability counter to its initial value (test helper)."""
    global _error_recovery_times, _last_error_timestamp, _last_recovery_timestamp
    global _fallback_total_attempts, _fallback_success_count
    global _watcher_uptime_start, _watcher_total_uptime_seconds, _watcher_total_downtime_seconds
    global _watchdog_last_failure_time
    with _reliability_lock:
        # The history deque is emptied in place rather than replaced.
        _error_recovery_times.clear()
        _last_error_timestamp = _last_recovery_timestamp = None
        _fallback_total_attempts = _fallback_success_count = 0
        _watcher_uptime_start = _watchdog_last_failure_time = None
        _watcher_total_uptime_seconds = 0.0
        _watcher_total_downtime_seconds = 0.0
200
+
201
+
202
def _consume_retry_budget(operation: str) -> bool:
    """Try to reserve one backoff retry for ``operation`` (RISK-005).

    Retries are limited per operation within a 60-second sliding window to
    dampen retry storms. OPENCLAW_RETRY_BUDGET_PER_MINUTE=0 disables the limit.

    Args:
        operation: Name of the operation; blank names share the "default" bucket.

    Returns:
        True when the retry may proceed, False when the budget is exhausted
        (in which case the blocked-retry counter is incremented).
    """
    global _retry_budget_blocks
    limit = get_fortify_config().retry_budget_per_minute
    if limit <= 0:
        return True

    bucket = (operation or "default").strip() or "default"
    now = time.monotonic()
    with _retry_budget_lock:
        stamps = _retry_budget_deques.setdefault(bucket, deque())
        # Drop timestamps that have left the 60 s window.
        while stamps and now - stamps[0] > 60.0:
            stamps.popleft()
        if len(stamps) < limit:
            stamps.append(now)
            return True
        _retry_budget_blocks += 1
        return False
223
+
224
+
225
def classify_exception(exc: BaseException) -> str:
    """Map an exception to a PRD-style error category.

    Checks run from the most specific to the most general exception type,
    because several of the built-ins involved are subclasses of ``OSError``
    or ``ValueError``.

    Args:
        exc: The exception to classify.

    Returns:
        One of "timeout", "permission-error", "io-error", "network",
        "parsing-error", "compute-error", "validation-error" or "unknown".
    """
    import json as _json

    try:
        import ssl as _ssl
    except ImportError:  # interpreter built without SSL support
        _ssl = None

    if isinstance(exc, TimeoutError):
        return "timeout"
    if isinstance(exc, PermissionError):
        return "permission-error"
    if isinstance(exc, FileNotFoundError):
        return "io-error"
    # ssl.SSLError derives from OSError, so it has to be tested before the
    # generic OSError branch; otherwise it would be reported as "io-error".
    if _ssl is not None and isinstance(exc, _ssl.SSLError):
        return "network"
    # Covers BrokenPipeError, ConnectionResetError, ConnectionAbortedError and
    # ConnectionRefusedError regardless of the message text.
    if isinstance(exc, ConnectionError):
        return "network"
    if isinstance(exc, OSError):
        msg = str(exc).lower()
        if "network" in msg or "connection" in msg or "broken pipe" in msg:
            return "network"
        return "io-error"
    # JSONDecodeError and UnicodeDecodeError derive from ValueError, so they
    # are tested before the validation branch.
    if isinstance(exc, (_json.JSONDecodeError, UnicodeDecodeError)):
        return "parsing-error"
    if isinstance(exc, (MemoryError, RecursionError)):
        return "compute-error"
    if isinstance(exc, (KeyError, TypeError, AttributeError, ValueError)):
        return "validation-error"
    return "unknown"
262
+
263
+
264
class ErrorHandler:
    """Retry/fallback helper configured per use case.

    Statistics are not kept per instance: errors are forwarded to the
    module-level ``record_error`` and therefore aggregate globally.
    """

    def __init__(
        self,
        max_retry: Optional[int] = None,
        base_delay: Optional[float] = None,
        enable_fallback: Optional[bool] = None,
    ):
        """Create a handler; any argument left as None is taken from the fortify config."""
        _ensure_fortify_logging()
        defaults = get_fortify_config()
        self.max_retry = max_retry if max_retry is not None else defaults.max_retry
        self.base_delay = base_delay if base_delay is not None else defaults.retry_base_delay
        self.enable_fallback = (
            enable_fallback if enable_fallback is not None else defaults.enable_fallback
        )

    def log_error(
        self,
        error_type: str,
        error_detail: str,
        affected_scope: str = "",
        exc: Optional[BaseException] = None,
    ) -> None:
        """Forward one error to the global ``record_error``."""
        record_error(error_type, error_detail, affected_scope, exc)

    def get_stats(self) -> Dict[str, Any]:
        """Return the global framework error statistics."""
        return get_framework_error_stats()

    def _fallback_or_raise(
        self,
        fallback: Optional[Callable[[], Any]],
        last_exc: Optional[BaseException],
        operation: str,
    ) -> Any:
        """Return the fallback result when allowed, otherwise re-raise the last failure."""
        if self.enable_fallback and fallback is not None:
            return fallback()
        if last_exc:
            raise last_exc
        raise RuntimeError(operation)

    def run_with_retry(
        self,
        fn: Callable[[], Any],
        *,
        operation: str = "operation",
        error_type: str = "unknown",
        fallback: Optional[Callable[[], Any]] = None,
        retryable: Optional[Tuple[Type[BaseException], ...]] = None,
    ) -> Any:
        """Call ``fn`` with exponential-backoff retries.

        Args:
            fn: Zero-argument callable to execute.
            operation: Name used for logging, the retry budget and error scope.
            error_type: Error category recorded once all attempts have failed.
            fallback: Optional zero-argument callable used when retries are
                exhausted or the retry budget is spent, provided fallbacks
                are enabled on this handler.
            retryable: Exception types that trigger a retry; defaults to
                ``(OSError, IOError, TimeoutError, ConnectionError)``.
                Other exceptions propagate immediately.

        Returns:
            The result of ``fn``, or of ``fallback`` when it is used.

        Raises:
            BaseException: The last retryable exception when no fallback applies.
        """
        retry_on = (
            (OSError, IOError, TimeoutError, ConnectionError) if retryable is None else retryable
        )
        total = max(1, self.max_retry + 1)
        last_exc: Optional[BaseException] = None

        for attempt in range(total):
            try:
                return fn()
            except retry_on as err:
                last_exc = err
                if attempt + 1 >= total:
                    break
                if not _consume_retry_budget(operation):
                    record_error(
                        "compute-error",
                        f"retry budget exceeded (60s window) op={operation}",
                        f"retry_budget:{operation}",
                    )
                    return self._fallback_or_raise(fallback, last_exc, operation)
                # Exponential backoff: base, 2*base, 4*base, ...
                delay = self.base_delay * (2**attempt)
                _LOG.warning(
                    "retry operation=%s attempt=%s/%s delay=%.2fs err=%s",
                    operation,
                    attempt + 1,
                    total,
                    delay,
                    err,
                )
                time.sleep(delay)

        record_error(error_type, str(last_exc) if last_exc else "failed", operation, last_exc)
        return self._fallback_or_raise(fallback, last_exc, operation)
338
+
339
+
340
def record_error(
    error_type: str,
    error_detail: str,
    affected_scope: str = "",
    exc: Optional[BaseException] = None,
) -> None:
    """Log one error and fold it into the in-process statistics.

    Args:
        error_type: Error category. When it is empty or "unknown" and ``exc``
            is given, the category is derived via ``classify_exception``.
        error_detail: Free-text description; truncated to 2000 characters.
        affected_scope: Optional scope label used for the per-scope counters.
        exc: The exception being reported, if any.
    """
    _ensure_fortify_logging()
    if exc is not None and error_type in ("", "unknown"):
        error_type = classify_exception(exc)
    detail = (error_detail or "")[:2000]
    scope = affected_scope or ""
    exc_type_name = type(exc).__name__ if exc is not None else ""
    exc_module = type(exc).__module__ if exc is not None else ""
    # Hand the exception to logging explicitly. ``exc_info=True`` would make
    # logging consult sys.exc_info(), which is empty whenever this function is
    # called outside an ``except`` block, losing the traceback.
    exc_info = (type(exc), exc, exc.__traceback__) if exc is not None else None
    _LOG.error(
        "fortify_event error_type=%s scope=%s exc_type=%s exc_module=%s detail=%s",
        error_type,
        scope,
        exc_type_name,
        exc_module,
        detail,
        exc_info=exc_info,
    )
    now = datetime.now(timezone.utc)
    hour_key = now.strftime("%Y-%m-%d %H:00")
    with _stats_lock:
        _stats.total_count += 1
        _stats.by_type[error_type] = _stats.by_type.get(error_type, 0) + 1
        if scope:
            _stats.by_scope[scope] = _stats.by_scope.get(scope, 0) + 1
        _stats.last_error = {
            "type": error_type,
            "detail": detail,
            "scope": scope,
            "time": now.isoformat(),
            "exc_type": exc_type_name or None,
            "exc_module": exc_module or None,
        }
        _stats.last_update_iso = now.isoformat()
        # Rolling hourly buckets: bump the bucket for the current hour, or
        # open a new one and keep only the 24 most recent buckets.
        for row in _stats.hourly_trend:
            if row.get("hour") == hour_key:
                row["count"] = row.get("count", 0) + 1
                break
        else:
            _stats.hourly_trend.append({"hour": hour_key, "count": 1})
            if len(_stats.hourly_trend) > 24:
                _stats.hourly_trend = _stats.hourly_trend[-24:]
389
+
390
+
391
def record_retry(operation: str) -> None:
    """Increment the retry counter for ``operation``.

    The update is performed under ``_stats_lock``: the statistics reader
    copies the counter while holding that lock, and an unguarded ``+= 1`` is a
    read-modify-write that can lose updates between threads.
    """
    with _stats_lock:
        _retry_totals[operation] += 1
393
+
394
+
395
def get_framework_error_stats_for_client() -> Dict[str, Any]:
    """Return the framework error statistics in the form served over HTTP.

    The raw statistics are passed through ``redact_framework_stats_for_client``,
    which handles ``last_error`` when sanitization is enabled.
    """
    from core.safe_api_error import redact_framework_stats_for_client

    raw_stats = get_framework_error_stats()
    return redact_framework_stats_for_client(raw_stats)
400
+
401
+
402
def get_framework_error_stats() -> Dict[str, Any]:
    """Build a snapshot of the in-process error statistics.

    Returns:
        A dict with the total count, per-type counts (with display label and
        color), per-scope counts keyed like agent ids, the 50 busiest scopes,
        a consistency check of the per-type sum against the total, the hourly
        trend, the last error, retry counters and the reliability metrics.
    """
    with _stats_lock:
        # Display label and color for each known error type; unknown types
        # fall back to their own name and a neutral color.
        type_labels = {
            "network": ("网络错误", "#f59e0b"),
            "timeout": ("超时错误", "#ef4444"),
            "parsing": ("解析错误", "#8b5cf6"),
            "parsing-error": ("解析错误", "#8b5cf6"),
            "io-error": ("IO 错误", "#64748b"),
            "permission-error": ("权限错误", "#b45309"),
            "compute-error": ("计算错误", "#dc2626"),
            "validation-error": ("校验错误", "#ca8a04"),
            "unknown": ("未知错误", "#6b7280"),
        }

        by_type_out: Dict[str, Any] = {}
        for type_name, count in _stats.by_type.items():
            label, color = type_labels.get(type_name, (type_name, "#6b7280"))
            by_type_out[type_name] = {"count": count, "label": label, "color": color}

        by_agent: Dict[str, Any] = {}
        for scope, count in _stats.by_scope.items():
            # Scopes of the form "agent_id:<id>" are keyed by the bare id.
            agent_key = scope.split(":", 1)[-1] if scope.startswith("agent_id:") else scope
            by_agent[agent_key] = {"count": count, "agentId": agent_key}

        sum_by_type = sum(
            entry.get("count", 0) if isinstance(entry, dict) else 0
            for entry in by_type_out.values()
        )

        ranked_scopes = sorted(_stats.by_scope.items(), key=lambda kv: -kv[1])
        top_scopes = [{"scope": name, "count": hits} for name, hits in ranked_scopes[:50]]

        # NFR-R reliability metrics
        reliability = get_reliability_metrics()

        return {
            "total_count": _stats.total_count,
            "by_type": by_type_out,
            "by_agent": by_agent,
            "by_scope_top": top_scopes,
            "sum_by_type": sum_by_type,
            "totals_consistent": sum_by_type == _stats.total_count,
            "hourly_trend": list(_stats.hourly_trend),
            "last_update": _stats.last_update_iso,
            "last_error": _stats.last_error,
            "retry_by_operation": dict(_retry_totals),
            "retry_budget_blocks": _retry_budget_blocks,
            # NFR-R Reliability
            "reliability": reliability,
        }
450
+
451
+
452
def execute_with_retry(
    max_attempts: Optional[int] = 3,
    delay_base: Optional[float] = 1.0,
    exceptions: Tuple[Type[BaseException], ...] = (Exception,),
) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
    """Decorator factory that retries the wrapped function with exponential backoff.

    Args:
        max_attempts: Total number of calls, including the first one. None
            uses the configured ``max_retry + 1``. Values below 1 are treated
            as 1, so the wrapped function always runs at least once.
        delay_base: Base delay in seconds, doubled after each failed attempt.
            None uses the configured ``retry_base_delay``. Negative values
            are treated as 0.
        exceptions: Exception types that trigger a retry; anything else
            propagates immediately.

    Returns:
        A decorator. The decorated function returns the wrapped function's
        result, or re-raises its last exception once the attempts are used up
        or the per-operation retry budget is exhausted.
    """

    def deco(fn: Callable[..., Any]) -> Callable[..., Any]:
        @functools.wraps(fn)
        def wrapped(*args: Any, **kwargs: Any) -> Any:
            # The configuration is only consulted for values left as None.
            attempts = (
                max_attempts if max_attempts is not None else get_fortify_config().max_retry + 1
            )
            # Without this clamp a value <= 0 skipped the loop entirely and
            # raised RuntimeError without ever calling ``fn``.
            attempts = max(1, attempts)
            base = delay_base if delay_base is not None else get_fortify_config().retry_base_delay
            # time.sleep() rejects negative durations.
            base = max(0.0, base)

            last: Optional[BaseException] = None
            for attempt in range(attempts):
                try:
                    return fn(*args, **kwargs)
                except exceptions as e:
                    last = e
                    record_retry(fn.__name__)
                    if attempt + 1 >= attempts:
                        break
                    if not _consume_retry_budget(fn.__name__):
                        record_error(
                            "compute-error",
                            f"retry budget exceeded (60s window) op={fn.__name__}",
                            f"retry_budget:{fn.__name__}",
                        )
                        raise
                    time.sleep(base * (2**attempt))
            if last is not None:
                raise last
            # Defensive only: with at least one attempt the loop either returns
            # or leaves ``last`` set.
            raise RuntimeError(fn.__name__)

        return wrapped

    return deco