openclaw-agent-dashboard 1.0.39 → 1.0.40
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dashboard/api/agent_config_api.py +28 -7
- package/dashboard/api/agents.py +48 -10
- package/dashboard/api/agents_config.py +5 -1
- package/dashboard/api/chains.py +25 -5
- package/dashboard/api/collaboration.py +10 -9
- package/dashboard/api/debug_paths.py +5 -1
- package/dashboard/api/error_analysis.py +29 -11
- package/dashboard/api/errors.py +27 -11
- package/dashboard/api/fortify_routes.py +80 -0
- package/dashboard/api/input_safety.py +60 -0
- package/dashboard/api/performance.py +73 -53
- package/dashboard/api/subagents.py +95 -99
- package/dashboard/api/timeline.py +24 -3
- package/dashboard/api/version.py +2 -0
- package/dashboard/api/websocket.py +9 -7
- package/dashboard/core/__init__.py +1 -0
- package/dashboard/core/config_fortify.py +112 -0
- package/dashboard/core/error_handler.py +339 -0
- package/dashboard/core/fallback_manager.py +70 -0
- package/dashboard/core/safe_api_error.py +76 -0
- package/dashboard/core/schemas/__init__.py +16 -0
- package/dashboard/core/schemas/base.py +43 -0
- package/dashboard/core/schemas/session_schema.py +40 -0
- package/dashboard/core/schemas/subagent_schema.py +23 -0
- package/dashboard/data/agent_config_manager.py +6 -4
- package/dashboard/data/chain_reader.py +16 -12
- package/dashboard/data/error_analyzer.py +15 -11
- package/dashboard/data/session_reader.py +268 -46
- package/dashboard/data/subagent_reader.py +74 -49
- package/dashboard/data/timeline_reader.py +35 -49
- package/dashboard/main.py +24 -2
- package/dashboard/mechanism_reader.py +4 -5
- package/dashboard/mechanisms.py +2 -2
- package/dashboard/pytest.ini +3 -0
- package/dashboard/requirements.txt +5 -0
- package/dashboard/status/cache_fp_probe.py +40 -0
- package/dashboard/status/status_cache.py +199 -72
- package/dashboard/status/status_calculator.py +50 -30
- package/dashboard/tests/conftest.py +84 -0
- package/dashboard/tests/test_api_contracts.py +372 -0
- package/dashboard/tests/test_bench_fortify.py +176 -0
- package/dashboard/tests/test_fortify.py +741 -0
- package/dashboard/utils/__init__.py +1 -0
- package/dashboard/utils/data_repair.py +210 -0
- package/dashboard/watchers/file_watcher.py +367 -77
- package/openclaw.plugin.json +1 -1
- package/package.json +1 -1
- package/dashboard/agents.py +0 -74
- package/dashboard/collaboration.py +0 -407
- package/dashboard/errors.py +0 -63
- package/dashboard/performance.py +0 -474
- package/dashboard/session_reader.py +0 -240
- package/dashboard/status_calculator.py +0 -121
- package/dashboard/subagent_reader.py +0 -232
package/dashboard/api/version.py
CHANGED
|
@@ -9,6 +9,7 @@ from pydantic import BaseModel
|
|
|
9
9
|
from typing import Optional
|
|
10
10
|
|
|
11
11
|
# 导入版本信息读取器
|
|
12
|
+
from core.error_handler import record_error
|
|
12
13
|
from data.version_info_reader import get_version_reader
|
|
13
14
|
|
|
14
15
|
logger = logging.getLogger(__name__)
|
|
@@ -38,6 +39,7 @@ async def get_version_info() -> VersionInfo:
|
|
|
38
39
|
version_data = reader.read_version_info()
|
|
39
40
|
return VersionInfo(**version_data)
|
|
40
41
|
except Exception as e:
|
|
42
|
+
record_error("unknown", str(e), "api:version", exc=e)
|
|
41
43
|
logger.exception("get_version_info 异常,返回降级数据: %s", e)
|
|
42
44
|
return VersionInfo(
|
|
43
45
|
version="unknown",
|
|
@@ -11,6 +11,8 @@ from pathlib import Path
|
|
|
11
11
|
|
|
12
12
|
sys.path.append(str(Path(__file__).parent.parent))
|
|
13
13
|
|
|
14
|
+
from core.error_handler import record_error
|
|
15
|
+
|
|
14
16
|
router = APIRouter()
|
|
15
17
|
|
|
16
18
|
# 活跃的 WebSocket 连接
|
|
@@ -33,7 +35,7 @@ async def _periodic_broadcast_loop():
|
|
|
33
35
|
if changed_agents:
|
|
34
36
|
await broadcast_state_update(changed_agents)
|
|
35
37
|
except Exception as e:
|
|
36
|
-
|
|
38
|
+
record_error("unknown", str(e), "websocket:periodic_broadcast", exc=e)
|
|
37
39
|
|
|
38
40
|
|
|
39
41
|
def _ensure_broadcast_task():
|
|
@@ -110,26 +112,26 @@ async def send_initial_state(websocket: WebSocket):
|
|
|
110
112
|
collab = await get_collaboration()
|
|
111
113
|
data['collaboration'] = collab.model_dump() if hasattr(collab, "model_dump") else collab
|
|
112
114
|
except Exception as e:
|
|
113
|
-
|
|
115
|
+
record_error("unknown", str(e), "websocket:initial_collaboration", exc=e)
|
|
114
116
|
try:
|
|
115
117
|
tasks_result = await get_tasks()
|
|
116
118
|
data['tasks'] = tasks_result.get("tasks", []) if isinstance(tasks_result, dict) else []
|
|
117
119
|
except Exception as e:
|
|
118
|
-
|
|
120
|
+
record_error("unknown", str(e), "websocket:initial_tasks", exc=e)
|
|
119
121
|
try:
|
|
120
122
|
from .performance import get_real_stats
|
|
121
123
|
data['performance'] = await get_real_stats()
|
|
122
124
|
except Exception as e:
|
|
123
|
-
|
|
125
|
+
record_error("unknown", str(e), "websocket:initial_performance", exc=e)
|
|
124
126
|
try:
|
|
125
127
|
from .workflow import list_workflows
|
|
126
128
|
data['workflows'] = await list_workflows()
|
|
127
129
|
except Exception as e:
|
|
128
|
-
|
|
130
|
+
record_error("unknown", str(e), "websocket:initial_workflows", exc=e)
|
|
129
131
|
|
|
130
132
|
await websocket.send_json({'type': 'full_state', 'data': data})
|
|
131
133
|
except Exception as e:
|
|
132
|
-
|
|
134
|
+
record_error("unknown", str(e), "websocket:send_initial_state", exc=e)
|
|
133
135
|
|
|
134
136
|
|
|
135
137
|
async def broadcast_agent_update(agent_id: str, status: str):
|
|
@@ -250,7 +252,7 @@ async def broadcast_full_state():
|
|
|
250
252
|
},
|
|
251
253
|
})
|
|
252
254
|
except Exception as e:
|
|
253
|
-
|
|
255
|
+
record_error("unknown", str(e), "websocket:broadcast_full_state", exc=e)
|
|
254
256
|
|
|
255
257
|
|
|
256
258
|
async def broadcast_state_update(changed_agents: List[Dict[str, Any]]) -> None:
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Core fortify modules (error handling, config, schemas)."""
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""
|
|
2
|
+
TECHDEBT_FORTIFY: centralized environment configuration.
|
|
3
|
+
|
|
4
|
+
OPENCLAW_CACHE_MAX_SIZE = max cache memory in MB (PRD).
|
|
5
|
+
OPENCLAW_CACHE_MAX_ENTRIES = max number of cache entries (distinct from memory cap).
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from functools import lru_cache
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _env_int(key: str, default: int, min_v: int | None = None, max_v: int | None = None) -> int:
    """Read an integer setting from the environment and clamp it.

    Args:
        key: Environment variable name.
        default: Value used when the variable is unset, empty, or not an integer.
        min_v: Optional lower bound applied to the result (including the default).
        max_v: Optional upper bound applied to the result (including the default).

    Returns:
        The parsed (or default) value, clamped to the given bounds.
    """
    text = os.environ.get(key)
    value = default
    if text:  # None and "" both mean "not configured"
        try:
            value = int(text)
        except ValueError:
            value = default
    # Lower bound first, then upper bound, so max_v wins if the bounds conflict.
    if min_v is not None and value < min_v:
        value = min_v
    if max_v is not None and value > max_v:
        value = max_v
    return value
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _env_float(key: str, default: float) -> float:
    """Read a float setting from the environment.

    Args:
        key: Environment variable name.
        default: Value used when the variable is unset, empty, unparsable,
            or not a finite number.

    Returns:
        The parsed finite float, or ``default``.
    """
    import math

    raw = os.environ.get(key)
    if not raw:  # None and "" both mean "not configured"
        return default
    try:
        value = float(raw)
    except ValueError:
        return default
    # float() happily parses "nan" / "inf"; such values are never meaningful
    # for delays or intervals, so treat them like any other invalid input.
    return value if math.isfinite(value) else default
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _env_bool(key: str, default: bool) -> bool:
    """Read a boolean setting from the environment.

    Args:
        key: Environment variable name.
        default: Value used when the variable is unset or blank.

    Returns:
        True when the value (case-insensitive, surrounding whitespace ignored)
        is one of ``1``, ``true``, ``yes`` or ``on``; False for any other
        non-blank value; ``default`` otherwise.
    """
    raw = os.environ.get(key)
    if raw is None:
        return default
    # Strip so that values such as " true " or "on\n" are recognised.
    text = raw.strip().lower()
    if not text:
        return default
    return text in ("1", "true", "yes", "on")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _env_str(key: str, default: str) -> str:
    """Return the environment variable ``key``, or ``default`` when it is unset or empty."""
    # ``or`` covers both the missing (None) and the empty-string case.
    return os.environ.get(key) or default
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass(frozen=True)
class FortifyConfig:
    """Immutable snapshot of the environment-driven fortify settings."""

    # --- status cache ---
    cache_ttl_seconds: int
    cache_max_entries: int
    cache_max_memory_mb: int
    cache_preload: bool
    cache_double_check: bool
    cache_fp_probe_interval_sec: float

    # --- retry / fallback / error reporting ---
    max_retry: int
    retry_base_delay: float
    retry_budget_per_minute: int
    enable_fallback: bool
    fallback_cache_on_io: bool
    error_log_level: str
    sanitize_api_errors: bool

    # --- JSON parsing and repair ---
    json_strict: bool
    auto_repair_json: bool
    auto_repair_write_back: bool
    repair_backup_path: str | None
    max_repair_attempts: int

    # --- file watcher ---
    watcher_max_retries: int
    watcher_poll_interval_sec: float
    watcher_failure_window_sec: float
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@lru_cache(maxsize=1)
def get_fortify_config() -> FortifyConfig:
    """Build the fortify configuration from ``OPENCLAW_*`` environment variables.

    The result is memoized; call ``get_fortify_config.cache_clear()`` to force
    the environment to be read again.

    Integer settings are clamped to the ranges given below. Float settings
    (delays and intervals) fall back to their default when the configured
    value is negative, NaN or infinite.

    Returns:
        A frozen ``FortifyConfig`` instance.
    """

    def _non_negative(key: str, default: float) -> float:
        value = _env_float(key, default)
        # NaN fails both comparisons, so it falls back to the default as well.
        return value if 0.0 <= value < float("inf") else default

    ttl = _env_int("OPENCLAW_CACHE_TTL", 1, min_v=1, max_v=60)
    return FortifyConfig(
        cache_ttl_seconds=ttl,
        cache_max_entries=_env_int("OPENCLAW_CACHE_MAX_ENTRIES", 100, min_v=1, max_v=10_000),
        cache_max_memory_mb=_env_int("OPENCLAW_CACHE_MAX_SIZE", 100, min_v=1, max_v=4096),
        cache_preload=_env_bool("OPENCLAW_CACHE_PRELOAD", True),
        cache_double_check=_env_bool("OPENCLAW_CACHE_DOUBLE_CHECK", True),
        cache_fp_probe_interval_sec=_non_negative("OPENCLAW_CACHE_FP_PROBE_INTERVAL", 0.0),
        max_retry=_env_int("OPENCLAW_MAX_RETRY", 3, min_v=0, max_v=20),
        retry_base_delay=_non_negative("OPENCLAW_RETRY_BASE_DELAY", 1.0),
        retry_budget_per_minute=_env_int("OPENCLAW_RETRY_BUDGET_PER_MINUTE", 300, min_v=0, max_v=100_000),
        enable_fallback=_env_bool("OPENCLAW_ENABLE_FALLBACK", True),
        fallback_cache_on_io=_env_bool("OPENCLAW_FALLBACK_CACHE_ON_IO", True),
        error_log_level=_env_str("OPENCLAW_ERROR_LOG_LEVEL", "INFO").upper(),
        sanitize_api_errors=_env_bool("OPENCLAW_API_ERROR_SANITIZE", True),
        json_strict=_env_bool("OPENCLAW_JSON_STRICT", True),
        auto_repair_json=_env_bool("OPENCLAW_AUTO_REPAIR_JSON", True),
        auto_repair_write_back=_env_bool("OPENCLAW_AUTO_REPAIR_WB", False),
        repair_backup_path=os.environ.get("OPENCLAW_REPAIR_BACKUP") or None,
        max_repair_attempts=_env_int("OPENCLAW_MAX_REPAIR_ATTEMPTS", 3, min_v=1, max_v=10),
        watcher_max_retries=_env_int("OPENCLAW_WATCHER_MAX_RETRIES", 3, min_v=1, max_v=10),
        watcher_poll_interval_sec=_non_negative("OPENCLAW_WATCHER_POLL_INTERVAL", 5.0),
        watcher_failure_window_sec=_non_negative("OPENCLAW_WATCHER_FAILURE_WINDOW", 30.0),
    )
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def refresh_fortify_config_cache() -> FortifyConfig:
    """Discard the memoized configuration and return a freshly built one."""
    loader = get_fortify_config
    loader.cache_clear()
    return loader()
|
|
@@ -0,0 +1,339 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Unified error handling: classification, exponential backoff retry, in-process stats, structured logging.
|
|
3
|
+
|
|
4
|
+
降级策略集中注册见 core.fallback_manager(REQ_003-SPEC-04);IO 失败读缓存见 status_cache.get_stale_fallback(REQ_003-AC-003)。
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
import threading
|
|
10
|
+
import time
|
|
11
|
+
import functools
|
|
12
|
+
from collections import defaultdict, deque
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from datetime import datetime, timezone
|
|
15
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple, Type
|
|
16
|
+
|
|
17
|
+
from core.config_fortify import get_fortify_config
|
|
18
|
+
|
|
19
|
+
_LOG = logging.getLogger("openclaw.fortify")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _ensure_fortify_logging() -> None:
    """Configure the ``openclaw.fortify`` logger once per process.

    Sets the logger level from ``FortifyConfig.error_log_level`` and attaches a
    stream handler if the logger has none. Subsequent calls are no-ops.
    """
    if getattr(_ensure_fortify_logging, "_done", False):
        return
    cfg = get_fortify_config()
    level = getattr(logging, cfg.error_log_level, None)
    # The configured name is arbitrary user input: it may match a logging
    # attribute that is not a level (e.g. BASIC_FORMAT is a string), which
    # would make setLevel() raise. Fall back to INFO in that case.
    if not isinstance(level, int) or isinstance(level, bool):
        level = logging.INFO
    _LOG.setLevel(level)
    if not _LOG.handlers:
        h = logging.StreamHandler()
        h.setFormatter(
            logging.Formatter(
                fmt="%(asctime)s | %(levelname)s | fortify | %(message)s",
                datefmt="%Y-%m-%dT%H:%M:%S%z",
            )
        )
        _LOG.addHandler(h)
    # Function attribute used as the "already configured" flag.
    _ensure_fortify_logging._done = True  # type: ignore[attr-defined]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class ErrorHandlerStats:
    """In-process error counters aggregated by ``record_error``."""

    # Number of errors recorded so far.
    total_count: int = 0
    # Error category -> count.
    by_type: Dict[str, int] = field(default_factory=dict)
    # Affected scope string -> count.
    by_scope: Dict[str, int] = field(default_factory=dict)
    # Rows of the form {"hour": "YYYY-MM-DD HH:00", "count": n}.
    hourly_trend: List[Dict[str, Any]] = field(default_factory=list)
    # Details of the most recently recorded error, if any.
    last_error: Optional[Dict[str, Any]] = None
    # ISO timestamp of the most recent update, if any.
    last_update_iso: Optional[str] = None
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
_stats_lock = threading.Lock()
|
|
51
|
+
_stats = ErrorHandlerStats()
|
|
52
|
+
_retry_totals = defaultdict(int)
|
|
53
|
+
|
|
54
|
+
_retry_budget_lock = threading.Lock()
|
|
55
|
+
_retry_budget_deques: Dict[str, deque] = {}
|
|
56
|
+
_retry_budget_blocks = 0
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _consume_retry_budget(operation: str) -> bool:
    """Try to take one retry slot for ``operation`` from its 60-second sliding window.

    The per-operation limit comes from ``retry_budget_per_minute``; a limit of
    zero or less disables the budget entirely. This caps back-off retries per
    operation to mitigate retry storms (RISK-005).

    Returns:
        True when the retry may proceed, False when the window is already full
        (in which case the blocked-retry counter is incremented).
    """
    global _retry_budget_blocks
    per_minute = get_fortify_config().retry_budget_per_minute
    if per_minute <= 0:
        return True
    bucket = (operation or "default").strip() or "default"
    stamp = time.monotonic()
    with _retry_budget_lock:
        window = _retry_budget_deques.get(bucket)
        if window is None:
            window = _retry_budget_deques[bucket] = deque()
        # Drop timestamps that have left the 60 s window.
        while window and stamp - window[0] > 60.0:
            window.popleft()
        allowed = len(window) < per_minute
        if allowed:
            window.append(stamp)
        else:
            _retry_budget_blocks += 1
    return allowed
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def classify_exception(exc: BaseException) -> str:
    """Map an exception to a PRD-style error category.

    Args:
        exc: The exception instance to classify.

    Returns:
        One of ``"timeout"``, ``"permission-error"``, ``"io-error"``,
        ``"network"``, ``"parsing-error"``, ``"compute-error"``,
        ``"validation-error"`` or ``"unknown"``.
    """
    import json as _json

    try:
        import ssl as _ssl
    except ImportError:  # Python built without SSL support
        _ssl = None

    # Order matters: most of these are OSError / ValueError subclasses and
    # must be tested before their base classes.
    if isinstance(exc, TimeoutError):
        return "timeout"
    if isinstance(exc, PermissionError):
        return "permission-error"
    if isinstance(exc, FileNotFoundError):
        return "io-error"
    # Every ConnectionError subclass (broken pipe, reset, aborted, refused)
    # is a network failure regardless of its message text.
    if isinstance(exc, ConnectionError):
        return "network"
    # ssl.SSLError derives from OSError, so it has to be checked before the
    # generic OSError branch or it would never be reached.
    if _ssl is not None and isinstance(exc, _ssl.SSLError):
        return "network"
    if isinstance(exc, OSError):
        msg = str(exc).lower()
        if "network" in msg or "connection" in msg or "broken pipe" in msg:
            return "network"
        return "io-error"
    if isinstance(exc, (_json.JSONDecodeError, UnicodeDecodeError)):
        return "parsing-error"
    if isinstance(exc, (MemoryError, RecursionError)):
        return "compute-error"
    if isinstance(exc, (KeyError, TypeError, AttributeError, ValueError)):
        return "validation-error"
    return "unknown"
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class ErrorHandler:
    """Per-use-case retry/fallback helper.

    Error statistics are not kept per instance: ``log_error`` and
    ``get_stats`` delegate to the module-level ``record_error`` and
    ``get_framework_error_stats``.
    """

    def __init__(
        self,
        max_retry: Optional[int] = None,
        base_delay: Optional[float] = None,
        enable_fallback: Optional[bool] = None,
    ):
        """Create a handler; any argument left as None is taken from the fortify config."""
        _ensure_fortify_logging()
        defaults = get_fortify_config()
        self.max_retry = max_retry if max_retry is not None else defaults.max_retry
        self.base_delay = base_delay if base_delay is not None else defaults.retry_base_delay
        self.enable_fallback = (
            enable_fallback if enable_fallback is not None else defaults.enable_fallback
        )

    def log_error(
        self,
        error_type: str,
        error_detail: str,
        affected_scope: str = "",
        exc: Optional[BaseException] = None,
    ) -> None:
        """Record an error in the global statistics and log it."""
        record_error(error_type, error_detail, affected_scope, exc)

    def get_stats(self) -> Dict[str, Any]:
        """Return the global error statistics snapshot."""
        return get_framework_error_stats()

    def run_with_retry(
        self,
        fn: Callable[[], Any],
        *,
        operation: str = "operation",
        error_type: str = "unknown",
        fallback: Optional[Callable[[], Any]] = None,
        retryable: Optional[Tuple[Type[BaseException], ...]] = None,
    ) -> Any:
        """Call ``fn`` with exponential back-off on retryable exceptions.

        Args:
            fn: Zero-argument callable to execute.
            operation: Name used for logging and for the retry budget.
            error_type: Category recorded if every attempt fails.
            fallback: Optional zero-argument callable used when all attempts
                fail or the retry budget is exhausted (only if fallbacks are
                enabled on this handler).
            retryable: Exception types that trigger a retry; defaults to
                ``(OSError, IOError, TimeoutError, ConnectionError)``.

        Returns:
            The result of ``fn`` or, on failure, of ``fallback``.

        Raises:
            The last retryable exception when no fallback applies. Exceptions
            not listed in ``retryable`` propagate immediately.
        """
        retry_on = (
            (OSError, IOError, TimeoutError, ConnectionError) if retryable is None else retryable
        )
        total = max(1, self.max_retry + 1)
        failure: Optional[BaseException] = None
        attempt = 0
        while attempt < total:
            try:
                return fn()
            except retry_on as err:
                failure = err
                if attempt + 1 >= total:
                    # Out of attempts: fall through to the final handling below.
                    break
                if not _consume_retry_budget(operation):
                    record_error(
                        "compute-error",
                        f"retry budget exceeded (60s window) op={operation}",
                        f"retry_budget:{operation}",
                    )
                    if self.enable_fallback and fallback is not None:
                        return fallback()
                    if failure:
                        raise failure
                    raise RuntimeError(operation)
                pause = self.base_delay * (2**attempt)
                _LOG.warning(
                    "retry operation=%s attempt=%s/%s delay=%.2fs err=%s",
                    operation,
                    attempt + 1,
                    total,
                    pause,
                    err,
                )
                time.sleep(pause)
            attempt += 1
        record_error(error_type, str(failure) if failure else "failed", operation, failure)
        if self.enable_fallback and fallback is not None:
            return fallback()
        if failure:
            raise failure
        raise RuntimeError(operation)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def record_error(
    error_type: str,
    error_detail: str,
    affected_scope: str = "",
    exc: Optional[BaseException] = None,
) -> None:
    """Log an error and fold it into the in-process statistics.

    Args:
        error_type: Error category. When it is ``""`` or ``"unknown"`` and
            ``exc`` is given, the category is derived via ``classify_exception``.
        error_detail: Human-readable detail; truncated to 2000 characters.
        affected_scope: Optional scope label used for per-scope counters.
        exc: The exception that caused the error, if any.
    """
    _ensure_fortify_logging()
    if exc is not None:
        error_type = classify_exception(exc) if error_type in ("", "unknown") else error_type
    detail = (error_detail or "")[:2000]
    scope = affected_scope or ""
    exc_type_name = type(exc).__name__ if exc is not None else ""
    exc_module = type(exc).__module__ if exc is not None else ""
    _LOG.error(
        "fortify_event error_type=%s scope=%s exc_type=%s exc_module=%s detail=%s",
        error_type,
        scope,
        exc_type_name,
        exc_module,
        detail,
        # Pass the instance itself: exc_info=True would read sys.exc_info(),
        # which is empty when this is called outside an ``except`` block.
        exc_info=exc,
    )
    now = datetime.now(timezone.utc)
    hour_key = now.strftime("%Y-%m-%d %H:00")
    with _stats_lock:
        _stats.total_count += 1
        _stats.by_type[error_type] = _stats.by_type.get(error_type, 0) + 1
        if scope:
            _stats.by_scope[scope] = _stats.by_scope.get(scope, 0) + 1
        _stats.last_error = {
            "type": error_type,
            "detail": detail,
            "scope": scope,
            "time": now.isoformat(),
            "exc_type": exc_type_name or None,
            "exc_module": exc_module or None,
        }
        _stats.last_update_iso = now.isoformat()
        # Rolling hourly buckets: bump the row for this hour or append a new
        # one, keeping at most the 24 most recent rows.
        for row in _stats.hourly_trend:
            if row.get("hour") == hour_key:
                row["count"] = row.get("count", 0) + 1
                break
        else:
            _stats.hourly_trend.append({"hour": hour_key, "count": 1})
            if len(_stats.hourly_trend) > 24:
                _stats.hourly_trend = _stats.hourly_trend[-24:]
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def record_retry(operation: str) -> None:
    """Increment the retry counter for ``operation``.

    The counter is reported as ``retry_by_operation`` by
    ``get_framework_error_stats``, which reads it while holding
    ``_stats_lock``; the update takes the same lock so that concurrent
    increments are not lost.
    """
    with _stats_lock:
        _retry_totals[operation] += 1
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def get_framework_error_stats_for_client() -> Dict[str, Any]:
    """Return the error statistics in the form sent over HTTP.

    The snapshot is passed through ``redact_framework_stats_for_client``, which
    sanitizes ``last_error`` when API error sanitization is enabled.
    """
    from core.safe_api_error import redact_framework_stats_for_client as _redact

    snapshot = get_framework_error_stats()
    return _redact(snapshot)
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def get_framework_error_stats() -> Dict[str, Any]:
    """Return a snapshot of the in-process error statistics.

    The returned structure is detached from the live counters: the hourly rows
    and the ``last_error`` dict are copies, so later ``record_error`` calls do
    not change a snapshot that a caller is still reading or serializing.

    Returns:
        A dict with totals, per-type and per-scope breakdowns, the hourly
        trend, the last error, and retry counters.
    """
    # Display label and colour for each known category; unknown categories
    # are shown under their raw name in grey.
    type_labels = {
        "network": ("网络错误", "#f59e0b"),
        "timeout": ("超时错误", "#ef4444"),
        "parsing": ("解析错误", "#8b5cf6"),
        "parsing-error": ("解析错误", "#8b5cf6"),
        "io-error": ("IO 错误", "#64748b"),
        "permission-error": ("权限错误", "#b45309"),
        "compute-error": ("计算错误", "#dc2626"),
        "validation-error": ("校验错误", "#ca8a04"),
        "unknown": ("未知错误", "#6b7280"),
    }
    with _stats_lock:
        by_type_out: Dict[str, Any] = {}
        for kind, count in _stats.by_type.items():
            label, color = type_labels.get(kind, (kind, "#6b7280"))
            by_type_out[kind] = {"count": count, "label": label, "color": color}

        by_agent: Dict[str, Any] = {}
        for scope, count in _stats.by_scope.items():
            # Scopes of the form "agent_id:<id>" are keyed by the bare id;
            # every other scope is keyed by the scope string itself.
            aid = scope.split(":", 1)[-1] if scope.startswith("agent_id:") else scope
            by_agent[aid] = {"count": count, "agentId": aid}

        sum_by_type = sum(entry["count"] for entry in by_type_out.values())
        top_scopes = [
            {"scope": k, "count": v}
            for k, v in sorted(_stats.by_scope.items(), key=lambda kv: -kv[1])[:50]
        ]
        last_error = dict(_stats.last_error) if _stats.last_error is not None else None
        return {
            "total_count": _stats.total_count,
            "by_type": by_type_out,
            "by_agent": by_agent,
            "by_scope_top": top_scopes,
            "sum_by_type": sum_by_type,
            "totals_consistent": sum_by_type == _stats.total_count,
            # Copy each row: record_error mutates the live rows in place.
            "hourly_trend": [dict(row) for row in _stats.hourly_trend],
            "last_update": _stats.last_update_iso,
            "last_error": last_error,
            "retry_by_operation": dict(_retry_totals),
            "retry_budget_blocks": _retry_budget_blocks,
        }
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def execute_with_retry(
    max_attempts: int = 3,
    delay_base: float = 1.0,
    exceptions: Tuple[Type[BaseException], ...] = (Exception,),
) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
    """Decorator factory: retry the wrapped callable with exponential back-off.

    Args:
        max_attempts: Total number of attempts. Values below 1 are treated as
            1 so the wrapped callable always runs at least once. ``None``
            means "use ``max_retry + 1`` from the fortify config".
        delay_base: Base delay in seconds; attempt ``k`` (0-based) sleeps
            ``delay_base * 2**k``. ``None`` means "use the configured base delay".
        exceptions: Exception types that trigger a retry.

    Returns:
        A decorator. The decorated callable returns the wrapped result, or
        re-raises the last caught exception once attempts or the retry budget
        are exhausted.
    """

    def deco(fn: Callable[..., Any]) -> Callable[..., Any]:
        # Not every callable has __name__ (e.g. functools.partial objects).
        op_name = getattr(fn, "__name__", None) or repr(fn)

        @functools.wraps(fn)
        def wrapped(*args: Any, **kwargs: Any) -> Any:
            attempts = max_attempts
            base = delay_base
            if attempts is None or base is None:
                # Only consult the configuration when a default is needed.
                cfg = get_fortify_config()
                if attempts is None:
                    attempts = cfg.max_retry + 1
                if base is None:
                    base = cfg.retry_base_delay
            # Guarantee at least one call; with attempts <= 0 the loop below
            # would otherwise never invoke fn at all.
            attempts = max(1, attempts)
            last: Optional[BaseException] = None
            for attempt in range(attempts):
                try:
                    return fn(*args, **kwargs)
                except exceptions as e:
                    last = e
                    record_retry(op_name)
                    if attempt + 1 >= attempts:
                        break
                    if not _consume_retry_budget(op_name):
                        record_error(
                            "compute-error",
                            f"retry budget exceeded (60s window) op={op_name}",
                            f"retry_budget:{op_name}",
                        )
                        raise last
                    time.sleep(base * (2**attempt))
            if last is not None:
                raise last
            raise RuntimeError(op_name)

        return wrapped

    return deco
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""
|
|
2
|
+
REQ_003-SPEC-04:按错误类型注册集中降级策略;供状态计算、列表聚合等路径调用。
|
|
3
|
+
|
|
4
|
+
REQ_003-AC-003:网络/IO 等错误在重试仍失败或未覆盖时,可读 StatusCache 中的最近状态(见 status_cache.get_stale_fallback)。
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import threading
|
|
9
|
+
from typing import Any, Callable, Dict, Optional
|
|
10
|
+
|
|
11
|
+
_Handler = Callable[..., Any]
|
|
12
|
+
|
|
13
|
+
_lock = threading.Lock()
|
|
14
|
+
_handlers: Dict[str, _Handler] = {}
|
|
15
|
+
_defaults_registered = False
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def register_fallback(error_category: str, handler: _Handler) -> None:
    """Register the fallback for one ``classify_exception`` category.

    Any handler already registered for that category is replaced. The handler
    is called as ``handler(agent_id=..., **kwargs)`` by ``run_fallback``.
    """
    with _lock:
        _handlers.update({error_category: handler})
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def run_fallback(error_category: str, *, agent_id: Optional[str] = None, **kwargs: Any) -> Any:
    """Run the fallback registered for ``error_category``.

    Default handlers are installed on first use. The handler is looked up
    under the lock but invoked outside it.

    Returns:
        The handler's result, or None when no handler is registered.
    """
    _ensure_default_fallbacks()
    with _lock:
        handler = _handlers.get(error_category)
    return None if handler is None else handler(agent_id=agent_id, **kwargs)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _stale_agent_status_handler(agent_id: Optional[str] = None, **_: Any) -> Optional[str]:
    """Default fallback: return the agent's status from the status cache's stale entry.

    Returns None when no agent id is given, when ``fallback_cache_on_io`` is
    disabled, when the cache has no stale entry, or when the cached status is
    not one of ``idle`` / ``working`` / ``down``.
    """
    if not agent_id:
        return None

    # Imported lazily, inside the function, as in the rest of this module.
    from core.config_fortify import get_fortify_config

    if not get_fortify_config().fallback_cache_on_io:
        return None

    from status.status_cache import get_cache

    cached = get_cache().get_stale_fallback(agent_id)
    if not cached:
        return None
    status = cached.get("status")
    return str(status) if status in ("idle", "working", "down") else None
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _ensure_default_fallbacks() -> None:
    """Install the stale-status handler for the default categories, once.

    Categories that already have a handler registered are left untouched.
    """
    global _defaults_registered
    if _defaults_registered:
        return
    with _lock:
        # Re-check under the lock: another thread may have finished meanwhile.
        if not _defaults_registered:
            for category in ("network", "io-error", "timeout", "permission-error"):
                _handlers.setdefault(category, _stale_agent_status_handler)
            _defaults_registered = True
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def reset_fallback_handlers_for_tests() -> None:
    """Test isolation helper: empty the registry so the defaults can be installed again."""
    global _defaults_registered
    with _lock:
        _defaults_registered = False
        _handlers.clear()
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""
|
|
2
|
+
面向浏览器/API 客户端的错误文案脱敏(NFR-S-001)。
|
|
3
|
+
服务端日志仍由 record_error 记录完整信息(默认不截断)。
|
|
4
|
+
"""
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
from typing import Any, Dict
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def sanitize_client_error_text(raw: str, max_len: int = 1200) -> str:
    """Redact secrets, paths and e-mail addresses from an error message for clients.

    Args:
        raw: The original error text.
        max_len: Maximum length of the returned text (an ellipsis is appended
            when the text is cut).

    Returns:
        ``"internal error"`` for empty input; a fixed generic message when the
        text contains a Python traceback; otherwise the single-line, redacted
        and length-limited text.
    """
    if not raw:
        return "internal error"
    # A traceback is replaced wholesale. The frame-line check tolerates any
    # indentation before ``File "`` rather than one exact spacing.
    if "Traceback (most recent call last)" in raw or re.search(r'\n\s*File "', raw):
        return "Internal error (details redacted; see server logs)"

    s = raw.replace("\r", " ").replace("\n", " ")
    # Bound the amount of text the regexes below have to scan.
    if len(s) > max_len * 2:
        s = s[: max_len * 2]

    # API keys of the "sk-..." family, including segmented forms such as
    # "sk-proj-..." that contain '-' or '_' inside the key.
    s = re.sub(r"\bsk-[a-zA-Z0-9_-]{12,}\b", "sk-[REDACTED]", s, flags=re.I)
    s = re.sub(r"\bxox[baprs]-[a-zA-Z0-9-]{10,}\b", "[slack-token]", s)
    s = re.sub(r"Bearer\s+[a-zA-Z0-9._=+\/-]{12,}", "Bearer [REDACTED]", s, flags=re.I)
    s = re.sub(
        r"\bAKIA[0-9A-Z]{16}\b",
        "AKIA[REDACTED]",
        s,
    )
    s = re.sub(r"(?i)password\s*[=:]\s*[^\s,}\"]{2,}", "password=[REDACTED]", s)
    s = re.sub(
        r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
        "[email]",
        s,
    )
    # Home-directory paths, Windows drive paths, then .openclaw data paths.
    s = re.sub(r"(?:/home/|/Users/)[^\s:]{1,80}/[^\s:]{0,200}", "[path]", s)
    s = re.sub(r"[A-Za-z]:\\(?:[^\\\s]+\\){0,8}[^\s\\]{0,120}", "[path]", s)
    s = re.sub(r"/[^\s:]{0,16}\.openclaw(?:/[^\s:]{0,160})?", "[openclaw-path]", s)

    if len(s) > max_len:
        s = s[:max_len] + "…"
    return s
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def safe_client_string(message: str) -> str:
    """Sanitize a string field (such as ``error``) placed in a JSON response body.

    With sanitization disabled the text is only capped at 4000 characters.
    """
    from core.config_fortify import get_fortify_config

    text = message or ""
    if get_fortify_config().sanitize_api_errors:
        return sanitize_client_error_text(text)
    return text[:4000]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def safe_api_error_detail(exc: BaseException) -> str:
    """Build the ``detail`` string returned to clients for an exception (e.g. HTTP 500).

    Uses the exception message, or its class name when the message is blank.
    With sanitization disabled the text is only capped at 4000 characters.
    """
    from core.config_fortify import get_fortify_config

    text = str(exc).strip()
    if not text:
        text = type(exc).__name__
    if get_fortify_config().sanitize_api_errors:
        return sanitize_client_error_text(text)
    return text[:4000]
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def redact_framework_stats_for_client(data: Dict[str, Any]) -> Dict[str, Any]:
    """Sanitize ``last_error.detail`` in a stats payload before it is sent to a client.

    With sanitization disabled the input dict is returned as-is. Otherwise a
    shallow copy is returned, and ``last_error`` is replaced by a copy with
    the sanitized detail; the input is never mutated.
    """
    from core.config_fortify import get_fortify_config

    if not get_fortify_config().sanitize_api_errors:
        return data
    result = dict(data)
    last = result.get("last_error")
    if isinstance(last, dict) and last.get("detail"):
        result["last_error"] = {**last, "detail": sanitize_client_error_text(str(last["detail"]))}
    return result
|