openclaw-agent-dashboard 1.0.39 → 1.0.41
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dashboard/api/agent_config_api.py +28 -7
- package/dashboard/api/agents.py +48 -10
- package/dashboard/api/agents_config.py +5 -1
- package/dashboard/api/chains.py +25 -5
- package/dashboard/api/collaboration.py +10 -9
- package/dashboard/api/debug_paths.py +5 -1
- package/dashboard/api/error_analysis.py +29 -11
- package/dashboard/api/errors.py +37 -11
- package/dashboard/api/fortify_routes.py +108 -0
- package/dashboard/api/input_safety.py +60 -0
- package/dashboard/api/performance.py +73 -53
- package/dashboard/api/subagents.py +95 -99
- package/dashboard/api/timeline.py +24 -3
- package/dashboard/api/version.py +2 -0
- package/dashboard/api/websocket.py +9 -7
- package/dashboard/core/__init__.py +1 -0
- package/dashboard/core/config_fortify.py +125 -0
- package/dashboard/core/error_handler.py +488 -0
- package/dashboard/core/fallback_manager.py +81 -0
- package/dashboard/core/logging_config.py +217 -0
- package/dashboard/core/safe_api_error.py +76 -0
- package/dashboard/core/schemas/__init__.py +16 -0
- package/dashboard/core/schemas/base.py +43 -0
- package/dashboard/core/schemas/session_schema.py +40 -0
- package/dashboard/core/schemas/subagent_schema.py +23 -0
- package/dashboard/data/agent_config_manager.py +6 -4
- package/dashboard/data/chain_reader.py +16 -12
- package/dashboard/data/error_analyzer.py +15 -11
- package/dashboard/data/session_reader.py +268 -46
- package/dashboard/data/subagent_reader.py +74 -49
- package/dashboard/data/timeline_reader.py +35 -49
- package/dashboard/main.py +24 -2
- package/dashboard/mechanism_reader.py +4 -5
- package/dashboard/mechanisms.py +2 -2
- package/dashboard/pytest.ini +3 -0
- package/dashboard/requirements.txt +5 -0
- package/dashboard/status/cache_fp_probe.py +40 -0
- package/dashboard/status/status_cache.py +199 -72
- package/dashboard/status/status_calculator.py +50 -30
- package/dashboard/tests/conftest.py +87 -0
- package/dashboard/tests/test_api_contracts.py +372 -0
- package/dashboard/tests/test_bench_fortify.py +176 -0
- package/dashboard/tests/test_fortify.py +952 -0
- package/dashboard/utils/__init__.py +1 -0
- package/dashboard/utils/data_repair.py +210 -0
- package/dashboard/watchers/file_watcher.py +380 -77
- package/frontend-dist/assets/{index-cYIOn3Wq.css → index-BIZ2xHfw.css} +1 -1
- package/frontend-dist/assets/{index-DyRXGevD.js → index-Cnr0b02R.js} +1 -1
- package/frontend-dist/index.html +2 -2
- package/openclaw.plugin.json +1 -1
- package/package.json +1 -1
- package/dashboard/agents.py +0 -74
- package/dashboard/collaboration.py +0 -407
- package/dashboard/errors.py +0 -63
- package/dashboard/performance.py +0 -474
- package/dashboard/session_reader.py +0 -240
- package/dashboard/status_calculator.py +0 -121
- package/dashboard/subagent_reader.py +0 -232
package/dashboard/api/version.py
CHANGED
|
@@ -9,6 +9,7 @@ from pydantic import BaseModel
|
|
|
9
9
|
from typing import Optional
|
|
10
10
|
|
|
11
11
|
# 导入版本信息读取器
|
|
12
|
+
from core.error_handler import record_error
|
|
12
13
|
from data.version_info_reader import get_version_reader
|
|
13
14
|
|
|
14
15
|
logger = logging.getLogger(__name__)
|
|
@@ -38,6 +39,7 @@ async def get_version_info() -> VersionInfo:
|
|
|
38
39
|
version_data = reader.read_version_info()
|
|
39
40
|
return VersionInfo(**version_data)
|
|
40
41
|
except Exception as e:
|
|
42
|
+
record_error("unknown", str(e), "api:version", exc=e)
|
|
41
43
|
logger.exception("get_version_info 异常,返回降级数据: %s", e)
|
|
42
44
|
return VersionInfo(
|
|
43
45
|
version="unknown",
|
|
@@ -11,6 +11,8 @@ from pathlib import Path
|
|
|
11
11
|
|
|
12
12
|
sys.path.append(str(Path(__file__).parent.parent))
|
|
13
13
|
|
|
14
|
+
from core.error_handler import record_error
|
|
15
|
+
|
|
14
16
|
router = APIRouter()
|
|
15
17
|
|
|
16
18
|
# 活跃的 WebSocket 连接
|
|
@@ -33,7 +35,7 @@ async def _periodic_broadcast_loop():
|
|
|
33
35
|
if changed_agents:
|
|
34
36
|
await broadcast_state_update(changed_agents)
|
|
35
37
|
except Exception as e:
|
|
36
|
-
|
|
38
|
+
record_error("unknown", str(e), "websocket:periodic_broadcast", exc=e)
|
|
37
39
|
|
|
38
40
|
|
|
39
41
|
def _ensure_broadcast_task():
|
|
@@ -110,26 +112,26 @@ async def send_initial_state(websocket: WebSocket):
|
|
|
110
112
|
collab = await get_collaboration()
|
|
111
113
|
data['collaboration'] = collab.model_dump() if hasattr(collab, "model_dump") else collab
|
|
112
114
|
except Exception as e:
|
|
113
|
-
|
|
115
|
+
record_error("unknown", str(e), "websocket:initial_collaboration", exc=e)
|
|
114
116
|
try:
|
|
115
117
|
tasks_result = await get_tasks()
|
|
116
118
|
data['tasks'] = tasks_result.get("tasks", []) if isinstance(tasks_result, dict) else []
|
|
117
119
|
except Exception as e:
|
|
118
|
-
|
|
120
|
+
record_error("unknown", str(e), "websocket:initial_tasks", exc=e)
|
|
119
121
|
try:
|
|
120
122
|
from .performance import get_real_stats
|
|
121
123
|
data['performance'] = await get_real_stats()
|
|
122
124
|
except Exception as e:
|
|
123
|
-
|
|
125
|
+
record_error("unknown", str(e), "websocket:initial_performance", exc=e)
|
|
124
126
|
try:
|
|
125
127
|
from .workflow import list_workflows
|
|
126
128
|
data['workflows'] = await list_workflows()
|
|
127
129
|
except Exception as e:
|
|
128
|
-
|
|
130
|
+
record_error("unknown", str(e), "websocket:initial_workflows", exc=e)
|
|
129
131
|
|
|
130
132
|
await websocket.send_json({'type': 'full_state', 'data': data})
|
|
131
133
|
except Exception as e:
|
|
132
|
-
|
|
134
|
+
record_error("unknown", str(e), "websocket:send_initial_state", exc=e)
|
|
133
135
|
|
|
134
136
|
|
|
135
137
|
async def broadcast_agent_update(agent_id: str, status: str):
|
|
@@ -250,7 +252,7 @@ async def broadcast_full_state():
|
|
|
250
252
|
},
|
|
251
253
|
})
|
|
252
254
|
except Exception as e:
|
|
253
|
-
|
|
255
|
+
record_error("unknown", str(e), "websocket:broadcast_full_state", exc=e)
|
|
254
256
|
|
|
255
257
|
|
|
256
258
|
async def broadcast_state_update(changed_agents: List[Dict[str, Any]]) -> None:
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Core fortify modules (error handling, config, schemas)."""
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""
|
|
2
|
+
TECHDEBT_FORTIFY: centralized environment configuration.
|
|
3
|
+
|
|
4
|
+
OPENCLAW_CACHE_MAX_SIZE = max cache memory in MB (PRD).
|
|
5
|
+
OPENCLAW_CACHE_MAX_ENTRIES = max number of cache entries (distinct from memory cap).
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from functools import lru_cache
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _env_int(key: str, default: int, min_v: int | None = None, max_v: int | None = None) -> int:
    """Read an integer setting from the environment variable *key*.

    An unset, empty, or non-integer value yields *default*. The chosen value
    is then raised to *min_v* and lowered to *max_v* when those bounds are
    given (lower bound first, then upper bound).
    """
    text = os.environ.get(key)
    value = default
    if text is not None and text != "":
        try:
            value = int(text)
        except ValueError:
            # Unparseable input is treated the same as "not set".
            value = default
    if min_v is not None and value < min_v:
        value = min_v
    if max_v is not None and value > max_v:
        value = max_v
    return value
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _env_float(key: str, default: float) -> float:
    """Read a finite float setting from the environment variable *key*.

    Returns *default* when the variable is unset, empty, not parseable as a
    float, or parses to a non-finite value ("nan", "inf", "-inf").
    """
    import math

    raw = os.environ.get(key)
    if raw is None or raw == "":
        return default
    try:
        value = float(raw)
    except ValueError:
        return default
    # float() happily accepts "nan"/"inf"; such values are never meaningful
    # as delays or intervals and make time.sleep() raise, so reject them.
    if not math.isfinite(value):
        return default
    return value
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _env_bool(key: str, default: bool) -> bool:
    """Read a boolean flag from the environment variable *key*.

    Returns *default* when the variable is unset or blank. Otherwise the
    value is True for "1", "true", "yes" or "on" (case-insensitive, ignoring
    surrounding whitespace) and False for anything else.
    """
    raw = os.environ.get(key)
    if raw is None:
        return default
    # Strip first: values coming from .env files or shell exports often carry
    # stray whitespace, which previously turned "true " into False.
    token = raw.strip().lower()
    if token == "":
        return default
    return token in ("1", "true", "yes", "on")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _env_str(key: str, default: str) -> str:
    """Return the environment variable *key*, or *default* if unset or empty."""
    # Both None (unset) and "" (empty) are falsy, so `or` covers both cases.
    return os.environ.get(key) or default
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass(frozen=True)
class FortifyConfig:
    """Immutable snapshot of the fortify settings.

    Instances are produced by get_fortify_config(), which fills every field
    from an OPENCLAW_* environment variable.
    """

    # --- Cache ---
    cache_ttl_seconds: int
    cache_max_entries: int
    cache_max_memory_mb: int
    cache_preload: bool
    cache_double_check: bool
    cache_fp_probe_interval_sec: float

    # --- Retry / fallback / error reporting ---
    max_retry: int
    retry_base_delay: float
    retry_budget_per_minute: int
    enable_fallback: bool
    fallback_cache_on_io: bool
    error_log_level: str
    sanitize_api_errors: bool

    # --- JSON parsing and repair ---
    json_strict: bool
    auto_repair_json: bool
    auto_repair_write_back: bool
    repair_backup_path: str | None
    max_repair_attempts: int

    # --- File watcher ---
    watcher_max_retries: int
    watcher_poll_interval_sec: float
    watcher_failure_window_sec: float

    # --- NFR-S-003: logging storage security ---
    log_retention_days: int
    log_max_size_mb: int
    log_backup_count: int
    log_file_path: str | None
    log_compression: bool
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@lru_cache(maxsize=1)
def get_fortify_config() -> FortifyConfig:
    """Build the FortifyConfig from OPENCLAW_* environment variables.

    The result is memoized, so the environment is read once; call
    refresh_fortify_config_cache() to re-read it. Integer settings are
    clamped to the ranges given below; unset or invalid values fall back to
    the listed defaults.
    """
    return FortifyConfig(
        cache_ttl_seconds=_env_int("OPENCLAW_CACHE_TTL", 1, min_v=1, max_v=60),
        cache_max_entries=_env_int("OPENCLAW_CACHE_MAX_ENTRIES", 100, min_v=1, max_v=10_000),
        cache_max_memory_mb=_env_int("OPENCLAW_CACHE_MAX_SIZE", 100, min_v=1, max_v=4096),
        cache_preload=_env_bool("OPENCLAW_CACHE_PRELOAD", True),
        cache_double_check=_env_bool("OPENCLAW_CACHE_DOUBLE_CHECK", True),
        cache_fp_probe_interval_sec=_env_float("OPENCLAW_CACHE_FP_PROBE_INTERVAL", 0.0),
        max_retry=_env_int("OPENCLAW_MAX_RETRY", 3, min_v=0, max_v=20),
        # The retry loops pass base_delay * 2**attempt to time.sleep(), which
        # raises ValueError for negative values, so never let it go below 0.
        retry_base_delay=max(0.0, _env_float("OPENCLAW_RETRY_BASE_DELAY", 1.0)),
        retry_budget_per_minute=_env_int("OPENCLAW_RETRY_BUDGET_PER_MINUTE", 300, min_v=0, max_v=100_000),
        enable_fallback=_env_bool("OPENCLAW_ENABLE_FALLBACK", True),
        fallback_cache_on_io=_env_bool("OPENCLAW_FALLBACK_CACHE_ON_IO", True),
        error_log_level=_env_str("OPENCLAW_ERROR_LOG_LEVEL", "INFO").upper(),
        sanitize_api_errors=_env_bool("OPENCLAW_API_ERROR_SANITIZE", True),
        json_strict=_env_bool("OPENCLAW_JSON_STRICT", True),
        auto_repair_json=_env_bool("OPENCLAW_AUTO_REPAIR_JSON", True),
        auto_repair_write_back=_env_bool("OPENCLAW_AUTO_REPAIR_WB", False),
        repair_backup_path=os.environ.get("OPENCLAW_REPAIR_BACKUP") or None,
        max_repair_attempts=_env_int("OPENCLAW_MAX_REPAIR_ATTEMPTS", 3, min_v=1, max_v=10),
        watcher_max_retries=_env_int("OPENCLAW_WATCHER_MAX_RETRIES", 3, min_v=1, max_v=10),
        watcher_poll_interval_sec=_env_float("OPENCLAW_WATCHER_POLL_INTERVAL", 5.0),
        watcher_failure_window_sec=_env_float("OPENCLAW_WATCHER_FAILURE_WINDOW", 30.0),
        # NFR-S-003: Logging storage security
        log_retention_days=_env_int("OPENCLAW_LOG_RETENTION_DAYS", 30, min_v=1, max_v=365),
        log_max_size_mb=_env_int("OPENCLAW_LOG_MAX_SIZE_MB", 100, min_v=1, max_v=1024),
        log_backup_count=_env_int("OPENCLAW_LOG_BACKUP_COUNT", 5, min_v=1, max_v=50),
        log_file_path=os.environ.get("OPENCLAW_LOG_FILE_PATH") or None,
        log_compression=_env_bool("OPENCLAW_LOG_COMPRESSION", True),
    )
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def refresh_fortify_config_cache() -> FortifyConfig:
    """Drop the memoized config and return one rebuilt from the current environment."""
    get_fortify_config.cache_clear()
    rebuilt = get_fortify_config()
    return rebuilt
|
|
@@ -0,0 +1,488 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Unified error handling: classification, exponential backoff retry, in-process stats, structured logging.
|
|
3
|
+
|
|
4
|
+
降级策略集中注册见 core.fallback_manager(REQ_003-SPEC-04);IO 失败读缓存见 status_cache.get_stale_fallback(REQ_003-AC-003)。
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
import threading
|
|
10
|
+
import time
|
|
11
|
+
import functools
|
|
12
|
+
from collections import defaultdict, deque
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from datetime import datetime, timezone
|
|
15
|
+
from typing import Any, Callable, Dict, List, Optional, Tuple, Type
|
|
16
|
+
|
|
17
|
+
from core.config_fortify import get_fortify_config
|
|
18
|
+
|
|
19
|
+
_LOG = logging.getLogger("openclaw.fortify")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _ensure_fortify_logging() -> None:
    """Configure the "openclaw.fortify" logger once per process.

    Sets the logger level from the configured error_log_level and, if the
    logger has no handlers yet, attaches a console handler -- unless
    core.logging_config reports a log file path, in which case no console
    handler is added. Later calls return immediately.
    """
    if getattr(_ensure_fortify_logging, "_done", False):
        return
    cfg = get_fortify_config()
    level = getattr(logging, cfg.error_log_level, None)
    # The level name comes from the environment. getattr() can resolve to a
    # non-level attribute of the logging module (e.g. BASIC_FORMAT is a str),
    # which would make setLevel() raise; accept genuine int levels only.
    if isinstance(level, bool) or not isinstance(level, int):
        level = logging.INFO
    _LOG.setLevel(level)
    if not _LOG.handlers:
        # Try to use secure file-based logging if configured
        try:
            from core.logging_config import setup_secure_logging, get_log_file_path  # noqa: F401

            log_path = get_log_file_path()
            if log_path is not None:
                # Secure logging is configured, skip console-only handler.
                # NOTE(review): setup_secure_logging() is not called here; this
                # presumes it has already attached the file handlers -- confirm.
                _ensure_fortify_logging._done = True  # type: ignore[attr-defined]
                return
        except ImportError:
            pass  # Fall back to console handler

        h = logging.StreamHandler()
        h.setFormatter(
            logging.Formatter(
                fmt="%(asctime)s | %(levelname)s | fortify | %(message)s",
                datefmt="%Y-%m-%dT%H:%M:%S%z",
            )
        )
        _LOG.addHandler(h)
    _ensure_fortify_logging._done = True  # type: ignore[attr-defined]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass
class ErrorHandlerStats:
    """In-process error counters, updated by record_error()."""

    # Number of errors recorded.
    total_count: int = 0
    # Error count keyed by error type / by affected scope.
    by_type: Dict[str, int] = field(default_factory=dict)
    by_scope: Dict[str, int] = field(default_factory=dict)
    # Rows of {"hour": "YYYY-MM-DD HH:00", "count": n}.
    hourly_trend: List[Dict[str, Any]] = field(default_factory=list)
    # Details and ISO timestamp of the most recently recorded error.
    last_error: Optional[Dict[str, Any]] = None
    last_update_iso: Optional[str] = None
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
# Error statistics shared by record_error() and get_framework_error_stats().
_stats_lock = threading.Lock()
_stats = ErrorHandlerStats()
# Per-operation counter incremented by record_retry().
_retry_totals: Dict[str, int] = defaultdict(int)

# Retry budget: per-operation timestamps within the sliding window, plus the
# number of retries refused because the budget was exhausted.
_retry_budget_lock = threading.Lock()
_retry_budget_deques: Dict[str, deque] = {}
_retry_budget_blocks = 0

# NFR-R: reliability metrics, guarded by _reliability_lock.
_reliability_lock = threading.Lock()
# Error recovery tracking: the last 100 recovery durations, in seconds.
_error_recovery_times: deque = deque(maxlen=100)
_last_error_timestamp: Optional[float] = None
_last_recovery_timestamp: Optional[float] = None

# Graceful degradation tracking
_fallback_total_attempts = 0
_fallback_success_count = 0

# Watcher availability tracking
_watcher_uptime_start: Optional[float] = None
_watcher_total_uptime_seconds = 0.0
_watcher_total_downtime_seconds = 0.0
_watchdog_last_failure_time: Optional[float] = None
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def record_fallback_attempt(success: bool) -> None:
    """Count one graceful-degradation attempt and whether it succeeded (NFR-R-005)."""
    global _fallback_total_attempts, _fallback_success_count
    with _reliability_lock:
        _fallback_total_attempts = _fallback_total_attempts + 1
        if not success:
            return
        _fallback_success_count = _fallback_success_count + 1
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def record_error_recovery(duration_seconds: float) -> None:
    """Append one recovery duration, in seconds, to the bounded history (NFR-R-003).

    The history is a deque created with maxlen=100, so the oldest sample is
    dropped once it is full.
    """
    with _reliability_lock:
        _error_recovery_times.append(duration_seconds)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def record_watcher_failure() -> None:
    """Mark the start of a watcher outage and close the running uptime span.

    The failure timestamp is set only if no outage is already open. Any open
    uptime span is added to the accumulated uptime and then cleared.
    """
    global _watchdog_last_failure_time, _watcher_uptime_start, _watcher_total_uptime_seconds
    stamp = time.time()
    with _reliability_lock:
        if _watchdog_last_failure_time is None:
            _watchdog_last_failure_time = stamp
        running_since = _watcher_uptime_start
        if running_since is not None:
            _watcher_total_uptime_seconds = _watcher_total_uptime_seconds + (stamp - running_since)
            _watcher_uptime_start = None
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def record_watcher_recovery() -> None:
    """Mark the watcher as recovered and record how long the outage lasted (NFR-R-003).

    If an outage is open, its duration is added to the accumulated downtime
    and the outage is closed. A new uptime span is started at the current
    time. A positive outage duration is also stored as a recovery-time sample.
    """
    global _watchdog_last_failure_time, _watcher_uptime_start, _watcher_total_downtime_seconds
    now = time.time()
    recovery_time = 0.0
    with _reliability_lock:
        if _watchdog_last_failure_time is not None:
            recovery_time = now - _watchdog_last_failure_time
            _watcher_total_downtime_seconds += recovery_time
            _watchdog_last_failure_time = None
        # NOTE(review): the uptime span is restarted on every call, including
        # calls with no preceding failure -- confirm that this is intended.
        _watcher_uptime_start = now
    # Record after releasing the lock: record_error_recovery() acquires
    # _reliability_lock itself, and the lock is not reentrant. Going through
    # the helper keeps every write to the sample history under that lock.
    if recovery_time > 0:
        record_error_recovery(recovery_time)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def get_reliability_metrics() -> Dict[str, Any]:
    """Return a snapshot of the reliability metrics (NFR-R-002/003/005).

    Covers watcher uptime/downtime and availability, mean and p95 of the
    stored recovery durations, and the graceful-degradation success rate.
    Both rates default to 1.0 when there is nothing to measure yet.
    """
    import statistics as _statistics

    with _reliability_lock:
        now = time.time()
        # An uptime span still in progress counts up to "now".
        live_span = now - _watcher_uptime_start if _watcher_uptime_start is not None else 0.0
        uptime = _watcher_total_uptime_seconds + live_span
        downtime = _watcher_total_downtime_seconds
        observed = uptime + downtime

        # NFR-R-002: watcher availability
        availability = uptime / observed if observed > 0 else 1.0

        # NFR-R-003: error recovery time
        samples = list(_error_recovery_times)
        if samples:
            mean_recovery = _statistics.mean(samples)
            ordered = sorted(samples)
            # Index at 95% of the sample count, clamped to the last element.
            p95_recovery = ordered[min(int(len(ordered) * 0.95), len(ordered) - 1)]
        else:
            mean_recovery = 0.0
            p95_recovery = 0.0

        # NFR-R-005: graceful degradation rate
        attempts = _fallback_total_attempts
        successes = _fallback_success_count
        degradation = successes / attempts if attempts > 0 else 1.0

        return {
            # NFR-R-002: Watcher availability
            "watcher_uptime_seconds": uptime,
            "watcher_downtime_seconds": downtime,
            "watcher_availability_rate": round(availability, 4),
            "watcher_uptime_percentage": round(availability * 100, 2),
            # NFR-R-003: Error recovery time
            "avg_error_recovery_seconds": round(mean_recovery, 3),
            "p95_error_recovery_seconds": round(p95_recovery, 3),
            "error_recovery_count": len(samples),
            "last_error_recovery_time": samples[-1] if samples else None,
            # NFR-R-005: Graceful degradation
            "graceful_degradation_attempts": attempts,
            "graceful_degradation_successes": successes,
            "graceful_degradation_rate": round(degradation, 4),
            "graceful_degradation_percentage": round(degradation * 100, 2),
        }
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def reset_reliability_metrics_for_tests() -> None:
    """Restore every reliability counter and timestamp to its initial value."""
    global _error_recovery_times, _last_error_timestamp, _last_recovery_timestamp
    global _fallback_total_attempts, _fallback_success_count
    global _watcher_uptime_start, _watcher_total_uptime_seconds, _watcher_total_downtime_seconds
    global _watchdog_last_failure_time
    with _reliability_lock:
        # The history deque is emptied in place rather than replaced.
        _error_recovery_times.clear()
        _last_error_timestamp = _last_recovery_timestamp = None
        _fallback_total_attempts = _fallback_success_count = 0
        _watcher_uptime_start = _watchdog_last_failure_time = None
        _watcher_total_uptime_seconds = _watcher_total_downtime_seconds = 0.0
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _consume_retry_budget(operation: str) -> bool:
    """Try to spend one retry from *operation*'s 60-second sliding-window budget.

    Caps the number of backoff retries per operation within the window to
    mitigate retry storms (RISK-005). OPENCLAW_RETRY_BUDGET_PER_MINUTE=0
    means unlimited.

    Returns True if the retry may proceed. Returns False, and increments the
    blocked-retry counter, when the budget is exhausted.
    """
    global _retry_budget_blocks
    per_minute = get_fortify_config().retry_budget_per_minute
    if per_minute <= 0:
        return True
    # Blank or missing operation names share the "default" bucket.
    bucket = (operation or "default").strip() or "default"
    stamp = time.monotonic()
    with _retry_budget_lock:
        window = _retry_budget_deques.get(bucket)
        if window is None:
            window = _retry_budget_deques[bucket] = deque()
        # Drop timestamps that have aged out of the 60-second window.
        while window and stamp - window[0] > 60.0:
            window.popleft()
        if len(window) < per_minute:
            window.append(stamp)
            return True
        _retry_budget_blocks += 1
        return False
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def classify_exception(exc: BaseException) -> str:
    """Map an exception to a PRD-style error category.

    Returns one of "timeout", "permission-error", "io-error", "network",
    "parsing-error", "compute-error", "validation-error" or "unknown".
    Checks run from the most specific type to the most general, because
    several of these exception types subclass OSError or ValueError.
    """
    import json as _json

    if isinstance(exc, TimeoutError):
        return "timeout"
    if isinstance(exc, PermissionError):
        return "permission-error"
    if isinstance(exc, FileNotFoundError):
        return "io-error"
    # ssl.SSLError subclasses OSError, so it must be tested before the generic
    # OSError branch; placed after it, this check could never be reached.
    try:
        import ssl
    except ImportError:
        pass
    else:
        if isinstance(exc, ssl.SSLError):
            return "network"
    # Covers BrokenPipeError, ConnectionResetError, ConnectionAbortedError and
    # ConnectionRefusedError regardless of what the message says.
    if isinstance(exc, ConnectionError):
        return "network"
    if isinstance(exc, OSError):
        # Other OS errors are told apart only by their message text.
        msg = str(exc).lower()
        if "network" in msg or "connection" in msg or "broken pipe" in msg:
            return "network"
        return "io-error"
    # JSONDecodeError and UnicodeDecodeError subclass ValueError, so they are
    # tested before the validation branch.
    if isinstance(exc, (_json.JSONDecodeError, UnicodeDecodeError)):
        return "parsing-error"
    if isinstance(exc, (MemoryError, RecursionError)):
        return "compute-error"
    if isinstance(exc, (KeyError, TypeError, AttributeError, ValueError)):
        return "validation-error"
    return "unknown"
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
class ErrorHandler:
    """Per-use-case handler; global stats still aggregate via record_error."""

    def __init__(
        self,
        max_retry: Optional[int] = None,
        base_delay: Optional[float] = None,
        enable_fallback: Optional[bool] = None,
    ):
        """Store retry settings; any argument left as None comes from the fortify config."""
        _ensure_fortify_logging()
        defaults = get_fortify_config()
        self.max_retry = max_retry if max_retry is not None else defaults.max_retry
        self.base_delay = base_delay if base_delay is not None else defaults.retry_base_delay
        self.enable_fallback = enable_fallback if enable_fallback is not None else defaults.enable_fallback

    def log_error(
        self,
        error_type: str,
        error_detail: str,
        affected_scope: str = "",
        exc: Optional[BaseException] = None,
    ) -> None:
        """Forward to the module-level record_error()."""
        record_error(error_type, error_detail, affected_scope, exc)

    def get_stats(self) -> Dict[str, Any]:
        """Return the process-wide error statistics."""
        return get_framework_error_stats()

    def _fallback_or_raise(
        self,
        fallback: Optional[Callable[[], Any]],
        failure: Optional[BaseException],
        operation: str,
    ) -> Any:
        """Return fallback() if fallbacks are enabled and one was given; otherwise raise."""
        if self.enable_fallback and fallback is not None:
            return fallback()
        if failure:
            raise failure
        raise RuntimeError(operation)

    def run_with_retry(
        self,
        fn: Callable[[], Any],
        *,
        operation: str = "operation",
        error_type: str = "unknown",
        fallback: Optional[Callable[[], Any]] = None,
        retryable: Optional[Tuple[Type[BaseException], ...]] = None,
    ) -> Any:
        """Call *fn*, retrying with exponential backoff on retryable errors.

        Makes at most max_retry + 1 attempts (at least one), sleeping
        base_delay * 2**attempt between them. Exceptions outside *retryable*
        (by default OSError, IOError, TimeoutError, ConnectionError) propagate
        immediately. If the retry budget for *operation* is exhausted, or all
        attempts fail, the error is recorded and the result of *fallback* is
        returned when fallbacks are enabled and one was given; otherwise the
        last exception is re-raised.
        """
        if retryable is None:
            retryable = (OSError, IOError, TimeoutError, ConnectionError)
        total = max(1, self.max_retry + 1)
        failure: Optional[BaseException] = None
        for index in range(total):
            try:
                return fn()
            except retryable as err:
                failure = err
                if index + 1 >= total:
                    break
                if not _consume_retry_budget(operation):
                    record_error(
                        "compute-error",
                        f"retry budget exceeded (60s window) op={operation}",
                        f"retry_budget:{operation}",
                    )
                    return self._fallback_or_raise(fallback, failure, operation)
                pause = self.base_delay * (2**index)
                _LOG.warning(
                    "retry operation=%s attempt=%s/%s delay=%.2fs err=%s",
                    operation,
                    index + 1,
                    total,
                    pause,
                    err,
                )
                time.sleep(pause)
        # Every attempt failed with a retryable error.
        record_error(error_type, str(failure) if failure else "failed", operation, failure)
        return self._fallback_or_raise(fallback, failure, operation)
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def record_error(
    error_type: str,
    error_detail: str,
    affected_scope: str = "",
    exc: Optional[BaseException] = None,
) -> None:
    """Log one error event and fold it into the in-process statistics.

    Args:
        error_type: Category label. When it is "" or "unknown" and *exc* is
            given, the category is derived from the exception instead.
        error_detail: Free-text detail, truncated to 2000 characters.
        affected_scope: Optional scope label counted in the per-scope totals.
        exc: The exception behind the error, if any. Its type name, module
            and traceback are included in the log record.
    """
    _ensure_fortify_logging()
    if exc is not None:
        error_type = classify_exception(exc) if error_type in ("", "unknown") else error_type
    detail = (error_detail or "")[:2000]
    scope = affected_scope or ""
    exc_type_name = type(exc).__name__ if exc is not None else ""
    exc_module = type(exc).__module__ if exc is not None else ""
    _LOG.error(
        "fortify_event error_type=%s scope=%s exc_type=%s exc_module=%s detail=%s",
        error_type,
        scope,
        exc_type_name,
        exc_module,
        detail,
        # Pass the exception object itself: exc_info=True reads
        # sys.exc_info(), which is empty when this function is called
        # outside an except block (for example after a retry loop).
        exc_info=exc if exc is not None else False,
    )
    now = datetime.now(timezone.utc)
    hour_key = now.strftime("%Y-%m-%d %H:00")
    with _stats_lock:
        _stats.total_count += 1
        _stats.by_type[error_type] = _stats.by_type.get(error_type, 0) + 1
        if scope:
            _stats.by_scope[scope] = _stats.by_scope.get(scope, 0) + 1
        _stats.last_error = {
            "type": error_type,
            "detail": detail,
            "scope": scope,
            "time": now.isoformat(),
            "exc_type": exc_type_name or None,
            "exc_module": exc_module or None,
        }
        _stats.last_update_iso = now.isoformat()
        # Rolling hourly buckets: bump the bucket for this hour if it exists,
        # otherwise append one and keep only the 24 most recent.
        for row in _stats.hourly_trend:
            if row.get("hour") == hour_key:
                row["count"] = row.get("count", 0) + 1
                break
        else:
            _stats.hourly_trend.append({"hour": hour_key, "count": 1})
            if len(_stats.hourly_trend) > 24:
                _stats.hourly_trend = _stats.hourly_trend[-24:]
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
def record_retry(operation: str) -> None:
    """Add one to the retry counter kept for *operation*."""
    _retry_totals[operation] = _retry_totals[operation] + 1
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def get_framework_error_stats_for_client() -> Dict[str, Any]:
    """Return the error statistics for an HTTP response.

    The raw statistics are passed through
    redact_framework_stats_for_client(), which handles last_error when
    sanitizing is enabled.
    """
    from core.safe_api_error import redact_framework_stats_for_client

    raw_stats = get_framework_error_stats()
    return redact_framework_stats_for_client(raw_stats)
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def get_framework_error_stats() -> Dict[str, Any]:
    """Return a snapshot of the error statistics, shaped for the dashboard.

    Includes per-type counts with a display label and colour, per-scope
    counts (an "agent_id:" prefix is stripped from the key), the 50 busiest
    scopes, the hourly trend, retry counters and the reliability metrics.
    """
    # Display label and colour per error type; types not listed here use
    # their own name and a grey colour.
    display = {
        "network": ("网络错误", "#f59e0b"),
        "timeout": ("超时错误", "#ef4444"),
        "parsing": ("解析错误", "#8b5cf6"),
        "parsing-error": ("解析错误", "#8b5cf6"),
        "io-error": ("IO 错误", "#64748b"),
        "permission-error": ("权限错误", "#b45309"),
        "compute-error": ("计算错误", "#dc2626"),
        "validation-error": ("校验错误", "#ca8a04"),
        "unknown": ("未知错误", "#6b7280"),
    }
    with _stats_lock:
        by_type_out: Dict[str, Any] = {}
        for kind, n in _stats.by_type.items():
            label, color = display.get(kind, (kind, "#6b7280"))
            by_type_out[kind] = {"count": n, "label": label, "color": color}

        by_agent: Dict[str, Any] = {}
        for scope, n in _stats.by_scope.items():
            agent = scope.split(":", 1)[-1] if scope.startswith("agent_id:") else scope
            by_agent[agent] = {"count": n, "agentId": agent}

        sum_by_type = sum(
            entry.get("count", 0) if isinstance(entry, dict) else 0
            for entry in by_type_out.values()
        )
        ranked = sorted(_stats.by_scope.items(), key=lambda kv: -kv[1])[:50]
        top_scopes = [{"scope": name, "count": n} for name, n in ranked]

        # NFR-R reliability metrics
        reliability = get_reliability_metrics()

        return {
            "total_count": _stats.total_count,
            "by_type": by_type_out,
            "by_agent": by_agent,
            "by_scope_top": top_scopes,
            "sum_by_type": sum_by_type,
            "totals_consistent": sum_by_type == _stats.total_count,
            "hourly_trend": list(_stats.hourly_trend),
            "last_update": _stats.last_update_iso,
            "last_error": _stats.last_error,
            "retry_by_operation": dict(_retry_totals),
            "retry_budget_blocks": _retry_budget_blocks,
            # NFR-R Reliability
            "reliability": reliability,
        }
|
|
450
|
+
|
|
451
|
+
|
|
452
|
+
def execute_with_retry(
    max_attempts: int = 3,
    delay_base: float = 1.0,
    exceptions: Tuple[Type[BaseException], ...] = (Exception,),
) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
    """Decorator factory: retry the wrapped function with exponential backoff.

    Args:
        max_attempts: Total number of calls to make; values below 1 are
            treated as 1 so the function always runs at least once. None
            means the configured max_retry + 1.
        delay_base: Base delay in seconds; the sleep before retry k is
            delay_base * 2**k. None means the configured retry_base_delay.
        exceptions: Exception types that trigger a retry; anything else
            propagates immediately.

    The wrapped function re-raises the last exception once the attempts, or
    the per-function retry budget, are exhausted.
    """

    def deco(fn: Callable[..., Any]) -> Callable[..., Any]:
        @functools.wraps(fn)
        def wrapped(*args: Any, **kwargs: Any) -> Any:
            # Consult the config only for values the caller left unset.
            attempts = max_attempts if max_attempts is not None else get_fortify_config().max_retry + 1
            # Guard against 0 or negative: range() would be empty and the
            # function would fail without ever having been called.
            attempts = max(1, attempts)
            base = delay_base if delay_base is not None else get_fortify_config().retry_base_delay
            last: Optional[BaseException] = None
            for attempt in range(attempts):
                try:
                    return fn(*args, **kwargs)
                except exceptions as e:
                    last = e
                    # Counted for every failed attempt, including the last.
                    record_retry(fn.__name__)
                    if attempt + 1 >= attempts:
                        break
                    if not _consume_retry_budget(fn.__name__):
                        record_error(
                            "compute-error",
                            f"retry budget exceeded (60s window) op={fn.__name__}",
                            f"retry_budget:{fn.__name__}",
                        )
                        raise last
                    time.sleep(base * (2**attempt))
            if last:
                raise last
            raise RuntimeError(fn.__name__)

        return wrapped

    return deco
|