openclaw-agent-dashboard 1.0.40 → 1.0.42

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -295,3 +295,13 @@ async def get_errors_summary():
295
295
  "apiStatus": api_status,
296
296
  "stats": stats,
297
297
  }
298
+
299
+
300
@router.get("/errors/reliability")
async def get_reliability_stats():
    """
    Expose the NFR-R reliability indicators.

    Covers the watcher success rate (NFR-R-002), the error recovery time
    (NFR-R-003) and the graceful degradation rate (NFR-R-005).
    """
    # Resolved at request time rather than when this module is imported.
    from core.error_handler import get_reliability_metrics

    metrics = get_reliability_metrics()
    return metrics
@@ -1,4 +1,4 @@
1
- """TECHDEBT_FORTIFY: health, cache stats, data validation endpoints."""
1
+ """TECHDEBT_FORTIFY: health, cache stats, data validation, logging endpoints."""
2
2
  from __future__ import annotations
3
3
 
4
4
  import sys
@@ -78,3 +78,31 @@ async def validate_session_data(
78
78
  except Exception as e:
79
79
  record_error("unknown", str(e), "api:fortify:validate", exc=e)
80
80
  raise HTTPException(status_code=500, detail=safe_api_error_detail(e)) from e
81
+
82
+
83
@router.get("/logging/config")
async def logging_config() -> Any:
    """
    NFR-S-003: report the logging configuration and status.

    Returns a ``{"status": "ok", "config": {...}}`` payload for diagnostics
    and monitoring. If an ImportError is raised while loading or querying
    ``core.logging_config``, fixed default values are reported instead,
    together with an explanatory ``note``.
    """
    try:
        from core.logging_config import get_logging_config_summary

        payload = {
            "status": "ok",
            "config": get_logging_config_summary(),
        }
    except ImportError:
        # The enhanced logging module is unavailable: report static defaults.
        default_config = {
            "log_retention_days": 30,
            "log_max_size_mb": 100,
            "log_backup_count": 5,
            "log_file_path": None,
            "log_compression": True,
        }
        payload = {
            "status": "ok",
            "config": default_config,
            "note": "Enhanced logging not configured",
        }
    return payload
@@ -78,6 +78,13 @@ class FortifyConfig:
78
78
  watcher_poll_interval_sec: float
79
79
  watcher_failure_window_sec: float
80
80
 
81
+ # NFR-S-003: Logging storage security
82
+ log_retention_days: int
83
+ log_max_size_mb: int
84
+ log_backup_count: int
85
+ log_file_path: str | None
86
+ log_compression: bool
87
+
81
88
 
82
89
  @lru_cache(maxsize=1)
83
90
  def get_fortify_config() -> FortifyConfig:
@@ -104,6 +111,12 @@ def get_fortify_config() -> FortifyConfig:
104
111
  watcher_max_retries=_env_int("OPENCLAW_WATCHER_MAX_RETRIES", 3, min_v=1, max_v=10),
105
112
  watcher_poll_interval_sec=_env_float("OPENCLAW_WATCHER_POLL_INTERVAL", 5.0),
106
113
  watcher_failure_window_sec=_env_float("OPENCLAW_WATCHER_FAILURE_WINDOW", 30.0),
114
+ # NFR-S-003: Logging storage security
115
+ log_retention_days=_env_int("OPENCLAW_LOG_RETENTION_DAYS", 30, min_v=1, max_v=365),
116
+ log_max_size_mb=_env_int("OPENCLAW_LOG_MAX_SIZE_MB", 100, min_v=1, max_v=1024),
117
+ log_backup_count=_env_int("OPENCLAW_LOG_BACKUP_COUNT", 5, min_v=1, max_v=50),
118
+ log_file_path=os.environ.get("OPENCLAW_LOG_FILE_PATH") or None,
119
+ log_compression=_env_bool("OPENCLAW_LOG_COMPRESSION", True),
107
120
  )
108
121
 
109
122
 
@@ -26,6 +26,18 @@ def _ensure_fortify_logging() -> None:
26
26
  level = getattr(logging, cfg.error_log_level, logging.INFO)
27
27
  _LOG.setLevel(level)
28
28
  if not _LOG.handlers:
29
+ # Try to use secure file-based logging if configured
30
+ try:
31
+ from core.logging_config import setup_secure_logging, get_log_file_path
32
+ log_path = get_log_file_path()
33
+ if log_path is not None:
34
+ # Secure logging is configured, skip console-only handler
35
+ # setup_secure_logging() already added file handlers
36
+ _ensure_fortify_logging._done = True # type: ignore[attr-defined]
37
+ return
38
+ except ImportError:
39
+ pass # Fall back to console handler
40
+
29
41
  h = logging.StreamHandler()
30
42
  h.setFormatter(
31
43
  logging.Formatter(
@@ -55,6 +67,137 @@ _retry_budget_lock = threading.Lock()
55
67
  _retry_budget_deques: Dict[str, deque] = {}
56
68
  _retry_budget_blocks = 0
57
69
 
70
+ # NFR-R: Reliability metrics
71
+ _reliability_lock = threading.Lock()
72
+ # Error recovery tracking
73
+ _error_recovery_times: deque = deque(maxlen=100) # last 100 recovery times in seconds
74
+ _last_error_timestamp: Optional[float] = None
75
+ _last_recovery_timestamp: Optional[float] = None
76
+
77
+ # Graceful degradation tracking
78
+ _fallback_total_attempts = 0
79
+ _fallback_success_count = 0
80
+
81
+ # Watcher availability tracking
82
+ _watcher_uptime_start: Optional[float] = None
83
+ _watcher_total_uptime_seconds = 0.0
84
+ _watcher_total_downtime_seconds = 0.0
85
+ _watchdog_last_failure_time: Optional[float] = None
86
+
87
+
88
def record_fallback_attempt(success: bool) -> None:
    """Count one graceful-degradation attempt and whether it succeeded (NFR-R-005)."""
    global _fallback_total_attempts, _fallback_success_count
    with _reliability_lock:
        _fallback_total_attempts = _fallback_total_attempts + 1
        # Only truthy outcomes add to the success counter.
        _fallback_success_count = _fallback_success_count + (1 if success else 0)
95
+
96
+
97
def record_error_recovery(duration_seconds: float) -> None:
    """Append one recovery duration, in seconds, to the recovery history (NFR-R-003)."""
    _reliability_lock.acquire()
    try:
        _error_recovery_times.append(duration_seconds)
    finally:
        _reliability_lock.release()
101
+
102
+
103
def record_watcher_failure() -> None:
    """Note the start of a watcher outage and close the running uptime interval."""
    global _watchdog_last_failure_time, _watcher_uptime_start, _watcher_total_uptime_seconds
    failed_at = time.time()
    with _reliability_lock:
        # Keep the timestamp of the first failure of an outage; later
        # failures in the same outage do not move it.
        if _watchdog_last_failure_time is None:
            _watchdog_last_failure_time = failed_at
        started_at = _watcher_uptime_start
        if started_at is not None:
            # Bank the uptime accumulated so far and stop the uptime clock.
            _watcher_total_uptime_seconds = _watcher_total_uptime_seconds + (failed_at - started_at)
            _watcher_uptime_start = None
113
+
114
+
115
def record_watcher_recovery() -> None:
    """
    Mark the watcher as recovered and record the recovery time (NFR-R-003).

    When a failure is pending, the time elapsed since it started is added to
    the accumulated downtime, appended to the recovery history (positive
    durations only) and the pending failure is cleared.

    The uptime clock is started only if it is not already running, so a
    recovery notification that arrives while the watcher is already up does
    not discard the uptime accumulated so far.
    """
    global _watchdog_last_failure_time, _watcher_uptime_start, _watcher_total_downtime_seconds
    now = time.time()
    with _reliability_lock:
        if _watchdog_last_failure_time is not None:
            recovery_time = now - _watchdog_last_failure_time
            _watcher_total_downtime_seconds += recovery_time
            _watchdog_last_failure_time = None
            # The history is mutated under the same lock that the metrics
            # reader and the test reset take. No other function is called
            # while the lock is held, so the non-reentrant lock cannot
            # deadlock here.
            if recovery_time > 0:
                _error_recovery_times.append(recovery_time)
        if _watcher_uptime_start is None:
            _watcher_uptime_start = now
129
+
130
+
131
def get_reliability_metrics() -> Dict[str, Any]:
    """
    Build a snapshot of the reliability metrics for NFR-R-002/003/005.

    The snapshot is computed while holding ``_reliability_lock``. Rates
    default to 1.0 when nothing has been observed yet, and the recovery
    statistics default to 0.0 when the recovery history is empty.
    """
    import statistics as _statistics

    with _reliability_lock:
        now = time.time()
        running_since = _watcher_uptime_start
        # Uptime of the interval that is still open, if any.
        live_uptime = (now - running_since) if running_since is not None else 0.0

        uptime_total = _watcher_total_uptime_seconds + live_uptime
        downtime_total = _watcher_total_downtime_seconds
        observed_total = uptime_total + downtime_total

        # NFR-R-002: share of observed time during which the watcher was up.
        availability = uptime_total / observed_total if observed_total > 0 else 1.0

        # NFR-R-003: mean and 95th-percentile recovery time.
        samples = list(_error_recovery_times)
        mean_recovery = 0.0
        p95_recovery = 0.0
        if samples:
            mean_recovery = _statistics.mean(samples)
            ordered = sorted(samples)
            last_index = len(ordered) - 1
            p95_recovery = ordered[min(int(len(ordered) * 0.95), last_index)]

        # NFR-R-005: share of fallback attempts that succeeded.
        attempts = _fallback_total_attempts
        successes = _fallback_success_count
        degradation = successes / attempts if attempts > 0 else 1.0

        return {
            # NFR-R-002: Watcher availability
            "watcher_uptime_seconds": uptime_total,
            "watcher_downtime_seconds": downtime_total,
            "watcher_availability_rate": round(availability, 4),
            "watcher_uptime_percentage": round(availability * 100, 2),
            # NFR-R-003: Error recovery time
            "avg_error_recovery_seconds": round(mean_recovery, 3),
            "p95_error_recovery_seconds": round(p95_recovery, 3),
            "error_recovery_count": len(samples),
            "last_error_recovery_time": samples[-1] if samples else None,
            # NFR-R-005: Graceful degradation
            "graceful_degradation_attempts": attempts,
            "graceful_degradation_successes": successes,
            "graceful_degradation_rate": round(degradation, 4),
            "graceful_degradation_percentage": round(degradation * 100, 2),
        }
182
+
183
+
184
def reset_reliability_metrics_for_tests() -> None:
    """Restore every reliability counter and timestamp to its initial value (test helper)."""
    global _last_error_timestamp, _last_recovery_timestamp
    global _fallback_total_attempts, _fallback_success_count
    global _watcher_uptime_start, _watcher_total_uptime_seconds, _watcher_total_downtime_seconds
    global _watchdog_last_failure_time
    with _reliability_lock:
        # The history deque is emptied in place rather than rebound.
        _error_recovery_times.clear()
        _last_error_timestamp = _last_recovery_timestamp = None
        _fallback_total_attempts = _fallback_success_count = 0
        _watcher_uptime_start = _watchdog_last_failure_time = None
        _watcher_total_uptime_seconds = _watcher_total_downtime_seconds = 0.0
200
+
58
201
 
59
202
  def _consume_retry_budget(operation: str) -> bool:
60
203
  """
@@ -285,19 +428,25 @@ def get_framework_error_stats() -> Dict[str, Any]:
285
428
  {"scope": k, "count": v}
286
429
  for k, v in sorted(_stats.by_scope.items(), key=lambda kv: -kv[1])[:50]
287
430
  ]
288
- return {
289
- "total_count": _stats.total_count,
290
- "by_type": by_type_out,
291
- "by_agent": by_agent,
292
- "by_scope_top": top_scopes,
293
- "sum_by_type": sum_by_type,
294
- "totals_consistent": sum_by_type == _stats.total_count,
295
- "hourly_trend": list(_stats.hourly_trend),
296
- "last_update": _stats.last_update_iso,
297
- "last_error": _stats.last_error,
298
- "retry_by_operation": dict(_retry_totals),
299
- "retry_budget_blocks": _retry_budget_blocks,
300
- }
431
+
432
+ # NFR-R reliability metrics
433
+ reliability = get_reliability_metrics()
434
+
435
+ return {
436
+ "total_count": _stats.total_count,
437
+ "by_type": by_type_out,
438
+ "by_agent": by_agent,
439
+ "by_scope_top": top_scopes,
440
+ "sum_by_type": sum_by_type,
441
+ "totals_consistent": sum_by_type == _stats.total_count,
442
+ "hourly_trend": list(_stats.hourly_trend),
443
+ "last_update": _stats.last_update_iso,
444
+ "last_error": _stats.last_error,
445
+ "retry_by_operation": dict(_retry_totals),
446
+ "retry_budget_blocks": _retry_budget_blocks,
447
+ # NFR-R Reliability
448
+ "reliability": reliability,
449
+ }
301
450
 
302
451
 
303
452
  def execute_with_retry(
@@ -28,7 +28,18 @@ def run_fallback(error_category: str, *, agent_id: Optional[str] = None, **kwarg
28
28
  h = _handlers.get(error_category)
29
29
  if h is None:
30
30
  return None
31
- return h(agent_id=agent_id, **kwargs)
31
+ # NFR-R-005: Record fallback attempt (success if returns non-None)
32
+ try:
33
+ result = h(agent_id=agent_id, **kwargs)
34
+ from core.error_handler import record_fallback_attempt
35
+
36
+ record_fallback_attempt(success=result is not None)
37
+ return result
38
+ except Exception:
39
+ from core.error_handler import record_fallback_attempt
40
+
41
+ record_fallback_attempt(success=False)
42
+ raise
32
43
 
33
44
 
34
45
  def _stale_agent_status_handler(agent_id: Optional[str] = None, **_: Any) -> Optional[str]:
@@ -0,0 +1,217 @@
1
+ """
2
+ NFR-S-003: Logging storage security configuration.
3
+
4
+ Provides secure logging setup with:
5
+ - File rotation (size-based)
6
+ - Compression of rotated files
7
+ - Automatic cleanup of old logs based on retention policy
8
+ - File permission hardening
9
+
10
+ Usage:
11
+ from core.logging_config import setup_secure_logging
12
+ setup_secure_logging()
13
+
14
+ Configuration via environment variables:
15
+ OPENCLAW_LOG_RETENTION_DAYS: Days to retain log files (default: 30)
16
+ OPENCLAW_LOG_MAX_SIZE_MB: Max size per log file in MB (default: 100)
17
+ OPENCLAW_LOG_BACKUP_COUNT: Number of backup files to keep (default: 5)
18
+ OPENCLAW_LOG_FILE_PATH: Custom log file path (optional)
19
+ OPENCLAW_LOG_COMPRESSION: Compress rotated logs (default: true)
20
+ """
21
+ from __future__ import annotations
22
+
23
+ import logging
24
+ import logging.handlers
25
+ import os
26
+ import sys
27
+ from pathlib import Path
28
+ from typing import Optional
29
+
30
+ from core.config_fortify import get_fortify_config
31
+
32
+
33
def get_log_file_path() -> Optional[Path]:
    """
    Return the log file location.

    The configured ``log_file_path`` wins when it is set; otherwise the
    default ``logs/openclaw.log`` under the directory three levels above
    this module is returned.
    """
    configured = get_fortify_config().log_file_path
    if configured:
        return Path(configured)

    # Default location: <project root>/logs/openclaw.log
    root_dir = Path(__file__).parent.parent.parent
    return root_dir / "logs" / "openclaw.log"
43
+
44
+
45
def ensure_log_directory(log_path: Path) -> None:
    """
    Ensure the directory that will hold ``log_path`` exists.

    A directory created by this call is restricted to mode 0o750 (owner rwx,
    group r-x, others none). A directory that already existed is left
    untouched: the log path may point into a shared location (for example a
    system log directory or the current working directory) whose permissions
    this application must not rewrite.

    Raises:
        OSError: if the directory cannot be created (for example when the
            parent path exists but is not a directory).
    """
    log_dir = log_path.parent
    already_present = log_dir.is_dir()
    log_dir.mkdir(parents=True, exist_ok=True)
    if already_present:
        return

    # chmod may fail on Windows or when not running as the owner; hardening
    # is best-effort and must not prevent logging from starting.
    try:
        os.chmod(log_dir, 0o750)
    except OSError:
        pass
56
+
57
+
58
def _file_handler_in_chain(logger: logging.Logger, namespace: str = "openclaw") -> bool:
    """Return True if a rotating file handler inside ``namespace`` already receives this logger's records."""
    node: Optional[logging.Logger] = logger
    while node is not None and (node.name == namespace or node.name.startswith(namespace + ".")):
        # _CompressedRotatingFileHandler subclasses RotatingFileHandler, so
        # one isinstance check covers both handler kinds.
        if any(isinstance(h, logging.handlers.RotatingFileHandler) for h in node.handlers):
            return True
        if not node.propagate:
            return False
        node = node.parent
    return False


def _build_file_handler(cfg, log_path: Path) -> logging.Handler:
    """Create the rotating (optionally compressing) file handler for ``log_path``."""
    # cfg is the object returned by get_fortify_config().
    max_bytes = cfg.log_max_size_mb * 1024 * 1024
    if cfg.log_compression:
        handler_cls = _CompressedRotatingFileHandler
    else:
        handler_cls = logging.handlers.RotatingFileHandler
    handler = handler_cls(
        filename=str(log_path),
        maxBytes=max_bytes,
        backupCount=cfg.log_backup_count,
        encoding="utf-8",
    )

    # Restrict the log file to owner read/write; best-effort because chmod
    # is not supported everywhere.
    try:
        os.chmod(log_path, 0o600)
    except OSError:
        pass

    handler.setFormatter(
        logging.Formatter(
            fmt="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
            datefmt="%Y-%m-%dT%H:%M:%S%z",
        )
    )
    return handler


def setup_secure_logging() -> None:
    """
    Configure file logging with rotation, optional compression and retention.

    A single rotating file handler is shared by the ``openclaw`` logger
    hierarchy. It is attached only to loggers whose records do not already
    reach a rotating file handler through propagation, because attaching the
    same handler to a parent and its propagating children writes every child
    record several times. The handler (and its open file) is created only
    when at least one logger needs it, so calling this function repeatedly
    does not leak file handles.

    Every listed logger gets the configured level, and a best-effort cleanup
    of expired log files is started.
    """
    cfg = get_fortify_config()
    log_path = get_log_file_path()

    if log_path is None:
        # No file logging, just console
        return

    ensure_log_directory(log_path)

    # Parents are listed before children so that a child sees the handler
    # attached to its ancestor and is skipped.
    logger_names = ["openclaw", "openclaw.fortify", "openclaw.fortify.watcher",
                    "openclaw.fortify.audit", "openclaw.fortify.cache_probe"]

    level = getattr(logging, cfg.error_log_level, logging.INFO)

    handler: Optional[logging.Handler] = None
    for logger_name in logger_names:
        logger = logging.getLogger(logger_name)
        if not _file_handler_in_chain(logger):
            if handler is None:
                handler = _build_file_handler(cfg, log_path)
            logger.addHandler(handler)
        logger.setLevel(level)

    # Schedule cleanup of old logs (best-effort)
    _schedule_log_cleanup(log_path, cfg.log_retention_days)
127
+
128
+
129
class _CompressedRotatingFileHandler(logging.handlers.RotatingFileHandler):
    """
    Rotating file handler that compresses rotated log files using gzip.

    Rotated files are named <filename>.1.gz, <filename>.2.gz, etc. The
    ``.gz`` suffix is applied through ``rotation_filename`` so that the
    standard rollover logic shifts the compressed backups (``.1.gz`` to
    ``.2.gz`` and so on) and honours ``backupCount`` instead of overwriting
    a single archive on every rollover.
    """

    def __init__(self, filename: str, maxBytes: int = 0, backupCount: int = 0,
                 encoding: str = "utf-8", compress: bool = True):
        super().__init__(filename, maxBytes=maxBytes, backupCount=backupCount, encoding=encoding)
        self._compress = compress

    def rotation_filename(self, default_name: str) -> str:
        """Return the backup file name, with ``.gz`` appended when compressing."""
        name = super().rotation_filename(default_name)
        if self._compress and not name.endswith(".gz"):
            name += ".gz"
        return name

    def rotate(self, source: str, dest: str) -> None:
        """
        Move ``source`` to its backup slot, gzip-compressing it when enabled.

        If compression fails, the data is kept uncompressed under the backup
        name without the ``.gz`` suffix.
        """
        if not self._compress:
            super().rotate(source, dest)
            return
        if not os.path.exists(source):
            return

        import gzip
        import shutil

        # Direct callers may pass a destination without the suffix.
        archive = dest if dest.endswith(".gz") else dest + ".gz"
        try:
            with open(source, "rb") as f_in, gzip.open(archive, "wb", compresslevel=6) as f_out:
                shutil.copyfileobj(f_in, f_out)
        except OSError:
            # Compression failed: drop the partial archive and keep the
            # rotated data uncompressed.
            if os.path.exists(archive):
                os.remove(archive)
            plain = archive[:-len(".gz")]
            if os.path.exists(plain):
                os.remove(plain)
            os.rename(source, plain)
            return

        # Carry the permission bits of the log file over to its archive.
        try:
            shutil.copymode(source, archive)
        except OSError:
            pass
        os.remove(source)

    def shouldRollover(self, record: logging.LogRecord) -> int:
        """Return 1 if writing ``record`` would reach ``maxBytes``, else 0."""
        if self.stream is None:
            self.stream = self._open()

        if self.maxBytes > 0:
            msg = "%s\n" % self.format(record)
            if self.stream.tell() + len(msg) >= self.maxBytes:
                return 1
        return 0
166
+
167
+
168
def _schedule_log_cleanup(log_path: Path, retention_days: int) -> None:
    """
    Start a background cleanup of rotated log files older than the retention period.

    Only rotated backups of ``log_path`` itself are considered, i.e. files in
    the same directory whose name is the log file name followed by a dot
    (``openclaw.log.1``, ``openclaw.log.1.gz``, ...). Unrelated files that
    share the directory are never deleted, and neither is the active log
    file, which a handler may still have open.

    This is a best-effort cleanup that runs once, in a daemon thread, when it
    is called. For production, use an external cron job or logrotate.
    """
    import threading
    import time

    log_dir = log_path.parent
    rotated_prefix = log_path.name + "."

    def _cleanup() -> None:
        try:
            cutoff = time.time() - (retention_days * 86400)
            for entry in log_dir.iterdir():
                if not entry.name.startswith(rotated_prefix):
                    continue
                try:
                    if entry.is_file() and entry.stat().st_mtime < cutoff:
                        entry.unlink()
                except OSError:
                    pass  # File vanished or is not removable; skip it
        except OSError:
            pass  # Best-effort cleanup (e.g. directory missing or unreadable)

    # Run cleanup in background thread
    worker = threading.Thread(target=_cleanup, name="openclaw-log-cleanup", daemon=True)
    worker.start()
196
+
197
+
198
def get_logging_config_summary() -> dict:
    """
    Describe the effective logging configuration for diagnostics.

    Always reports the retention, size, backup-count, path and compression
    settings plus whether the log directory exists. The current log size (in
    bytes and in MB rounded to two decimals) is added when the log file
    exists.
    """
    cfg = get_fortify_config()
    log_path = get_log_file_path()

    if log_path:
        path_text = str(log_path)
        directory_exists = log_path.parent.exists()
    else:
        path_text = None
        directory_exists = False

    summary = {
        "log_retention_days": cfg.log_retention_days,
        "log_max_size_mb": cfg.log_max_size_mb,
        "log_backup_count": cfg.log_backup_count,
        "log_file_path": path_text,
        "log_compression": cfg.log_compression,
        "log_directory_exists": directory_exists,
    }

    if log_path and log_path.exists():
        size_bytes = log_path.stat().st_size
        summary["current_log_size_bytes"] = size_bytes
        summary["current_log_size_mb"] = round(size_bytes / (1024 * 1024), 2)

    return summary
@@ -15,15 +15,18 @@ sys.path.insert(0, str(BACKEND))
15
15
  def reset_fortify_state():
16
16
  """Reset all fortify singletons between tests."""
17
17
  from core.config_fortify import refresh_fortify_config_cache
18
+ from core.error_handler import reset_reliability_metrics_for_tests
18
19
  from core.fallback_manager import reset_fallback_handlers_for_tests
19
20
  from status.status_cache import reset_cache_for_tests
20
21
 
21
22
  reset_cache_for_tests()
22
23
  reset_fallback_handlers_for_tests()
24
+ reset_reliability_metrics_for_tests()
23
25
  refresh_fortify_config_cache()
24
26
  yield
25
27
  reset_cache_for_tests()
26
28
  reset_fallback_handlers_for_tests()
29
+ reset_reliability_metrics_for_tests()
27
30
  refresh_fortify_config_cache()
28
31
 
29
32