npm - openclaw-agent-dashboard - Versions diffs - 1.0.40 → 1.0.42 - Mend

openclaw-agent-dashboard 1.0.40 → 1.0.42

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/dashboard/api/errors.py +10 -0
package/dashboard/api/fortify_routes.py +29 -1
package/dashboard/core/config_fortify.py +13 -0
package/dashboard/core/error_handler.py +162 -13
package/dashboard/core/fallback_manager.py +12 -1
package/dashboard/core/logging_config.py +217 -0
package/dashboard/tests/conftest.py +3 -0
package/dashboard/tests/test_fortify.py +211 -0
package/dashboard/watchers/file_watcher.py +14 -1
package/frontend-dist/assets/{index-cYIOn3Wq.css → index-BIZ2xHfw.css} +1 -1
package/frontend-dist/assets/{index-DyRXGevD.js → index-Cnr0b02R.js} +1 -1
package/frontend-dist/index.html +2 -2
package/openclaw.plugin.json +1 -1
package/package.json +1 -1

package/dashboard/api/errors.py CHANGED Viewed

@@ -295,3 +295,13 @@ async def get_errors_summary():
         "apiStatus": api_status,
         "stats": stats,
     }
+@router.get("/errors/reliability")
+async def get_reliability_stats():
+    """
+    Expose the NFR-R reliability metrics.
+    Covers watcher success rate (NFR-R-002), error recovery time (NFR-R-003)
+    and graceful degradation rate (NFR-R-005).
+    """
+    # Resolved at request time rather than when this module is imported.
+    import core.error_handler as error_handler
+    return error_handler.get_reliability_metrics()

package/dashboard/api/fortify_routes.py CHANGED Viewed

@@ -1,4 +1,4 @@
-"""TECHDEBT_FORTIFY: health, cache stats, data validation endpoints."""
+"""TECHDEBT_FORTIFY: health, cache stats, data validation, logging endpoints."""
 from __future__ import annotations
 import sys
@@ -78,3 +78,31 @@ async def validate_session_data(
     except Exception as e:
         record_error("unknown", str(e), "api:fortify:validate", exc=e)
         raise HTTPException(status_code=500, detail=safe_api_error_detail(e)) from e
+@router.get("/logging/config")
+async def logging_config() -> Any:
+    """
+    NFR-S-003: Report the logging configuration for diagnostics and monitoring.
+    Returns the live summary from ``core.logging_config`` when that module can
+    be imported; otherwise a fixed set of default values plus a ``note``.
+    """
+    try:
+        from core.logging_config import get_logging_config_summary
+        # Kept inside the ``try`` so an ImportError raised while building the
+        # summary also lands on the fallback response.
+        live_summary = get_logging_config_summary()
+    except ImportError:
+        # Enhanced logging module unavailable: answer with static defaults
+        default_summary = {
+            "log_retention_days": 30,
+            "log_max_size_mb": 100,
+            "log_backup_count": 5,
+            "log_file_path": None,
+            "log_compression": True,
+        }
+        return {
+            "status": "ok",
+            "config": default_summary,
+            "note": "Enhanced logging not configured",
+        }
+    return {"status": "ok", "config": live_summary}

package/dashboard/core/config_fortify.py CHANGED Viewed

@@ -78,6 +78,13 @@ class FortifyConfig:
     watcher_poll_interval_sec: float
     watcher_failure_window_sec: float
+    # NFR-S-003: Logging storage security
+    log_retention_days: int
+    log_max_size_mb: int
+    log_backup_count: int
+    log_file_path: str | None
+    log_compression: bool
 @lru_cache(maxsize=1)
 def get_fortify_config() -> FortifyConfig:
@@ -104,6 +111,12 @@ def get_fortify_config() -> FortifyConfig:
         watcher_max_retries=_env_int("OPENCLAW_WATCHER_MAX_RETRIES", 3, min_v=1, max_v=10),
         watcher_poll_interval_sec=_env_float("OPENCLAW_WATCHER_POLL_INTERVAL", 5.0),
         watcher_failure_window_sec=_env_float("OPENCLAW_WATCHER_FAILURE_WINDOW", 30.0),
+        # NFR-S-003: Logging storage security
+        log_retention_days=_env_int("OPENCLAW_LOG_RETENTION_DAYS", 30, min_v=1, max_v=365),
+        log_max_size_mb=_env_int("OPENCLAW_LOG_MAX_SIZE_MB", 100, min_v=1, max_v=1024),
+        log_backup_count=_env_int("OPENCLAW_LOG_BACKUP_COUNT", 5, min_v=1, max_v=50),
+        log_file_path=os.environ.get("OPENCLAW_LOG_FILE_PATH") or None,
+        log_compression=_env_bool("OPENCLAW_LOG_COMPRESSION", True),
     )

package/dashboard/core/error_handler.py CHANGED Viewed

@@ -26,6 +26,18 @@ def _ensure_fortify_logging() -> None:
     level = getattr(logging, cfg.error_log_level, logging.INFO)
     _LOG.setLevel(level)
     if not _LOG.handlers:
+        # Try to use secure file-based logging if configured
+        try:
+            from core.logging_config import setup_secure_logging, get_log_file_path
+            log_path = get_log_file_path()
+            if log_path is not None:
+                # Secure logging is configured, skip console-only handler
+                # setup_secure_logging() already added file handlers
+                _ensure_fortify_logging._done = True  # type: ignore[attr-defined]
+                return
+        except ImportError:
+            pass  # Fall back to console handler
         h = logging.StreamHandler()
         h.setFormatter(
             logging.Formatter(
@@ -55,6 +67,137 @@ _retry_budget_lock = threading.Lock()
 _retry_budget_deques: Dict[str, deque] = {}
 _retry_budget_blocks = 0
+# NFR-R: module-level state behind the reliability metrics.
+# Intended to be read and written while holding _reliability_lock.
+_reliability_lock = threading.Lock()
+# NFR-R-003: rolling window of the 100 most recent recovery durations (seconds)
+_error_recovery_times: deque = deque(maxlen=100)
+_last_error_timestamp: Optional[float] = None
+_last_recovery_timestamp: Optional[float] = None
+# NFR-R-005: fallback invocations seen, and how many of them succeeded
+_fallback_total_attempts = _fallback_success_count = 0
+# NFR-R-002: watcher availability accounting
+_watcher_uptime_start: Optional[float] = None  # start of the open uptime interval, if any
+_watchdog_last_failure_time: Optional[float] = None  # start of the open outage, if any
+_watcher_total_uptime_seconds = _watcher_total_downtime_seconds = 0.0
+def record_fallback_attempt(success: bool) -> None:
+    """Count one graceful-degradation attempt and, if truthy, its success (NFR-R-005)."""
+    global _fallback_total_attempts, _fallback_success_count
+    # Work out the increment before taking the lock; the counters change under it.
+    succeeded = 1 if success else 0
+    with _reliability_lock:
+        _fallback_total_attempts = _fallback_total_attempts + 1
+        _fallback_success_count = _fallback_success_count + succeeded
+def record_error_recovery(duration_seconds: float) -> None:
+    """Add one recovery duration, in seconds, to the rolling sample window (NFR-R-003)."""
+    with _reliability_lock:
+        samples = _error_recovery_times
+        # The deque is bounded, so the oldest sample is dropped once it is full.
+        samples.append(duration_seconds)
+def record_watcher_failure() -> None:
+    """
+    Note the moment a watcher outage begins.
+    Repeated calls during the same outage are ignored. Any uptime interval
+    that was open is closed and added to the accumulated uptime.
+    """
+    global _watchdog_last_failure_time, _watcher_uptime_start, _watcher_total_uptime_seconds
+    failed_at = time.time()
+    with _reliability_lock:
+        if _watchdog_last_failure_time is not None:
+            # Outage already being tracked: keep its original start time
+            return
+        _watchdog_last_failure_time = failed_at
+        if _watcher_uptime_start is None:
+            return
+        _watcher_total_uptime_seconds += failed_at - _watcher_uptime_start
+        _watcher_uptime_start = None
+def record_watcher_recovery() -> None:
+    """
+    Mark the watcher as recovered and record how long the outage lasted (NFR-R-003).
+    If an outage was being tracked, its duration is added to the accumulated
+    downtime and, when positive, appended to the recovery-time samples. A new
+    uptime interval is started in every case.
+    """
+    global _watchdog_last_failure_time, _watcher_uptime_start, _watcher_total_downtime_seconds
+    now = time.time()
+    with _reliability_lock:
+        if _watchdog_last_failure_time is not None:
+            # time.time() can step backwards; never record negative downtime
+            recovery_time = max(0.0, now - _watchdog_last_failure_time)
+            _watcher_total_downtime_seconds += recovery_time
+            _watchdog_last_failure_time = None
+            # Append directly while the lock is held: the samples are read
+            # under this same lock, and no other locking function is called
+            # here, so there is nothing to deadlock on.
+            if recovery_time > 0:
+                _error_recovery_times.append(recovery_time)
+        _watcher_uptime_start = now
+def get_reliability_metrics() -> Dict[str, Any]:
+    """
+    Take a snapshot of the reliability metrics for NFR-R-002/003/005.
+    Intervals still in progress are included: an open uptime interval counts
+    towards uptime and an outage that has not ended yet counts towards
+    downtime. Rates are reported as 1.0 while there is no data for them.
+    Returns:
+        A dict with watcher uptime/downtime totals and availability, the
+        mean / p95 / count / latest of the recorded recovery times, and the
+        graceful-degradation attempt, success and rate figures.
+    """
+    import statistics as _statistics
+    with _reliability_lock:
+        current_time = time.time()
+        current_uptime = 0.0
+        if _watcher_uptime_start is not None:
+            current_uptime = max(0.0, current_time - _watcher_uptime_start)
+        # An ongoing outage must count as downtime too; otherwise availability
+        # stays frozen at its pre-failure value (1.0 if the watcher never came
+        # up) for as long as the outage lasts.
+        current_downtime = 0.0
+        if _watchdog_last_failure_time is not None:
+            current_downtime = max(0.0, current_time - _watchdog_last_failure_time)
+        total_uptime = _watcher_total_uptime_seconds + current_uptime
+        total_downtime = _watcher_total_downtime_seconds + current_downtime
+        total_time = total_uptime + total_downtime
+        # NFR-R-002: Watcher availability/success rate
+        availability_rate = 1.0
+        if total_time > 0:
+            availability_rate = total_uptime / total_time
+        # NFR-R-003: Error recovery time
+        recovery_times_list = list(_error_recovery_times)
+        avg_recovery_time = 0.0
+        p95_recovery_time = 0.0
+        if recovery_times_list:
+            avg_recovery_time = _statistics.mean(recovery_times_list)
+            sorted_times = sorted(recovery_times_list)
+            # Index is clamped so small samples fall back to the maximum
+            p95_idx = int(len(sorted_times) * 0.95)
+            p95_recovery_time = sorted_times[min(p95_idx, len(sorted_times) - 1)]
+        # NFR-R-005: Graceful degradation rate
+        graceful_degradation_rate = 1.0
+        if _fallback_total_attempts > 0:
+            graceful_degradation_rate = _fallback_success_count / _fallback_total_attempts
+        return {
+            # NFR-R-002: Watcher availability
+            "watcher_uptime_seconds": total_uptime,
+            "watcher_downtime_seconds": total_downtime,
+            "watcher_availability_rate": round(availability_rate, 4),
+            "watcher_uptime_percentage": round(availability_rate * 100, 2),
+            # NFR-R-003: Error recovery time
+            "avg_error_recovery_seconds": round(avg_recovery_time, 3),
+            "p95_error_recovery_seconds": round(p95_recovery_time, 3),
+            "error_recovery_count": len(recovery_times_list),
+            "last_error_recovery_time": recovery_times_list[-1] if recovery_times_list else None,
+            # NFR-R-005: Graceful degradation
+            "graceful_degradation_attempts": _fallback_total_attempts,
+            "graceful_degradation_successes": _fallback_success_count,
+            "graceful_degradation_rate": round(graceful_degradation_rate, 4),
+            "graceful_degradation_percentage": round(graceful_degradation_rate * 100, 2),
+        }
+def reset_reliability_metrics_for_tests() -> None:
+    """Put every reliability counter, timestamp and sample back to its initial value (test helper)."""
+    global _error_recovery_times, _last_error_timestamp, _last_recovery_timestamp
+    global _fallback_total_attempts, _fallback_success_count
+    global _watcher_uptime_start, _watcher_total_uptime_seconds, _watcher_total_downtime_seconds
+    global _watchdog_last_failure_time
+    with _reliability_lock:
+        # Emptied in place, so the module keeps using the same deque object
+        _error_recovery_times.clear()
+        _last_error_timestamp = _last_recovery_timestamp = None
+        _watcher_uptime_start = _watchdog_last_failure_time = None
+        _fallback_total_attempts = _fallback_success_count = 0
+        _watcher_total_uptime_seconds = _watcher_total_downtime_seconds = 0.0
 def _consume_retry_budget(operation: str) -> bool:
     """
@@ -285,19 +428,25 @@ def get_framework_error_stats() -> Dict[str, Any]:
             {"scope": k, "count": v}
             for k, v in sorted(_stats.by_scope.items(), key=lambda kv: -kv[1])[:50]
         ]
-        return {
-            "total_count": _stats.total_count,
-            "by_type": by_type_out,
-            "by_agent": by_agent,
-            "by_scope_top": top_scopes,
-            "sum_by_type": sum_by_type,
-            "totals_consistent": sum_by_type == _stats.total_count,
-            "hourly_trend": list(_stats.hourly_trend),
-            "last_update": _stats.last_update_iso,
-            "last_error": _stats.last_error,
-            "retry_by_operation": dict(_retry_totals),
-            "retry_budget_blocks": _retry_budget_blocks,
-        }
+    # NFR-R reliability metrics
+    reliability = get_reliability_metrics()
+    return {
+        "total_count": _stats.total_count,
+        "by_type": by_type_out,
+        "by_agent": by_agent,
+        "by_scope_top": top_scopes,
+        "sum_by_type": sum_by_type,
+        "totals_consistent": sum_by_type == _stats.total_count,
+        "hourly_trend": list(_stats.hourly_trend),
+        "last_update": _stats.last_update_iso,
+        "last_error": _stats.last_error,
+        "retry_by_operation": dict(_retry_totals),
+        "retry_budget_blocks": _retry_budget_blocks,
+        # NFR-R Reliability
+        "reliability": reliability,
+    }
 def execute_with_retry(

package/dashboard/core/fallback_manager.py CHANGED Viewed

@@ -28,7 +28,18 @@ def run_fallback(error_category: str, *, agent_id: Optional[str] = None, **kwarg
         h = _handlers.get(error_category)
     if h is None:
         return None
-    return h(agent_id=agent_id, **kwargs)
+    # NFR-R-005: Record fallback attempt (success if returns non-None)
+    try:
+        result = h(agent_id=agent_id, **kwargs)
+        from core.error_handler import record_fallback_attempt
+        record_fallback_attempt(success=result is not None)
+        return result
+    except Exception:
+        from core.error_handler import record_fallback_attempt
+        record_fallback_attempt(success=False)
+        raise
 def _stale_agent_status_handler(agent_id: Optional[str] = None, **_: Any) -> Optional[str]:

package/dashboard/core/logging_config.py ADDED Viewed

@@ -0,0 +1,217 @@
+"""
+NFR-S-003: Logging storage security configuration.
+Provides secure logging setup with:
+- File rotation (size-based)
+- Compression of rotated files
+- Automatic cleanup of old logs based on retention policy
+- File permission hardening
+Usage:
+    from core.logging_config import setup_secure_logging
+    setup_secure_logging()
+Configuration via environment variables:
+    OPENCLAW_LOG_RETENTION_DAYS: Days to retain log files (default: 30)
+    OPENCLAW_LOG_MAX_SIZE_MB: Max size per log file in MB (default: 100)
+    OPENCLAW_LOG_BACKUP_COUNT: Number of backup files to keep (default: 5)
+    OPENCLAW_LOG_FILE_PATH: Custom log file path (optional)
+    OPENCLAW_LOG_COMPRESSION: Compress rotated logs (default: true)
+"""
+from __future__ import annotations
+import logging
+import logging.handlers
+import os
+import sys
+from pathlib import Path
+from typing import Optional
+from core.config_fortify import get_fortify_config
+def get_log_file_path() -> Optional[Path]:
+    """
+    Resolve the location of the log file.
+    A non-empty ``log_file_path`` in the fortify config is used as given;
+    otherwise the file is ``logs/openclaw.log`` under the directory three
+    levels above this module.
+    """
+    configured = get_fortify_config().log_file_path
+    if not configured:
+        # Default location, relative to where this module is installed
+        base_dir = Path(__file__).parent.parent.parent
+        return base_dir / "logs" / "openclaw.log"
+    return Path(configured)
+def ensure_log_directory(log_path: Path) -> None:
+    """
+    Create the directory that will hold ``log_path`` and tighten its mode.
+    Missing parent directories are created as needed. The mode change is
+    best-effort: a failure to chmod is ignored.
+    """
+    parent_dir = log_path.parent
+    parent_dir.mkdir(parents=True, exist_ok=True)
+    try:
+        # 0o750: rwx for the owner, r-x for the group, nothing for others
+        os.chmod(parent_dir, 0o750)
+    except OSError:
+        # PermissionError is an OSError subclass, so this covers both cases
+        # (e.g. Windows, or a directory owned by someone else).
+        return
+def _reaches_rotating_file_handler(logger: logging.Logger) -> bool:
+    """Tell whether records from ``logger`` already end up in a rotating file handler."""
+    node: Optional[logging.Logger] = logger
+    while node is not None:
+        if any(isinstance(h, logging.handlers.RotatingFileHandler) for h in node.handlers):
+            return True
+        if not node.propagate:
+            # Records stop here, so handlers further up never see them
+            return False
+        node = node.parent
+    return False
+def _build_rotating_file_handler(log_path: Path, max_bytes: int, backup_count: int,
+                                 compress: bool) -> logging.Handler:
+    """Open the rotating file handler for ``log_path`` and restrict the file to its owner."""
+    handler_cls = (_CompressedRotatingFileHandler if compress
+                   else logging.handlers.RotatingFileHandler)
+    handler = handler_cls(
+        filename=str(log_path),
+        maxBytes=max_bytes,
+        backupCount=backup_count,
+        encoding="utf-8",
+    )
+    # Owner read/write only; best-effort because chmod can fail (e.g. on Windows)
+    try:
+        os.chmod(log_path, 0o600)
+    except OSError:
+        pass
+    handler.setFormatter(logging.Formatter(
+        fmt="%(asctime)s | %(levelname)-8s | %(name)s | %(message)s",
+        datefmt="%Y-%m-%dT%H:%M:%S%z",
+    ))
+    return handler
+def setup_secure_logging() -> None:
+    """
+    Configure rotating file logging for the openclaw.* loggers.
+    A single rotating file handler (gzip-compressing when ``log_compression``
+    is enabled) is attached to each listed logger whose records do not
+    already reach a rotating file handler, either directly or by propagating
+    to a parent. This keeps each record from being written more than once and
+    makes repeated calls a no-op for the handlers. All listed loggers get the
+    configured level, and a best-effort cleanup of expired logs is started.
+    No console handler is installed here.
+    """
+    cfg = get_fortify_config()
+    log_path = get_log_file_path()
+    if log_path is None:
+        # No file target resolved: leave logging untouched
+        return
+    ensure_log_directory(log_path)
+    # Parent-first order: a parent receives the handler before its children
+    # are examined, so propagating children are recognised as covered.
+    logger_names = ["openclaw", "openclaw.fortify", "openclaw.fortify.watcher",
+                    "openclaw.fortify.audit", "openclaw.fortify.cache_probe"]
+    level = getattr(logging, cfg.error_log_level, logging.INFO)
+    file_handler: Optional[logging.Handler] = None
+    for logger_name in logger_names:
+        logger = logging.getLogger(logger_name)
+        logger.setLevel(level)
+        if _reaches_rotating_file_handler(logger):
+            continue
+        if file_handler is None:
+            # Created only when needed so a repeat call does not open (and
+            # leak) a second handle on the log file.
+            file_handler = _build_rotating_file_handler(
+                log_path,
+                cfg.log_max_size_mb * 1024 * 1024,
+                cfg.log_backup_count,
+                cfg.log_compression,
+            )
+        logger.addHandler(file_handler)
+    # Schedule cleanup of old logs (best-effort)
+    _schedule_log_cleanup(log_path, cfg.log_retention_days)
+class _CompressedRotatingFileHandler(logging.handlers.RotatingFileHandler):
+    """
+    Rotating file handler that gzips each backup as it is rotated out.
+    Backups are named <filename>.1.gz, <filename>.2.gz, etc. The ``.gz``
+    suffix is applied through ``rotation_filename`` so the inherited rollover
+    logic shifts and prunes the compressed backups itself; without that it
+    looks for <filename>.N, never finds it, and overwrites <filename>.1.gz on
+    every rotation. The decision of when to roll over is inherited unchanged
+    from ``RotatingFileHandler``.
+    """
+    def __init__(self, filename: str, maxBytes: int = 0, backupCount: int = 0,
+                 encoding: str = "utf-8", compress: bool = True):
+        super().__init__(filename, maxBytes=maxBytes, backupCount=backupCount, encoding=encoding)
+        self._compress = compress
+    def rotation_filename(self, default_name: str) -> str:
+        """Return the backup file name, with ``.gz`` appended when compressing."""
+        name = super().rotation_filename(default_name)
+        if self._compress and not name.endswith(".gz"):
+            name += ".gz"
+        return name
+    def rotate(self, source: str, dest: str) -> None:
+        """Move ``source`` to ``dest``, gzip-compressing it when compression is enabled."""
+        if not self._compress:
+            super().rotate(source, dest)
+            return
+        if not os.path.exists(source):
+            # Nothing to rotate (same guard as the base implementation)
+            return
+        import gzip
+        import shutil
+        try:
+            with open(source, "rb") as f_in, gzip.open(dest, "wb", compresslevel=6) as f_out:
+                shutil.copyfileobj(f_in, f_out)
+        except OSError:
+            # Compression failed: discard the partial archive and keep the
+            # backup uncompressed rather than losing it.
+            plain = dest[:-3] if dest.endswith(".gz") else dest
+            try:
+                if os.path.exists(dest):
+                    os.remove(dest)
+                os.replace(source, plain)
+            except OSError:
+                pass  # Best-effort; the next rollover will try again
+            return
+        os.remove(source)
+def _schedule_log_cleanup(log_path: Path, retention_days: int) -> None:
+    """
+    Start a background, best-effort purge of expired rotated log files.
+    Only rotated backups of ``log_path`` are considered, i.e. files in the
+    same directory named ``<log file name>.<something>`` (``openclaw.log.1``,
+    ``openclaw.log.1.gz``, ...). Unrelated files in that directory are never
+    touched, and neither is the active log file, which a handler may hold
+    open. A backup is removed when its mtime is older than ``retention_days``.
+    The purge runs once, when this function is called.
+    For production, use an external cron job or logrotate.
+    """
+    import threading
+    import time
+    def _cleanup() -> None:
+        cutoff = time.time() - (retention_days * 86400)
+        backup_prefix = log_path.name + "."
+        try:
+            candidates = list(log_path.parent.iterdir())
+        except OSError:
+            return  # Directory missing or unreadable: nothing to clean
+        for file_path in candidates:
+            if not file_path.name.startswith(backup_prefix):
+                continue
+            try:
+                if file_path.is_file() and file_path.stat().st_mtime < cutoff:
+                    file_path.unlink()
+            except OSError:
+                continue  # Best-effort: skip files that vanish or cannot be removed
+    # Daemon thread so a slow purge never delays interpreter shutdown
+    t = threading.Thread(target=_cleanup, name="openclaw-log-cleanup", daemon=True)
+    t.start()
+def get_logging_config_summary() -> dict:
+    """
+    Build a diagnostics snapshot of the logging configuration.
+    Always reports the retention, size, backup-count and compression
+    settings, the resolved log path and whether its directory exists. The
+    current size of the log file (bytes and MB) is added only when that file
+    is present.
+    """
+    cfg = get_fortify_config()
+    path = get_log_file_path()
+    path_text = str(path) if path else None
+    directory_present = path.parent.exists() if path else False
+    report = {
+        "log_retention_days": cfg.log_retention_days,
+        "log_max_size_mb": cfg.log_max_size_mb,
+        "log_backup_count": cfg.log_backup_count,
+        "log_file_path": path_text,
+        "log_compression": cfg.log_compression,
+        "log_directory_exists": directory_present,
+    }
+    if not (path and path.exists()):
+        return report
+    size_bytes = path.stat().st_size
+    report.update(
+        current_log_size_bytes=size_bytes,
+        current_log_size_mb=round(size_bytes / (1024 * 1024), 2),
+    )
+    return report

package/dashboard/tests/conftest.py CHANGED Viewed

@@ -15,15 +15,18 @@ sys.path.insert(0, str(BACKEND))
 def reset_fortify_state():
     """Reset all fortify singletons between tests."""
     from core.config_fortify import refresh_fortify_config_cache
+    from core.error_handler import reset_reliability_metrics_for_tests
     from core.fallback_manager import reset_fallback_handlers_for_tests
     from status.status_cache import reset_cache_for_tests
     reset_cache_for_tests()
     reset_fallback_handlers_for_tests()
+    reset_reliability_metrics_for_tests()
     refresh_fortify_config_cache()
     yield
     reset_cache_for_tests()
     reset_fallback_handlers_for_tests()
+    reset_reliability_metrics_for_tests()
     refresh_fortify_config_cache()