@misterhuydo/sentinel 1.2.5 → 1.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,219 @@
+ """
+ health_checker.py — Poll HEALTH_URL for each configured repo.
+
+ Distinguishes deliberate admin stops (502/503 + no startup errors in logs)
+ from crash restarts (502/503 + startup failure patterns detected).
+
+ State machine per repo:
+     None      — normal monitoring
+     pending   — first stop detected, one alert sent asking human to confirm
+     confirmed — human confirmed deliberate stop, silent monitoring
+     (cleared) — when app recovers: state removed, "back online" notification sent
+ """
+ import logging
+ import re
+ from dataclasses import dataclass, field
+ from pathlib import Path
+
+ import requests
+
+ from .config_loader import RepoConfig
+
+ logger = logging.getLogger(__name__)
+
+ # Spring Boot / Java app startup failure patterns
+ _START_FAILURE_RE = re.compile(
+     r"APPLICATION FAILED TO START"
+     r"|Error starting ApplicationContext"
+     r"|BeanCreationException"
+     r"|Failed to load ApplicationContext"
+     r"|java\.lang\.NoSuchMethodError"
+     r"|java\.lang\.ClassNotFoundException"
+     r"|java\.lang\.NoClassDefFoundError"
+     r"|could not start embedded"
+     r"|Caused by:.*Spring"
+     r"|startup failed",
+     re.IGNORECASE,
+ )
+
+ # How many bytes from the end of each synced file to scan
+ _SCAN_TAIL_BYTES = 200_000  # ~200KB / ~2000 lines
+
+
+ @dataclass
+ class HealthResult:
+     status: str  # "healthy" | "stopped" | "failing" | "unknown"
+     http_code: int = 0
+     message: str = ""
+     raw: dict = field(default_factory=dict)
+
+
+ def check_health(repo: RepoConfig, timeout: int = 10) -> HealthResult:
+     """
+     Poll repo.health_url and return a HealthResult.
+
+     Status values:
+         healthy — HTTP 200 + Status=true
+         failing — HTTP 200 + Status!=true
+         stopped — HTTP 502/503/504 or connection refused
+         unknown — no URL, unexpected HTTP code, or parse error
+     """
+     if not repo.health_url:
+         return HealthResult(status="unknown", message="No HEALTH_URL configured")
+     try:
+         r = requests.get(repo.health_url, timeout=timeout)
+         if r.status_code in (502, 503, 504):
+             return HealthResult(
+                 status="stopped",
+                 http_code=r.status_code,
+                 message=f"HTTP {r.status_code}",
+             )
+         if r.status_code == 200:
+             try:
+                 data = r.json()
+             except Exception:
+                 return HealthResult(status="healthy", http_code=200)
+             if str(data.get("Status", "")).lower() == "true":
+                 return HealthResult(status="healthy", http_code=200, raw=data)
+             return HealthResult(
+                 status="failing",
+                 http_code=200,
+                 raw=data,
+                 message=f"Status={data.get('Status')} DEFCON={data.get('DEFCON', '?')}",
+             )
+         return HealthResult(
+             status="unknown",
+             http_code=r.status_code,
+             message=f"Unexpected HTTP {r.status_code}",
+         )
+     except requests.exceptions.ConnectionError:
+         return HealthResult(status="stopped", message="Connection refused")
+     except requests.exceptions.Timeout:
+         return HealthResult(status="unknown", message="Request timed out")
+     except Exception as e:
+         return HealthResult(status="unknown", message=str(e))
+
+
+ def scan_startup_failure(workspace_dir: str, source_names: list[str]) -> tuple[bool, str]:
+     """
+     Search the synced log files of the given sources for startup failure patterns.
+     Scans the last ~200KB of each file (most recent entries).
+     Returns (found: bool, first_matching_line: str).
+     """
+     synced_base = Path(workspace_dir) / "synced"
+     for source_name in source_names:
+         source_dir = synced_base / source_name
+         if not source_dir.exists():
+             continue
+         for log_file in sorted(source_dir.glob("*")):
+             if not log_file.is_file():
+                 continue
+             try:
+                 size = log_file.stat().st_size
+                 with open(log_file, encoding="utf-8", errors="replace") as fh:
+                     if size > _SCAN_TAIL_BYTES:
+                         fh.seek(size - _SCAN_TAIL_BYTES)
+                         fh.readline()  # skip partial line at seek boundary
+                     for line in fh:
+                         if _START_FAILURE_RE.search(line):
+                             return True, line.strip()[:300]
+             except Exception as e:
+                 logger.debug("health_checker: scan error %s: %s", log_file, e)
+     return False, ""
+
+
+ def evaluate_repos(repos: dict, log_sources: dict, workspace_dir: str, store=None) -> list[dict]:
+     """
+     Run health checks for all repos that have HEALTH_URL configured.
+     Uses store to track maintenance state and suppress repeated alerts.
+
+     Returns a list of result dicts, one per repo checked. Each dict has:
+         repo_name, status, http_code, message,
+         action ("none" | "fix" | "alert_once" | "recovered"),
+         startup_failure (bool), startup_failure_line (str), was_in_maintenance (bool)
+     """
+     results = []
+     for repo in repos.values():
+         if not repo.health_url:
+             continue
+
+         health = check_health(repo)
+         state = store.get_health_state(repo.repo_name) if store else None
+
+         result = {
+             "repo_name": repo.repo_name,
+             "status": health.status,
+             "http_code": health.http_code,
+             "message": health.message,
+             "action": "none",
+             "startup_failure": False,
+             "startup_failure_line": "",
+             "was_in_maintenance": state is not None,
+         }
+
+         # App recovered — clear state and notify
+         if health.status == "healthy":
+             if state is not None:
+                 logger.info(
+                     "health_checker: %s is back online (was %s since %s)",
+                     repo.repo_name, state["status"], state["since"],
+                 )
+                 if store:
+                     store.clear_health_state(repo.repo_name)
+                 result["action"] = "recovered"
+                 result["message"] = f"Back online (was {state['status']} since {state['since']})"
+             results.append(result)
+             continue
+
+         if health.status == "unknown":
+             results.append(result)
+             continue
+
+         # App is stopped or failing — check for startup errors first
+         sources_for_repo = [
+             s.name for s in log_sources.values()
+             if getattr(s, "target_repo", "auto") in ("auto", repo.repo_name)
+         ]
+         if not sources_for_repo:
+             sources_for_repo = list(log_sources.keys())
+
+         found, line = scan_startup_failure(workspace_dir, sources_for_repo)
+         result["startup_failure"] = found
+         result["startup_failure_line"] = line
+
+         if found:
+             # Startup crash overrides any maintenance state — app needs a fix
+             result["action"] = "fix"
+             result["message"] = f"{health.message} — startup failure found in logs"
+             if store and state:
+                 store.clear_health_state(repo.repo_name)
+             logger.warning(
+                 "health_checker: %s stopped + startup error → fix needed: %s",
+                 repo.repo_name, line[:120],
+             )
+             results.append(result)
+             continue
+
+         # No startup errors. Apply maintenance state machine.
+         if state is None:
+             # First time we've seen this — ask once, set pending
+             result["action"] = "alert_once"
+             result["message"] = health.message
+             if store:
+                 store.set_health_state(repo.repo_name, "pending")
+             logger.info(
+                 "health_checker: %s stopped with no startup errors — alerting once, "
+                 "set state=pending",
+                 repo.repo_name,
+             )
+         else:
+             # pending or confirmed — stay silent
+             result["action"] = "none"
+             logger.debug(
+                 "health_checker: %s still stopped (%s since %s) — silent",
+                 repo.repo_name, state["status"], state["since"],
+             )
+
+         results.append(result)
+
+     return results
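
The maintenance state machine in the module docstring reduces to a small decision function over the health status, the startup-failure scan, and the stored per-repo state. The following is a self-contained sketch of that decision logic only; the names are illustrative, and the released implementation is evaluate_repos together with the StateStore methods it calls (get/set/clear_health_state).

states: dict[str, str] = {}  # repo_name -> "pending" | "confirmed" (absent means "not stored")

def on_health(repo: str, status: str, startup_failure: bool) -> str:
    """Mirror of the action evaluate_repos would report for one repo."""
    if status == "healthy":
        # Recovery clears any stored state and notifies once
        return "recovered" if states.pop(repo, None) else "none"
    if status == "unknown":
        return "none"
    if startup_failure:
        # A crash restart overrides maintenance: the app needs a fix
        states.pop(repo, None)
        return "fix"
    if repo not in states:
        # First clean stop: ask the human once, then go quiet
        states[repo] = "pending"
        return "alert_once"
    return "none"  # pending/confirmed: silent monitoring

assert on_health("billing-api", "stopped", False) == "alert_once"
assert on_health("billing-api", "stopped", False) == "none"
assert on_health("billing-api", "healthy", False) == "recovered"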
@@ -0,0 +1,164 @@
+ """
+ log_syncer.py — Periodically rsync log files from remote SSH servers to local workspace.
+
+ Synced location: workspace/synced/<source-name>/<log-filename>-node<N>
+
+ Uses rsync --append-verify (delta sync — only new bytes transferred).
+ Full log history is preserved locally for instant grep without SSH.
+ """
+ import logging
+ import shutil
+ import subprocess
+ from pathlib import Path
+
+ from .config_loader import LogSourceConfig, SentinelConfig
+
+ logger = logging.getLogger(__name__)
+
+ RSYNC_TIMEOUT = 120  # seconds per file
+
+
+ def _resolve_key(key: str, config_dir: Path) -> str | None:
+     import os
+     for candidate in [
+         config_dir / key,
+         Path(os.path.expanduser("~/.ssh")) / key,
+         Path(key),
+     ]:
+         if Path(candidate).exists():
+             return str(candidate)
+     return None
+
+
+ def sync_source(source: LogSourceConfig, cfg: SentinelConfig, config_dir: Path) -> dict:
+     """
+     rsync all log files for one SSH source from all hosts.
+     Returns {"synced": N, "failed": N, "skipped": N}.
+     """
+     if source.source_type != "ssh":
+         return {"synced": 0, "failed": 0, "skipped": 1}
+     if not getattr(source, "sync_enabled", True):
+         return {"synced": 0, "failed": 0, "skipped": 1}
+     if not shutil.which("rsync"):
+         logger.warning("log_syncer: rsync not found — install rsync to enable local sync")
+         return {"synced": 0, "failed": 0, "skipped": 1}
+
+     key_path = _resolve_key(source.key, config_dir)
+     if not key_path:
+         logger.warning("log_syncer: SSH key not found for %s: %s", source.name, source.key)
+         return {"synced": 0, "failed": 1, "skipped": 0}
+
+     dest_dir = Path(cfg.workspace_dir) / "synced" / source.name
+     dest_dir.mkdir(parents=True, exist_ok=True)
+
+     counts = {"synced": 0, "failed": 0, "skipped": 0}
+     for node_idx, raw_host in enumerate(source.hosts):
+         raw_host = raw_host.strip()
+         if "@" in raw_host:
+             ssh_user, ssh_host = raw_host.split("@", 1)
+         else:
+             ssh_user = "ec2-user"
+             ssh_host = raw_host
+
+         for log_path in source.logs:
+             log_path = log_path.strip()
+             dest_file = dest_dir / f"{Path(log_path).name}-node{node_idx}"
+             ssh_opts = (
+                 f"ssh -i {key_path} "
+                 "-o StrictHostKeyChecking=no "
+                 "-o ConnectTimeout=15 "
+                 "-o BatchMode=yes"
+             )
+             cmd = [
+                 "rsync", "--append-verify", "-az", "--timeout=30",
+                 "-e", ssh_opts,
+                 f"{ssh_user}@{ssh_host}:{log_path}",
+                 str(dest_file),
+             ]
+             try:
+                 r = subprocess.run(
+                     cmd, capture_output=True, text=True, timeout=RSYNC_TIMEOUT
+                 )
+                 if r.returncode == 0:
+                     counts["synced"] += 1
+                     logger.debug("Synced %s@%s:%s → %s", ssh_user, ssh_host, log_path, dest_file.name)
+                 else:
+                     counts["failed"] += 1
+                     logger.warning(
+                         "Sync failed %s@%s:%s (rc=%d): %s",
+                         ssh_user, ssh_host, log_path, r.returncode, r.stderr.strip()[:200],
+                     )
+             except subprocess.TimeoutExpired:
+                 counts["failed"] += 1
+                 logger.warning("Sync timed out: %s@%s:%s", ssh_user, ssh_host, log_path)
+             except Exception as e:
+                 counts["failed"] += 1
+                 logger.warning("Sync error %s@%s:%s: %s", ssh_user, ssh_host, log_path, e)
+
+     return counts
+
+
+ def prune_synced_logs(cfg: SentinelConfig) -> dict:
+     """
+     Prune synced log files:
+     - Delete files not modified for sync_retention_days days (stale)
+     - Truncate files exceeding sync_max_file_mb MB (drop oldest half of lines)
+     """
+     import time
+     synced_base = Path(cfg.workspace_dir) / "synced"
+     if not synced_base.exists():
+         return {"deleted": 0, "truncated": 0}
+     retention_secs = cfg.sync_retention_days * 86400
+     max_bytes = cfg.sync_max_file_mb * 1024 * 1024
+     now = time.time()
+     deleted = truncated = 0
+     for log_file in synced_base.rglob("*"):
+         if not log_file.is_file():
+             continue
+         try:
+             age = now - log_file.stat().st_mtime
+             if age > retention_secs:
+                 log_file.unlink()
+                 deleted += 1
+                 logger.info(
+                     "log_syncer: pruned stale file %s (%.1f days old)",
+                     log_file.name, age / 86400,
+                 )
+                 continue
+             if log_file.stat().st_size > max_bytes:
+                 lines = log_file.read_text(encoding="utf-8", errors="replace").splitlines()
+                 keep = lines[len(lines) // 2:]
+                 log_file.write_text("\n".join(keep) + "\n", encoding="utf-8")
+                 truncated += 1
+                 logger.info(
+                     "log_syncer: trimmed %s — dropped %d lines (exceeded %dMB cap)",
+                     log_file.name, len(lines) - len(keep), cfg.sync_max_file_mb,
+                 )
+         except Exception as e:
+             logger.warning("log_syncer: prune error for %s: %s", log_file, e)
+     if deleted or truncated:
+         logger.info("log_syncer: pruning done — %d deleted, %d truncated", deleted, truncated)
+     return {"deleted": deleted, "truncated": truncated}
+
+
+ def sync_all(log_sources: dict, cfg: SentinelConfig, config_dir: Path) -> dict:
+     """Sync all SSH log sources, then prune old/oversized files. Returns summary dict."""
+     total = {"synced": 0, "failed": 0, "skipped": 0}
+     for source in log_sources.values():
+         r = sync_source(source, cfg, config_dir)
+         for k in total:
+             total[k] += r.get(k, 0)
+     logger.info(
+         "Log sync complete — %d file(s) synced, %d failed, %d skipped",
+         total["synced"], total["failed"], total["skipped"],
+     )
+     prune_synced_logs(cfg)
+     return total
+
+
+ def get_synced_files(source_name: str, workspace_dir: str) -> list[Path]:
+     """Return list of synced log files for a given source name, or [] if none."""
+     d = Path(workspace_dir) / "synced" / source_name
+     if not d.exists():
+         return []
+     return sorted(d.glob("*"))
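
Between them, the two new modules expose a small surface to the rest of the package: sync_all pulls the remote logs, get_synced_files lists what has landed locally, and health_checker.scan_startup_failure greps the tails. A rough usage sketch follows; the import path, workspace path, and source name are placeholders, not values from this release.

from sentinel.log_syncer import get_synced_files        # import path assumed
from sentinel.health_checker import scan_startup_failure

WORKSPACE = "/var/sentinel/workspace"                    # placeholder workspace_dir
SOURCE = "billing-api-logs"                              # placeholder log source name

# Files land under <workspace_dir>/synced/<source-name>/<log-filename>-node<N>
for f in get_synced_files(SOURCE, WORKSPACE):
    print(f.name)                                        # e.g. app.log-node0, app.log-node1

# Scan the most recent ~200KB of each synced file for crash signatures
found, first_line = scan_startup_failure(WORKSPACE, [SOURCE])
if found:
    print("startup failure detected:", first_line)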
@@ -28,6 +28,7 @@ from .log_parser import parse_all, scan_all_for_markers, ErrorEvent
  from .issue_watcher import scan_issues, mark_done, IssueEvent
  from .repo_router import route
  from .reporter import build_and_send, send_fix_notification, send_failure_notification, send_confirmed_notification, send_regression_notification, send_startup_notification, send_upgrade_notification
+ from .health_checker import evaluate_repos
  from .state_store import StateStore

  logging.basicConfig(
@@ -299,6 +300,48 @@ async def poll_cycle(cfg_loader: ConfigLoader, store: StateStore):
          return_exceptions=True,
      )

+
+     # -- Health URL checks -------------------------------------------------------
+     if cfg_loader.repos:
+         health_results = evaluate_repos(
+             cfg_loader.repos, cfg_loader.log_sources, cfg_loader.sentinel.workspace_dir,
+             store=store,
+         )
+         for hr in health_results:
+             if hr["action"] == "fix":
+                 fp = f"health-{hr['repo_name']}"
+                 store.record_error(fp, f"health_checker/{hr['repo_name']}", hr["message"])
+                 if not store.fix_attempted_recently(fp, hours=6):
+                     synth = ErrorEvent(
+                         source=f"health_checker/{hr['repo_name']}",
+                         severity="ERROR",
+                         message=f"App startup failure: {hr['message']}",
+                         raw_lines=[hr["startup_failure_line"]],
+                         timestamp=None,
+                     )
+                     synth.fingerprint = fp
+                     await _handle_error(synth, cfg_loader, store)
+             elif hr["action"] == "alert_once":
+                 from .notify import slack_alert
+                 slack_alert(
+                     cfg_loader.sentinel.slack_bot_token,
+                     cfg_loader.sentinel.slack_channel,
+                     (
+                         f":question: *{hr['repo_name']}* health returned {hr['message']}"
+                         " with no startup errors in logs.\n"
+                         "If this is deliberate maintenance, tell Boss: "
+                         f"`maintenance {hr['repo_name']}` \n"
+                         "I'll silently monitor until it's back online."
+                     ),
+                 )
+             elif hr["action"] == "recovered":
+                 from .notify import slack_alert
+                 slack_alert(
+                     cfg_loader.sentinel.slack_bot_token,
+                     cfg_loader.sentinel.slack_channel,
+                     f":white_check_mark: *{hr['repo_name']}* is back online.",
+                 )
+
      if cfg_loader.sentinel.send_health and (_report_requested or _report_due(cfg_loader, store)):
          _report_requested = False
          logger.info("Sending health digest...")
@@ -539,6 +582,23 @@ async def _upgrade_check_loop(cfg_loader: ConfigLoader):
          await asyncio.sleep(cfg_loader.sentinel.upgrade_check_hours * 3600)


+ async def _sync_loop(cfg_loader: ConfigLoader):
+     """Background task: rsync remote logs to local workspace/synced/."""
+     from .log_syncer import sync_all
+     # Short initial delay — let startup checks finish first
+     await asyncio.sleep(30)
+     while True:
+         try:
+             sync_all(
+                 cfg_loader.log_sources,
+                 cfg_loader.sentinel,
+                 cfg_loader.config_dir,
+             )
+         except Exception as e:
+             logger.warning("Log sync loop error: %s", e)
+         await asyncio.sleep(cfg_loader.sentinel.sync_interval_seconds)
+
+
  # ── Entry point ──────────────────────────────────────────────────────────────────────────────────

  def _log_auth_status(cfg: SentinelConfig) -> None:
@@ -605,6 +665,8 @@ async def run_loop(cfg_loader: ConfigLoader, store: StateStore):
      asyncio.ensure_future(_config_poll_loop(cfg_loader))
      if cfg_loader.sentinel.auto_upgrade:
          asyncio.ensure_future(_upgrade_check_loop(cfg_loader))
+     if cfg_loader.sentinel.sync_enabled:
+         asyncio.ensure_future(_sync_loop(cfg_loader))
      if cfg_loader.sentinel.slack_bot_token:
          from .slack_bot import run_slack_bot
          asyncio.ensure_future(run_slack_bot(cfg_loader, store))
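
For reference, the new code paths read a handful of settings from SentinelConfig (workspace_dir, sync_enabled, sync_interval_seconds, sync_retention_days, sync_max_file_mb, plus the existing slack_bot_token and slack_channel) and a per-repo HEALTH_URL. A hypothetical settings sketch follows; the key names are taken from the attribute accesses in this diff, while the values and the actual configuration format (defined in config_loader, which is not part of this diff) are assumptions.

sentinel_settings = {
    "workspace_dir": "/var/sentinel/workspace",  # synced logs land under <workspace_dir>/synced/
    "sync_enabled": True,                        # gates the _sync_loop background task in run_loop()
    "sync_interval_seconds": 300,                # sleep between sync_all() passes
    "sync_retention_days": 14,                   # prune_synced_logs() deletes files older than this
    "sync_max_file_mb": 50,                      # oversized files are trimmed to their newest half
    "slack_bot_token": "xoxb-placeholder",       # used by slack_alert() for the health notices
    "slack_channel": "#ops-alerts",
}

repo_settings = {
    "billing-api": {"HEALTH_URL": "https://billing.example.com/health"},  # polled by check_health()
}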