npm - @misterhuydo/sentinel - Versions diffs - 1.2.6 → 1.2.8 - Mend

@misterhuydo/sentinel 1.2.6 → 1.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

package/.cairn/.hint-lock +1 -1
package/.cairn/session.json +2 -2
package/package.json +1 -1
package/python/sentinel/fix_engine.py +23 -5
package/python/sentinel/issue_watcher.py +8 -0
package/python/sentinel/main.py +34 -30
package/python/sentinel/notify.py +249 -173
package/python/sentinel/sentinel_boss.py +151 -5

package/.cairn/.hint-lock CHANGED Viewed

	@@ -1 +1 @@
1	- 2026-03-23T11:11:25.885Z
1	+ 2026-03-23T11:43:23.881Z

package/.cairn/session.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
-  "message": "Auto-checkpoint at 2026-03-23T11:34:06.855Z",
-  "checkpoint_at": "2026-03-23T11:34:06.857Z",
+  "message": "Auto-checkpoint at 2026-03-23T11:46:43.946Z",
+  "checkpoint_at": "2026-03-23T11:46:43.948Z",
   "active_files": [],
   "notes": [],
   "mtime_snapshot": {}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@misterhuydo/sentinel",
-  "version": "1.2.6",
+  "version": "1.2.8",
   "description": "Sentinel — Autonomous DevOps Agent installer and manager",
   "bin": {
     "sentinel": "./bin/sentinel.js"

package/python/sentinel/fix_engine.py CHANGED Viewed

@@ -80,9 +80,22 @@ def _build_prompt(event, repo: RepoConfig, log_file, marker: str, stale_markers:
         f"1. {step1}",
         "2. Use your available tools to explore the codebase and identify the root cause.",
         f"3. {marker_instruction}",
-        "4. Output ONLY a unified diff patch (git diff format) fixing the issue.",
-        "5. Do not explain. Output only the patch.",
-        "6. If you cannot determine a safe fix, output: SKIP: <reason>",
+        "4. Consider all possible fix approaches. For each, weigh:",
+        "   - Confidence: is this definitely the root cause?",
+        "   - Safety: could this break other functionality?",
+        "   - Scope: is it minimal and targeted?",
+        "   Choose the safest minimal approach. If multiple valid options exist, pick the one",
+        "   with highest confidence and lowest blast radius.",
+        "5. Output ONLY a unified diff patch (git diff format) for the chosen fix.",
+        "6. Do not explain. Output only the patch.",
+        "7. Only if you truly cannot produce a safe fix — e.g. the root cause requires a",
+        "   DB schema change, infrastructure update, business logic decision, or is inside",
+        "   a third-party library — output exactly:",
+        "   NEEDS_HUMAN: <explanation>",
+        "   Include: (a) root cause identified, (b) approaches you considered and why each",
+        "   was insufficient or unsafe, (c) exactly what a human needs to do or decide.",
+        "   Do NOT output NEEDS_HUMAN just because the fix is complex — only when human",
+        "   judgement or access is genuinely required.",
     ]
     return "\n".join(lines_out)
@@ -148,7 +161,7 @@ def generate_fix(
     Returns:
         (status, patch_path, marker)
-        status: "patch" | "skip" | "error"
+        status: "patch" | "skip" | "needs_human" | "error"
     Auth strategy — API key and Claude Pro (OAuth) are interchangeable:
       Primary  : Claude Pro (OAuth) if claude_pro_for_tasks=True, else API key
@@ -237,10 +250,15 @@ def generate_fix(
         output=output,
     )
+    if output.strip().upper().startswith("NEEDS_HUMAN:"):
+        reason = output.strip()[len("NEEDS_HUMAN:"):].strip()
+        logger.info("Claude needs human for %s: %s", event.fingerprint, reason[:200])
+        return "needs_human", None, reason
     if output.strip().upper().startswith("SKIP:"):
         reason = output.strip()[5:].strip()
         logger.info("Claude skipped fix for %s: %s", event.fingerprint, reason)
-        return "skip", None, ""
+        return "skip", None, reason
     patch = _extract_patch(output)
     if not patch:

package/python/sentinel/issue_watcher.py CHANGED Viewed

@@ -41,6 +41,7 @@ class IssueEvent:
     fingerprint: str = ""
     severity: str = "ERROR"
     timestamp: str = ""
+    submitter_user_id: str = ""  # Slack user ID who raised this via Boss, if known
     # Compatibility fields matching ErrorEvent interface
     level: str = "ERROR"
@@ -53,6 +54,13 @@ class IssueEvent:
         if not self.fingerprint:
             raw = f"issue:{self.source}:{self.message[:200]}"
             self.fingerprint = hashlib.sha1(raw.encode()).hexdigest()[:16]
+        if not self.submitter_user_id:
+            import re as _re
+            for _line in self.body.splitlines():
+                _m = _re.match(r'SUBMITTED_BY:.*\(([UW][A-Z0-9]+)\)', _line.strip())
+                if _m:
+                    self.submitter_user_id = _m.group(1)
+                    break
         if not self.timestamp:
             self.timestamp = datetime.now(timezone.utc).isoformat()
         if not self.stack_trace:

package/python/sentinel/main.py CHANGED Viewed

@@ -28,6 +28,7 @@ from .log_parser import parse_all, scan_all_for_markers, ErrorEvent
 from .issue_watcher import scan_issues, mark_done, IssueEvent
 from .repo_router import route
 from .reporter import build_and_send, send_fix_notification, send_failure_notification, send_confirmed_notification, send_regression_notification, send_startup_notification, send_upgrade_notification
+from .notify import notify_fix_blocked
 from .health_checker import evaluate_repos
 from .state_store import StateStore
@@ -87,27 +88,29 @@ async def _handle_error(event: ErrorEvent, cfg_loader: ConfigLoader, store: Stat
     status, patch_path, marker = generate_fix(event, repo, sentinel, patches_dir, store)
     if status != "patch" or patch_path is None:
-        outcome = "skipped" if status == "skip" else "failed"
+        outcome = "skipped" if status in ("skip", "needs_human") else "failed"
         store.record_fix(event.fingerprint, outcome, repo_name=repo.repo_name)
-        send_failure_notification(sentinel, {
-            "source":    event.source,
-            "message":   event.message,
-            "repo_name": repo.repo_name,
-            "reason":    f"Claude Code returned {status.upper()}",
-            "body":      event.full_text()[:500],
-        })
+        submitter_uid = getattr(event, "submitter_user_id", "")
+        if status == "needs_human":
+            # marker holds the reason string for needs_human
+            notify_fix_blocked(sentinel, event.source, event.message,
+                               reason=marker, repo_name=repo.repo_name,
+                               submitter_user_id=submitter_uid)
+        else:
+            notify_fix_blocked(sentinel, event.source, event.message,
+                               reason=f"Claude Code returned {status.upper()}",
+                               repo_name=repo.repo_name,
+                               submitter_user_id=submitter_uid)
         return
     commit_status, commit_hash = apply_and_commit(event, patch_path, repo, sentinel)
     if commit_status != "committed":
         store.record_fix(event.fingerprint, "failed", repo_name=repo.repo_name)
-        send_failure_notification(sentinel, {
-            "source":    event.source,
-            "message":   event.message,
-            "repo_name": repo.repo_name,
-            "reason":    "patch generated but commit/tests failed",
-            "body":      event.full_text()[:500],
-        })
+        submitter_uid = getattr(event, "submitter_user_id", "")
+        notify_fix_blocked(sentinel, event.source, event.message,
+                           reason="Patch was generated but commit/tests failed",
+                           repo_name=repo.repo_name,
+                           submitter_user_id=submitter_uid)
         return
     branch, pr_url = publish(event, repo, sentinel, commit_hash)
@@ -179,28 +182,29 @@ async def _handle_issue(event: IssueEvent, cfg_loader: ConfigLoader, store: Stat
     status, patch_path, marker = generate_fix(event, repo, sentinel, patches_dir, store)
     if status != "patch" or patch_path is None:
-        store.record_fix(event.fingerprint, "skipped" if status == "skip" else "failed",
+        store.record_fix(event.fingerprint, "skipped" if status in ("skip", "needs_human") else "failed",
                          repo_name=repo.repo_name)
-        send_failure_notification(sentinel, {
-            "source":    event.source,
-            "message":   event.message,
-            "repo_name": repo.repo_name,
-            "reason":    f"Claude Code returned {status.upper()}",
-            "body":      event.body[:500],
-        })
+        submitter_uid = getattr(event, "submitter_user_id", "")
+        if status == "needs_human":
+            notify_fix_blocked(sentinel, event.source, event.message,
+                               reason=marker, repo_name=repo.repo_name,
+                               submitter_user_id=submitter_uid)
+        else:
+            notify_fix_blocked(sentinel, event.source, event.message,
+                               reason=f"Claude Code returned {status.upper()}",
+                               repo_name=repo.repo_name,
+                               submitter_user_id=submitter_uid)
         mark_done(event.issue_file)
         return
     commit_status, commit_hash = apply_and_commit(event, patch_path, repo, sentinel)
     if commit_status != "committed":
         store.record_fix(event.fingerprint, "failed", repo_name=repo.repo_name)
-        send_failure_notification(sentinel, {
-            "source":    event.source,
-            "message":   event.message,
-            "repo_name": repo.repo_name,
-            "reason":    "patch generated but commit/tests failed",
-            "body":      event.body[:500],
-        })
+        submitter_uid = getattr(event, "submitter_user_id", "")
+        notify_fix_blocked(sentinel, event.source, event.message,
+                           reason="Patch was generated but commit/tests failed",
+                           repo_name=repo.repo_name,
+                           submitter_user_id=submitter_uid)
         mark_done(event.issue_file)
         return

package/python/sentinel/notify.py CHANGED Viewed

@@ -1,173 +1,249 @@
-"""
-notify.py — Best-effort Slack alerts from any Sentinel module.
-Uses the Slack Web API directly (no Bolt / Socket Mode required).
-Calls never raise — failures are logged and silently dropped.
-"""
-import logging
-import re
-import time
-import requests
-logger = logging.getLogger(__name__)
-# ── Rate-limit / auth-failure detector ────────────────────────────────────────
-_RATE_LIMIT_RE = re.compile(
-    r"rate.?limit|usage.?limit|too many requests|quota.?exceeded"
-    r"|overloaded|credit.?balance|billing|529"
-    r"|not.?authenticated|invalid.?api.?key|authentication.?fail"
-    r"|claude\.ai subscription|pro.?plan|login required",
-    re.IGNORECASE,
-)
-def is_rate_limited(text: str) -> bool:
-    """Return True if the text contains a rate-limit or auth-failure signal."""
-    return bool(_RATE_LIMIT_RE.search(text))
-# ── Circuit breaker ────────────────────────────────────────────────────────────
-#
-# Prevents alert storms when Claude is persistently rate-limited.
-# Each `source` string gets its own independent circuit:
-#   CLOSED → normal; alerts pass through immediately
-#   OPEN   → suppressed; one re-alert every CIRCUIT_COOLDOWN_SECONDS
-#
-# On recovery (first non-rate-limited output after OPEN):
-#   → post "resolved" to Slack, close the circuit
-CIRCUIT_COOLDOWN_SECONDS = 3600   # 1 h between repeat alerts while open
-# source → {opened_at, last_alerted_at, count}
-_circuits: dict[str, dict] = {}
-def get_circuit_status() -> dict:
-    """
-    Return a snapshot of all open circuits.
-    Used by the `check_auth_status` Boss tool.
-    Returns:
-        { source: { state, opened_at, open_for_seconds, alert_count } }
-        Only open circuits are included; an empty dict means everything is healthy.
-    """
-    now = time.time()
-    return {
-        src: {
-            "state": "open",
-            "opened_at": c["opened_at"],
-            "open_for_seconds": int(now - c["opened_at"]),
-            "alert_count": c["count"],
-        }
-        for src, c in _circuits.items()
-    }
-def _open_or_repeat(bot_token: str, channel: str, source: str, output: str) -> None:
-    """Open circuit on first hit; re-alert after cooldown if still failing."""
-    now = time.time()
-    circuit = _circuits.get(source)
-    if circuit is None:
-        # First occurrence — open and alert immediately
-        _circuits[source] = {"opened_at": now, "last_alerted_at": now, "count": 1}
-        logger.error("Circuit opened for %s: %s", source, output[:200])
-        slack_alert(bot_token, channel, rate_limit_message(source, output))
-        return
-    circuit["count"] += 1
-    elapsed = now - circuit["last_alerted_at"]
-    if elapsed >= CIRCUIT_COOLDOWN_SECONDS:
-        # Still failing after cooldown — remind admins once per hour
-        circuit["last_alerted_at"] = now
-        open_mins = int((now - circuit["opened_at"]) / 60)
-        msg = (
-            f":warning: *Sentinel — Claude usage/auth problem still active ({source})*\n"
-            f"Still failing after {open_mins} minutes. Total occurrences: {circuit['count']}.\n"
-            f"Last error:\n```{output.strip()[:300]}```\n"
-            f"Run `check_auth_status` in Slack to see the full picture."
-        )
-        logger.error("Circuit still open for %s (count=%d)", source, circuit["count"])
-        slack_alert(bot_token, channel, msg)
-    # else: within cooldown window — suppress
-def _close_if_open(bot_token: str, channel: str, source: str) -> None:
-    """If circuit was open, close it and post a recovery alert."""
-    circuit = _circuits.pop(source, None)
-    if circuit is None:
-        return
-    duration_mins = int((time.time() - circuit["opened_at"]) / 60)
-    msg = (
-        f":white_check_mark: *Sentinel — Claude auth restored ({source})*\n"
-        f"Fixed after {duration_mins} min. Total failures during outage: {circuit['count']}."
-    )
-    logger.info("Circuit closed for %s after %d min, %d failures", source, duration_mins, circuit["count"])
-    slack_alert(bot_token, channel, msg)
-def rate_limit_message(source: str, raw: str) -> str:
-    """Produce a human-readable Slack alert for a rate-limit / auth event (first occurrence)."""
-    snippet = raw.strip()[:300].replace("\n", " ")
-    return (
-        f":warning: *Sentinel — Claude usage/auth problem ({source})*\n"
-        f"Claude returned an error that requires admin attention:\n"
-        f"```{snippet}```\n"
-        f"*What to check:*\n"
-        f"• API key: verify `ANTHROPIC_API_KEY` in `sentinel.properties` is valid and has credit\n"
-        f"• Claude Pro: run `claude login` on the server to refresh the OAuth session\n"
-        f"• Both: Sentinel tries both methods — at least one must be working\n"
-        f"Repeat alerts will be suppressed for 1 hour. "
-        f"Run `check_auth_status` in Slack to see current state."
-    )
-# ── Alert dispatcher ──────────────────────────────────────────────────────────
-def slack_alert(bot_token: str, channel: str, text: str) -> None:
-    """
-    Post a plain-text alert to a Slack channel.
-    Best-effort: logs on failure, never raises.
-    """
-    if not bot_token or not channel:
-        logger.debug("slack_alert: no token/channel configured — logging only: %s", text[:120])
-        return
-    try:
-        resp = requests.post(
-            "https://slack.com/api/chat.postMessage",
-            headers={
-                "Authorization": f"Bearer {bot_token}",
-                "Content-Type": "application/json",
-            },
-            json={"channel": channel, "text": text},
-            timeout=10,
-        )
-        data = resp.json()
-        if not data.get("ok"):
-            logger.warning("slack_alert: Slack API error: %s", data.get("error"))
-    except Exception as exc:
-        logger.warning("slack_alert: failed to post: %s", exc)
-def alert_if_rate_limited(
-    bot_token: str,
-    channel: str,
-    source: str,
-    output: str,
-) -> bool:
-    """
-    Check output for rate-limit / auth signals and manage the circuit breaker.
-    - Rate limited  → open/keep-open circuit, alert (with cooldown suppression)
-    - Not limited   → close circuit if it was open (recovery alert), return False
-    Returns True if a rate-limit signal was found.
-    """
-    if not is_rate_limited(output):
-        _close_if_open(bot_token, channel, source)
-        return False
-    _open_or_repeat(bot_token, channel, source, output)
-    return True
+"""
+notify.py — Best-effort Slack alerts from any Sentinel module.
+Uses the Slack Web API directly (no Bolt / Socket Mode required).
+Calls never raise — failures are logged and silently dropped.
+"""
+import logging
+import re
+import time
+import requests
+logger = logging.getLogger(__name__)
+# ── Rate-limit / auth-failure detector ────────────────────────────────────────
+_RATE_LIMIT_RE = re.compile(
+    r"rate.?limit|usage.?limit|too many requests|quota.?exceeded"
+    r"|overloaded|credit.?balance|billing|529"
+    r"|not.?authenticated|invalid.?api.?key|authentication.?fail"
+    r"|claude\.ai subscription|pro.?plan|login required",
+    re.IGNORECASE,
+)
+def is_rate_limited(text: str) -> bool:
+    """Return True if the text contains a rate-limit or auth-failure signal."""
+    return bool(_RATE_LIMIT_RE.search(text))
+# ── Circuit breaker ────────────────────────────────────────────────────────────
+#
+# Prevents alert storms when Claude is persistently rate-limited.
+# Each `source` string gets its own independent circuit:
+#   CLOSED → normal; alerts pass through immediately
+#   OPEN   → suppressed; one re-alert every CIRCUIT_COOLDOWN_SECONDS
+#
+# On recovery (first non-rate-limited output after OPEN):
+#   → post "resolved" to Slack, close the circuit
+CIRCUIT_COOLDOWN_SECONDS = 3600   # 1 h between repeat alerts while open
+# source → {opened_at, last_alerted_at, count}
+_circuits: dict[str, dict] = {}
+def get_circuit_status() -> dict:
+    """
+    Return a snapshot of all open circuits.
+    Used by the `check_auth_status` Boss tool.
+    Returns:
+        { source: { state, opened_at, open_for_seconds, alert_count } }
+        Only open circuits are included; an empty dict means everything is healthy.
+    """
+    now = time.time()
+    return {
+        src: {
+            "state": "open",
+            "opened_at": c["opened_at"],
+            "open_for_seconds": int(now - c["opened_at"]),
+            "alert_count": c["count"],
+        }
+        for src, c in _circuits.items()
+    }
+def _open_or_repeat(bot_token: str, channel: str, source: str, output: str) -> None:
+    """Open circuit on first hit; re-alert after cooldown if still failing."""
+    now = time.time()
+    circuit = _circuits.get(source)
+    if circuit is None:
+        # First occurrence — open and alert immediately
+        _circuits[source] = {"opened_at": now, "last_alerted_at": now, "count": 1}
+        logger.error("Circuit opened for %s: %s", source, output[:200])
+        slack_alert(bot_token, channel, rate_limit_message(source, output))
+        return
+    circuit["count"] += 1
+    elapsed = now - circuit["last_alerted_at"]
+    if elapsed >= CIRCUIT_COOLDOWN_SECONDS:
+        # Still failing after cooldown — remind admins once per hour
+        circuit["last_alerted_at"] = now
+        open_mins = int((now - circuit["opened_at"]) / 60)
+        msg = (
+            f":warning: *Sentinel — Claude usage/auth problem still active ({source})*\n"
+            f"Still failing after {open_mins} minutes. Total occurrences: {circuit['count']}.\n"
+            f"Last error:\n```{output.strip()[:300]}```\n"
+            f"Run `check_auth_status` in Slack to see the full picture."
+        )
+        logger.error("Circuit still open for %s (count=%d)", source, circuit["count"])
+        slack_alert(bot_token, channel, msg)
+    # else: within cooldown window — suppress
+def _close_if_open(bot_token: str, channel: str, source: str) -> None:
+    """If circuit was open, close it and post a recovery alert."""
+    circuit = _circuits.pop(source, None)
+    if circuit is None:
+        return
+    duration_mins = int((time.time() - circuit["opened_at"]) / 60)
+    msg = (
+        f":white_check_mark: *Sentinel — Claude auth restored ({source})*\n"
+        f"Fixed after {duration_mins} min. Total failures during outage: {circuit['count']}."
+    )
+    logger.info("Circuit closed for %s after %d min, %d failures", source, duration_mins, circuit["count"])
+    slack_alert(bot_token, channel, msg)
+def rate_limit_message(source: str, raw: str) -> str:
+    """Produce a human-readable Slack alert for a rate-limit / auth event (first occurrence)."""
+    snippet = raw.strip()[:300].replace("\n", " ")
+    return (
+        f":warning: *Sentinel — Claude usage/auth problem ({source})*\n"
+        f"Claude returned an error that requires admin attention:\n"
+        f"```{snippet}```\n"
+        f"*What to check:*\n"
+        f"• API key: verify `ANTHROPIC_API_KEY` in `sentinel.properties` is valid and has credit\n"
+        f"• Claude Pro: run `claude login` on the server to refresh the OAuth session\n"
+        f"• Both: Sentinel tries both methods — at least one must be working\n"
+        f"Repeat alerts will be suppressed for 1 hour. "
+        f"Run `check_auth_status` in Slack to see current state."
+    )
+# ── Alert dispatcher ──────────────────────────────────────────────────────────
+def slack_alert(bot_token: str, channel: str, text: str) -> None:
+    """
+    Post a plain-text alert to a Slack channel.
+    Best-effort: logs on failure, never raises.
+    """
+    if not bot_token or not channel:
+        logger.debug("slack_alert: no token/channel configured — logging only: %s", text[:120])
+        return
+    try:
+        resp = requests.post(
+            "https://slack.com/api/chat.postMessage",
+            headers={
+                "Authorization": f"Bearer {bot_token}",
+                "Content-Type": "application/json",
+            },
+            json={"channel": channel, "text": text},
+            timeout=10,
+        )
+        data = resp.json()
+        if not data.get("ok"):
+            logger.warning("slack_alert: Slack API error: %s", data.get("error"))
+    except Exception as exc:
+        logger.warning("slack_alert: failed to post: %s", exc)
+def slack_dm(bot_token: str, user_id: str, text: str) -> None:
+    """
+    Send a direct message to a specific Slack user.
+    Opens a DM channel via conversations.open, then posts.
+    Best-effort: logs on failure, never raises.
+    """
+    if not bot_token or not user_id:
+        logger.debug("slack_dm: no token/user_id — skipping DM")
+        return
+    try:
+        resp = requests.post(
+            "https://slack.com/api/conversations.open",
+            headers={"Authorization": f"Bearer {bot_token}", "Content-Type": "application/json"},
+            json={"users": user_id},
+            timeout=10,
+        )
+        data = resp.json()
+        if not data.get("ok"):
+            logger.warning("slack_dm: conversations.open failed: %s", data.get("error"))
+            return
+        dm_channel = data["channel"]["id"]
+        slack_alert(bot_token, dm_channel, text)
+    except Exception as exc:
+        logger.warning("slack_dm: failed to DM %s: %s", user_id, exc)
+def notify_fix_blocked(
+    cfg,
+    source: str,
+    message: str,
+    reason: str,
+    repo_name: str = "",
+    submitter_user_id: str = "",
+) -> None:
+    """
+    Notify that a fix needs human intervention.
+    - If submitter_user_id is known: DM that person directly.
+    - Otherwise: @channel in the configured Slack channel.
+    - Always: email admins via reporter.send_failure_notification.
+    """
+    short_reason = (reason or "Claude could not determine a safe fix.")[:600]
+    repo_line = f"\n*Repo:* {repo_name}" if repo_name else ""
+    slack_text = (
+        f":hand: *Fix blocked — human intervention needed*\n"
+        f"*Source:* {source}\n"
+        f"*Issue:* {message[:200]}{repo_line}\n"
+        f"*Reason:*\n{short_reason}"
+    )
+    if submitter_user_id:
+        slack_dm(cfg.slack_bot_token, submitter_user_id, slack_text)
+    else:
+        # No known submitter — broadcast to the whole channel
+        slack_alert(
+            cfg.slack_bot_token,
+            cfg.slack_channel,
+            f"<!channel> {slack_text}",
+        )
+    # Always email admins
+    try:
+        from .reporter import send_failure_notification
+        send_failure_notification(cfg, {
+            "source":    source,
+            "message":   message,
+            "repo_name": repo_name,
+            "reason":    f"Needs human intervention: {short_reason[:200]}",
+            "body":      reason,
+        })
+    except Exception as exc:
+        logger.warning("notify_fix_blocked: email notification failed: %s", exc)
+def alert_if_rate_limited(
+    bot_token: str,
+    channel: str,
+    source: str,
+    output: str,
+) -> bool:
+    """
+    Check output for rate-limit / auth signals and manage the circuit breaker.
+    - Rate limited  → open/keep-open circuit, alert (with cooldown suppression)
+    - Not limited   → close circuit if it was open (recovery alert), return False
+    Returns True if a rate-limit signal was found.
+    """
+    if not is_rate_limited(output):
+        _close_if_open(bot_token, channel, source)
+        return False
+    _open_or_repeat(bot_token, channel, source, output)
+    return True

package/python/sentinel/sentinel_boss.py CHANGED Viewed

@@ -240,6 +240,29 @@ When to act vs. when to ask:
 - If a tool call will take a moment (search, fetch, pull), prefix your reply with a brief "working" line ending in "..." before the results, e.g. "Searching SSOLWA for TryDig activity..." then the actual output.
   Never just say a working line and stop — always follow it with the results in the same message.
+Search reasoning — always do this before calling filter_logs or search_logs:
+1. Interpret intent: what is the user actually looking for? Don't pass the raw message as the query.
+   Examples:
+   - "TryDig errors" → query="TryDig" (component name; look for it in any context)
+   - "payment failures last hour" → query="pay|payment|transaction", since_hours=1
+   - "why is the app crashing" → query="Exception|Error|FAILED|crash", look for stack traces
+   - "login issues today" → query="login|auth|401|403|session", since_hours=24
+   - "slow requests" → query="timeout|slow|latency|took [0-9]+ms|duration"
+   - "startup problems" → query="APPLICATION FAILED|BeanCreation|NoSuchMethod|ClassNotFound"
+   Use | in the regex to cover synonyms and related terms. Keep it focused — not too broad.
+2. Choose since_hours if a time window is implied ("last hour", "today", "this morning").
+3. Pick source if the user mentioned a specific service (SSOLWA, STS, etc.) or server.
+After getting filter_logs results, always synthesize — never dump raw output:
+- Lead with 1-2 sentences: total count, affected sources, dominant pattern.
+  e.g. "Found 47 matches across SSOLWA and STS — mostly NullPointerException in DigService (31 hits)."
+- List the top 3-5 patterns with counts in plain language.
+- Call out any notable time clustering (e.g. "spike between 10:23–10:47 UTC").
+- Show 2-3 example lines at most — only the most informative ones.
+- End with a recommendation if the pattern suggests something actionable:
+  e.g. "Looks like a dependency resolution issue — create an issue?" or "Pattern consistent with a null config value at startup."
+- If total_matches=0, say so plainly and suggest what else to try.
 Session context — critical rules:
 - Loaded conversation history is prior-session background only. It may be hours or days old.
 - NEVER say "the previous search", "I already fetched", "as I found earlier", or any phrase implying you already did part of the current task — unless a tool result appears in THIS response's tool calls.
@@ -1341,11 +1364,29 @@ async def _run_tool(name: str, inputs: dict, cfg_loader, store, slack_client=Non
     if name == "filter_logs":
         import re as _re
+        from collections import Counter as _Counter
         from datetime import datetime, timedelta, timezone as _tz
+        # Extract a short grouping key from a log line for pattern analysis
+        _EXC_PAT  = _re.compile(r'([A-Z][a-zA-Z]+(?:Exception|Error|Failure|Fault|Warning))')
+        _LVL_PAT  = _re.compile(r'\b(ERROR|WARN(?:ING)?|CRITICAL|FATAL|SEVERE)\b', _re.IGNORECASE)
+        def _signature(line):
+            exc = _EXC_PAT.search(line)
+            if exc:
+                return exc.group(1)
+            m = _LVL_PAT.search(line)
+            if m:
+                after = line[m.end():].strip()
+                token = after.split()[0].rstrip(':.,') if after.split() else ''
+                if token and len(token) > 2:
+                    return m.group(1).upper() + ' ' + token[:40]
+            return line.strip()[:40]
         query_f      = inputs.get("query", "")
         source_f     = inputs.get("source", "").lower()
         since_hours  = inputs.get("since_hours")
-        max_matches  = int(inputs.get("max_matches", 50))
+        max_matches  = int(inputs.get("max_matches", 300))
         case_flag    = 0 if inputs.get("case_sensitive") else _re.IGNORECASE
         try:
             pat = _re.compile(query_f, case_flag)
@@ -1416,12 +1457,117 @@ async def _run_tool(name: str, inputs: dict, cfg_loader, store, slack_client=Non
                 "note": "No matches found in synced logs.",
             })
+        try:
+            pat = _re.compile(query_f, case_flag)
+        except _re.error as e:
+            return json.dumps({"error": f"Invalid regex: {e}"})
+        synced_base = Path("workspace/synced")
+        if not synced_base.exists():
+            return json.dumps({
+                "error": "No synced logs found.",
+                "hint": "Log sync runs every SYNC_INTERVAL_SECONDS (default 300s). "
+                        "If just started, wait a minute then try again.",
+            })
+        cutoff = None
+        if since_hours:
+            cutoff = datetime.now(_tz.utc) - timedelta(hours=int(since_hours))
+        if source_f:
+            src_dirs = [d for d in sorted(synced_base.iterdir())
+                        if d.is_dir() and source_f in d.name.lower()]
+        else:
+            src_dirs = [d for d in sorted(synced_base.iterdir()) if d.is_dir()]
+        if not src_dirs:
+            available = [d.name for d in synced_base.iterdir() if d.is_dir()]
+            return json.dumps({
+                "error": f"No synced source matching '{source_f}'",
+                "available_sources": available,
+            })
+        all_matches = []   # list of (source_name, line)
+        sources_hit = set()
+        for src_dir in src_dirs:
+            for log_file in sorted(src_dir.glob("*")):
+                try:
+                    lines = log_file.read_text(encoding="utf-8", errors="replace").splitlines()
+                    for line in lines:
+                        if not pat.search(line):
+                            continue
+                        if cutoff:
+                            from .log_fetcher import _parse_line_ts
+                            ts = _parse_line_ts(line)
+                            if ts and ts < cutoff:
+                                continue
+                        all_matches.append((src_dir.name, line[:300]))
+                        sources_hit.add(src_dir.name)
+                        if len(all_matches) >= max_matches:
+                            break
+                except Exception:
+                    pass
+            if len(all_matches) >= max_matches:
+                break
+        total = len(all_matches)
+        if total == 0:
+            return json.dumps({
+                "query": query_f,
+                "total_matches": 0,
+                "sources_searched": [d.name for d in src_dirs],
+                "note": "No matches found in synced logs.",
+            })
+        # Pattern grouping: count occurrences of each error signature
+        sig_counter = _Counter()
+        sig_examples = {}
+        for src, line in all_matches:
+            sig = _signature(line)
+            sig_counter[sig] += 1
+            if sig not in sig_examples:
+                sig_examples[sig] = f"[{src}] {line}"
+        top_patterns = [
+            {"pattern": sig, "count": cnt, "example": sig_examples[sig][:250]}
+            for sig, cnt in sig_counter.most_common(10)
+        ]
+        # Sample: first unique-signature line from each source
+        sample_lines = []
+        seen_sigs = set()
+        for src, line in all_matches:
+            sig = _signature(line)
+            if sig not in seen_sigs:
+                sample_lines.append(f"[{src}] {line}")
+                seen_sigs.add(sig)
+            if len(sample_lines) >= 10:
+                break
+        # Time span
+        time_span = {}
+        try:
+            from .log_fetcher import _parse_line_ts
+            timestamps = [_parse_line_ts(ln) for _, ln in all_matches]
+            timestamps = [t for t in timestamps if t]
+            if timestamps:
+                time_span = {
+                    "earliest": min(timestamps).strftime("%Y-%m-%d %H:%M:%S UTC"),
+                    "latest":   max(timestamps).strftime("%Y-%m-%d %H:%M:%S UTC"),
+                }
+        except Exception:
+            pass
         return json.dumps({
-            "query": query_f,
-            "mode": "local",
-            "total_matches": total_matches,
+            "query":            query_f,
+            "total_matches":    total,
+            "sources_hit":      sorted(sources_hit),
             "sources_searched": [d.name for d in src_dirs],
-            "results": results,
+            "top_patterns":     top_patterns,
+            "sample_lines":     sample_lines,
+            "time_span":        time_span,
+            "capped":           total >= max_matches,
         })
     if name == "trigger_poll":