npm - @misterhuydo/sentinel - Versions diffs - 1.4.0 → 1.4.2 - Mend

@misterhuydo/sentinel 1.4.0 → 1.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

package/.cairn/.hint-lock +1 -1
package/.cairn/session.json +2 -2
package/package.json +1 -1
package/python/sentinel/fix_engine.py +13 -2
package/python/sentinel/log_fetcher.py +11 -1
package/python/sentinel/sentinel_boss.py +128 -1
package/templates/log-configs/_example.properties +9 -2
package/templates/repo-configs/_example.properties +6 -0
package/templates/sentinel.properties +4 -0
package/templates/workspace-sentinel.properties +11 -0

package/.cairn/.hint-lock CHANGED Viewed

	@@ -1 +1 @@
1	- 2026-03-24T07:49:55.081Z
1	+ 2026-03-24T08:21:39.465Z

package/.cairn/session.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
-  "message": "Auto-checkpoint at 2026-03-24T08:01:08.778Z",
-  "checkpoint_at": "2026-03-24T08:01:08.779Z",
+  "message": "Auto-checkpoint at 2026-03-24T08:34:58.068Z",
+  "checkpoint_at": "2026-03-24T08:34:58.069Z",
   "active_files": [],
   "notes": [],
   "mtime_snapshot": {}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@misterhuydo/sentinel",
-  "version": "1.4.0",
+  "version": "1.4.2",
   "description": "Sentinel — Autonomous DevOps Agent installer and manager",
   "bin": {
     "sentinel": "./bin/sentinel.js"

package/python/sentinel/fix_engine.py CHANGED Viewed

@@ -28,7 +28,7 @@ _DIFF_BLOCK = re.compile(r"```(?:diff|patch)?\n(.*?)```", re.DOTALL)
 _DIFF_HEADER = re.compile(r"^diff --git|^---\s+\S+|^\+\+\+\s+\S+", re.MULTILINE)
-def _build_prompt(event, repo: RepoConfig, log_file, marker: str, stale_markers: list[str] = None) -> str:
+def _build_prompt(event, repo: RepoConfig, log_file, marker: str, stale_markers: list[str] = None, synced_files: list = None) -> str:
     if log_file and log_file.exists():
         ctx = (
             "LOG FILE: " + str(log_file) + "\n"
@@ -44,6 +44,15 @@ def _build_prompt(event, repo: RepoConfig, log_file, marker: str, stale_markers:
         )
         step1 = "Use the issue description above as your primary context."
+    if synced_files:
+        paths = "\n".join(f"  {p}" for p in synced_files)
+        ctx += (
+            "\n\nFULL LOG HISTORY (rsync'd from remote, unfiltered):\n"
+            + paths + "\n"
+            "These files contain the complete unfiltered log history. "
+            "Use them to search for patterns, related errors, or context preceding this issue."
+        )
     marker_label = marker + " sentinel-auto-fix [safe to remove after verification]"
     marker_instruction = "\n".join([
         "For EVERY method and constructor you modify, add this as the FIRST executable line:",
@@ -205,7 +214,9 @@ def generate_fix(
     log_file = Path(cfg.workspace_dir) / "fetched" / f"{event.source}.log"
     if not log_file.exists():
         log_file = None
-    prompt = _build_prompt(event, repo, log_file, marker)
+    from .log_syncer import get_synced_files
+    synced = get_synced_files(event.source, cfg.workspace_dir)
+    prompt = _build_prompt(event, repo, log_file, marker, synced_files=synced or None)
     # -- Cross-source dedup: skip if fingerprint already fixed in recent git commits ------
     if repo.local_path:

package/python/sentinel/log_fetcher.py CHANGED Viewed

@@ -196,5 +196,15 @@ def _fetch_cloudflare(src, cfg):
     rolling_path = workspace / f"{src.name}.log"
     _rolling_update(rolling_path, "\n".join(lines), cfg.log_retention_hours)
-    logger.info("Cloudflare fetch %s: %d new lines -> %s", src.name, len(lines), rolling_path)
+    # Accumulate into workspace/synced/ with longer retention so Claude Code
+    # (and ask_logs / filter_logs) can inspect full history — same as SSH rsync.
+    retention_hours = getattr(cfg, "sync_retention_days", 7) * 24
+    synced_dir = Path(cfg.workspace_dir) / "synced" / src.name
+    synced_dir.mkdir(parents=True, exist_ok=True)
+    synced_path = synced_dir / "cloudflare.log"
+    _rolling_update(synced_path, "\n".join(lines), retention_hours)
+    logger.info("Cloudflare fetch %s: %d new lines -> %s (synced: %s)",
+                src.name, len(lines), rolling_path, synced_path)
     return [rolling_path]

package/python/sentinel/sentinel_boss.py CHANGED Viewed

@@ -108,7 +108,16 @@ What you can do (tools available):
                        e.g. "what does the 1881 backend do?", "find PIN validation in elprint",
                             "any TODOs in cairn?", "are there security issues in elprint-sales?"
-20. restart_project  — Stop and restart a specific Sentinel monitoring instance (stop.sh + start.sh).
+20. ask_logs         — Ask Claude Code to search and summarize logs for a source.
+                       Claude Code reads the full log history (synced + rolling) and answers.
+                       Use when the user asks something that requires reading and reasoning
+                       over log content — not just a grep match.
+                       e.g. "what errors happened yesterday in SSOLWA?",
+                            "summarize the last week of STS logs",
+                            "what's been causing 400s in the 1881 logs?",
+                            "any unusual patterns in elprint logs recently?"
+21. restart_project  — Stop and restart a specific Sentinel monitoring instance (stop.sh + start.sh).
                        This restarts the Sentinel agent for that project, NOT the application itself.
                        e.g. "restart sentinel for 1881", "restart the 1881 monitor", "reload elprint sentinel"
@@ -132,6 +141,7 @@ reply with a short summary grouped by category:
 • `search_logs` — live SSH grep on production servers — "search logs for illegal PIN in 1881"
 • `filter_logs` — instant grep on locally-synced logs (no SSH) — "filter logs for TryDig", "show errors from last 24h"
 • `tail_log` — last N lines of a log source, no filter — "show recent SSOLWA logs"
+• `ask_logs` — ask Claude Code to read and summarize logs — "what happened in SSOLWA yesterday?", "summarize last week of STS logs"
 *Codebase questions*
 • `ask_codebase` — any question about a repo's code — "what does 1881 do?", "find PIN validation", "any TODOs?", "security issues?"
@@ -769,6 +779,33 @@ _TOOLS = [
             "required": ["source"],
         },
     },
+    {
+        "name": "ask_logs",
+        "description": (
+            "Ask Claude Code to search and summarize log files for a source. "
+            "Claude Code reads the full log history (rsync'd synced logs + rolling window) "
+            "and answers the question using its file tools — not just a regex match. "
+            "Use for analysis questions that require reading and reasoning over log content. "
+            "e.g. 'what errors happened yesterday in SSOLWA?', "
+            "'summarize last week of STS logs', "
+            "'what's been causing 400s in 1881 logs?', "
+            "'any unusual patterns in elprint logs recently?'"
+        ),
+        "input_schema": {
+            "type": "object",
+            "properties": {
+                "source": {
+                    "type": "string",
+                    "description": "Log source name (partial match, e.g. 'SSOLWA', 'STS'). Leave blank to query all sources.",
+                },
+                "question": {
+                    "type": "string",
+                    "description": "Natural language question about the logs",
+                },
+            },
+            "required": ["question"],
+        },
+    },
     {
         "name": "post_file",
         "description": (
@@ -1878,6 +1915,96 @@ async def _run_tool(name: str, inputs: dict, cfg_loader, store, slack_client=Non
         results = [_ask_one(rn, r) for rn, r in matched]
         return json.dumps({"project": target, "repos_queried": len(results), "results": results})
+    if name == "ask_logs":
+        question   = inputs.get("question", "")
+        source_arg = inputs.get("source", "").lower()
+        cfg = cfg_loader.sentinel
+        workspace  = Path(cfg.workspace_dir)
+        synced_base = workspace / "synced"
+        fetched_base = workspace / "fetched"
+        # Collect all relevant log files for the requested source(s)
+        log_files = []
+        if source_arg:
+            # Synced history
+            if synced_base.exists():
+                for d in sorted(synced_base.iterdir()):
+                    if d.is_dir() and source_arg in d.name.lower():
+                        log_files.extend(sorted(d.glob("*")))
+            # Rolling fetched window
+            for f in sorted(fetched_base.glob("*.log")):
+                if source_arg in f.stem.lower() and f not in log_files:
+                    log_files.append(f)
+        else:
+            # All sources
+            if synced_base.exists():
+                for d in sorted(synced_base.iterdir()):
+                    if d.is_dir():
+                        log_files.extend(sorted(d.glob("*")))
+            for f in sorted(fetched_base.glob("*.log")):
+                if f not in log_files:
+                    log_files.append(f)
+        if not log_files:
+            hint = (
+                f"No log files found for source '{source_arg}'."
+                if source_arg else "No log files found."
+            )
+            available = (
+                [d.name for d in synced_base.iterdir() if d.is_dir()]
+                if synced_base.exists() else []
+            )
+            return json.dumps({
+                "error": hint,
+                "available_sources": available,
+                "hint": "Run fetch_logs first, or wait for the next poll cycle.",
+            })
+        file_list = "\n".join(f"  {p}" for p in log_files)
+        prompt = (
+            f"You are analyzing production logs.\n\n"
+            f"QUESTION: {question}\n\n"
+            f"LOG FILES (use your Read and Grep tools to search these):\n{file_list}\n\n"
+            f"Search the log files and answer the question. "
+            f"Be concise and direct. Plain text only — no markdown."
+        )
+        env = os.environ.copy()
+        if cfg.anthropic_api_key and not cfg.claude_pro_for_tasks:
+            env["ANTHROPIC_API_KEY"] = cfg.anthropic_api_key
+        try:
+            skip_flag = []
+            try:
+                if os.getuid() != 0:
+                    skip_flag = ["--dangerously-skip-permissions"]
+            except AttributeError:
+                skip_flag = ["--dangerously-skip-permissions"]
+            r = subprocess.run(
+                [cfg.claude_code_bin] + skip_flag + ["--print", prompt],
+                capture_output=True, text=True, timeout=240, env=env,
+                cwd=str(workspace),
+            )
+            output = (r.stdout or "").strip()
+            logger.info("Boss ask_logs source=%s rc=%d len=%d", source_arg or "all", r.returncode, len(output))
+            if r.returncode != 0 and not output:
+                raw_err = (r.stderr or "")
+                alert_if_rate_limited(cfg.slack_bot_token, cfg.slack_channel,
+                                      f"ask_logs/{source_arg or 'all'}", raw_err)
+                return json.dumps({"error": f"claude --print failed (rc={r.returncode}): {raw_err[:300]}"})
+            return json.dumps({
+                "source": source_arg or "all",
+                "files_searched": len(log_files),
+                "answer": output[:4000],
+            })
+        except subprocess.TimeoutExpired:
+            return json.dumps({"error": "timed out after 240s"})
+        except Exception as e:
+            return json.dumps({"error": str(e)})
     if name == "restart_project":
         if not is_admin:
             return json.dumps({"error": "Admin access required to restart a project."})

package/templates/log-configs/_example.properties CHANGED Viewed

@@ -13,8 +13,8 @@ SOURCE_TYPE=ssh
 # ── SSH source (SOURCE_TYPE=ssh) ──────────────────────────────────────────────
-# Path to the SSH private key (.pem) used to connect to the remote hosts
-KEY=/home/<user>/.ssh/<key>.pem
+# SSH private key (.pem). Relative path is resolved from the config dir, then ~/.ssh/
+KEY=prod.pem
 # Comma-separated list of hostnames or user@host entries.
 # Hosts without a user@ prefix default to ec2-user@<host>
@@ -38,6 +38,13 @@ GREP_FILTER=WARN|ERROR
 # Drop lines matching this regex (grep -iv)
 GREP_EXCLUDE=SSLTool|CommandValidate|hystrix
+# ── Routing ───────────────────────────────────────────────────────────────────
+# Which repo-config to route errors from this log source to.
+# The filename stem is the default match (e.g. "MyService.properties" → "MyService" repo-config).
+# Set TARGET_REPO to override with the exact repo-config filename stem.
+# TARGET_REPO=MyService
 # ── Cloudflare source (SOURCE_TYPE=cloudflare) ────────────────────────────────
 # Full URL of the Cloudflare Worker log endpoint

package/templates/repo-configs/_example.properties CHANGED Viewed

@@ -19,6 +19,12 @@ BRANCH=main
 # true  → Sentinel pushes directly to BRANCH and triggers CI/CD
 AUTO_PUBLISH=false
+# ── Health check (optional) ───────────────────────────────────────────────────
+# HTTP endpoint returning {"Status": "true"} when healthy.
+# Sentinel polls this after each fix to detect 502/503 before the next log cycle.
+# HEALTH_URL=https://myservice.example.com/health
 # ── CI/CD (optional) ──────────────────────────────────────────────────────────
 # Leave blank if this repo has no deploy pipeline (e.g. shared libraries)

package/templates/sentinel.properties CHANGED Viewed

@@ -18,6 +18,10 @@ REPORT_INTERVAL_HOURS=1
 # Uncomment here only if this project needs a different token.
 # GITHUB_TOKEN=<github-pat>
+# Fix confirmation: hours of silence after a fix marker appears in production logs before
+# the fix is declared confirmed. Increase for services that deploy infrequently.
+# MARKER_CONFIRM_HOURS=24
 # State DB and workspace paths (relative to this project dir)
 STATE_DB=./sentinel.db
 WORKSPACE_DIR=./workspace

package/templates/workspace-sentinel.properties CHANGED Viewed

@@ -64,6 +64,17 @@ UPGRADE_CHECK_HOURS=6
 # Config repo polling: if the project dir is a git repo, pull for config changes every N seconds
 CONFIG_POLL_INTERVAL=60
+# Fix confirmation: hours of silence after a fix marker appears in production logs before
+# the fix is declared confirmed. Increase for services that deploy infrequently.
+MARKER_CONFIRM_HOURS=24
+# Log sync: rsync remote logs to workspace/synced/ for full searchable history
+# Set SYNC_ENABLED=false to disable entirely
+SYNC_ENABLED=true
+SYNC_INTERVAL_SECONDS=300
+SYNC_RETENTION_DAYS=30
+SYNC_MAX_FILE_MB=200
 # Slack Bot (Sentinel Boss) — shared across all projects
 # SLACK_BOT_TOKEN=xoxb-...
 # SLACK_APP_TOKEN=xapp-...