@misterhuydo/sentinel 1.1.4 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.cairn/.hint-lock CHANGED
@@ -1 +1 @@
1
- 2026-03-23T08:08:15.205Z
1
+ 2026-03-23T08:53:23.129Z
@@ -1,6 +1,6 @@
1
1
  {
2
- "message": "Auto-checkpoint at 2026-03-23T08:33:20.221Z",
3
- "checkpoint_at": "2026-03-23T08:33:20.223Z",
2
+ "message": "Auto-checkpoint at 2026-03-23T08:59:04.141Z",
3
+ "checkpoint_at": "2026-03-23T08:59:04.142Z",
4
4
  "active_files": [],
5
5
  "notes": [],
6
6
  "mtime_snapshot": {}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@misterhuydo/sentinel",
3
- "version": "1.1.4",
3
+ "version": "1.1.6",
4
4
  "description": "Sentinel — Autonomous DevOps Agent installer and manager",
5
5
  "bin": {
6
6
  "sentinel": "./bin/sentinel.js"
@@ -7,6 +7,7 @@ Calls never raise — failures are logged and silently dropped.
7
7
 
8
8
  import logging
9
9
  import re
10
+ import time
10
11
 
11
12
  import requests
12
13
 
@@ -28,8 +29,88 @@ def is_rate_limited(text: str) -> bool:
28
29
  return bool(_RATE_LIMIT_RE.search(text))
29
30
 
30
31
 
32
+ # ── Circuit breaker ────────────────────────────────────────────────────────────
33
+ #
34
+ # Prevents alert storms when Claude is persistently rate-limited.
35
+ # Each `source` string gets its own independent circuit:
36
+ # CLOSED → normal; alerts pass through immediately
37
+ # OPEN → suppressed; one re-alert every CIRCUIT_COOLDOWN_SECONDS
38
+ #
39
+ # On recovery (first non-rate-limited output after OPEN):
40
+ # → post "resolved" to Slack, close the circuit
41
+
42
+ CIRCUIT_COOLDOWN_SECONDS = 3600 # 1 h between repeat alerts while open
43
+
44
+ # source → {opened_at, last_alerted_at, count}
45
+ _circuits: dict[str, dict] = {}
46
+
47
+
48
+ def get_circuit_status() -> dict:
49
+ """
50
+ Return a snapshot of all open circuits.
51
+ Used by the `check_auth_status` Boss tool.
52
+
53
+ Returns:
54
+ { source: { state, opened_at, open_for_seconds, alert_count } }
55
+ Only open circuits are included; an empty dict means everything is healthy.
56
+ """
57
+ now = time.time()
58
+ return {
59
+ src: {
60
+ "state": "open",
61
+ "opened_at": c["opened_at"],
62
+ "open_for_seconds": int(now - c["opened_at"]),
63
+ "alert_count": c["count"],
64
+ }
65
+ for src, c in _circuits.items()
66
+ }
67
+
68
+
69
+ def _open_or_repeat(bot_token: str, channel: str, source: str, output: str) -> None:
70
+ """Open circuit on first hit; re-alert after cooldown if still failing."""
71
+ now = time.time()
72
+ circuit = _circuits.get(source)
73
+
74
+ if circuit is None:
75
+ # First occurrence — open and alert immediately
76
+ _circuits[source] = {"opened_at": now, "last_alerted_at": now, "count": 1}
77
+ logger.error("Circuit opened for %s: %s", source, output[:200])
78
+ slack_alert(bot_token, channel, rate_limit_message(source, output))
79
+ return
80
+
81
+ circuit["count"] += 1
82
+ elapsed = now - circuit["last_alerted_at"]
83
+ if elapsed >= CIRCUIT_COOLDOWN_SECONDS:
84
+ # Still failing after cooldown — remind admins once per hour
85
+ circuit["last_alerted_at"] = now
86
+ open_mins = int((now - circuit["opened_at"]) / 60)
87
+ msg = (
88
+ f":warning: *Sentinel — Claude usage/auth problem still active ({source})*\n"
89
+ f"Still failing after {open_mins} minutes. Total occurrences: {circuit['count']}.\n"
90
+ f"Last error:\n```{output.strip()[:300]}```\n"
91
+ f"Run `check_auth_status` in Slack to see the full picture."
92
+ )
93
+ logger.error("Circuit still open for %s (count=%d)", source, circuit["count"])
94
+ slack_alert(bot_token, channel, msg)
95
+ # else: within cooldown window — suppress
96
+
97
+
98
+ def _close_if_open(bot_token: str, channel: str, source: str) -> None:
99
+ """If circuit was open, close it and post a recovery alert."""
100
+ circuit = _circuits.pop(source, None)
101
+ if circuit is None:
102
+ return
103
+ duration_mins = int((time.time() - circuit["opened_at"]) / 60)
104
+ msg = (
105
+ f":white_check_mark: *Sentinel — Claude auth restored ({source})*\n"
106
+ f"Fixed after {duration_mins} min. Total failures during outage: {circuit['count']}."
107
+ )
108
+ logger.info("Circuit closed for %s after %d min, %d failures", source, duration_mins, circuit["count"])
109
+ slack_alert(bot_token, channel, msg)
110
+
111
+
31
112
  def rate_limit_message(source: str, raw: str) -> str:
32
- """Produce a human-readable Slack alert for a rate-limit event."""
113
+ """Produce a human-readable Slack alert for a rate-limit / auth event (first occurrence)."""
33
114
  snippet = raw.strip()[:300].replace("\n", " ")
34
115
  return (
35
116
  f":warning: *Sentinel — Claude usage/auth problem ({source})*\n"
@@ -37,9 +118,10 @@ def rate_limit_message(source: str, raw: str) -> str:
37
118
  f"```{snippet}```\n"
38
119
  f"*What to check:*\n"
39
120
  f"• API key: verify `ANTHROPIC_API_KEY` in `sentinel.properties` is valid and has credit\n"
40
- f"• Claude Pro: run `claude login` on the server to refresh OAuth\n"
41
- f"• Both: at least one auth method must be working\n"
42
- f"Sentinel will retry on the next poll cycle."
121
+ f"• Claude Pro: run `claude login` on the server to refresh the OAuth session\n"
122
+ f"• Both: Sentinel tries both methods — at least one must be working\n"
123
+ f"Repeat alerts will be suppressed for 1 hour. "
124
+ f"Run `check_auth_status` in Slack to see current state."
43
125
  )
44
126
 
45
127
 
@@ -77,12 +159,15 @@ def alert_if_rate_limited(
77
159
  output: str,
78
160
  ) -> bool:
79
161
  """
80
- Check output for rate-limit / auth signals.
81
- If found, post a Slack alert and return True.
162
+ Check output for rate-limit / auth signals and manage the circuit breaker.
163
+
164
+ - Rate limited → open/keep-open circuit, alert (with cooldown suppression)
165
+ - Not limited → close circuit if it was open (recovery alert), return False
166
+
167
+ Returns True if a rate-limit signal was found.
82
168
  """
83
169
  if not is_rate_limited(output):
170
+ _close_if_open(bot_token, channel, source)
84
171
  return False
85
- msg = rate_limit_message(source, output)
86
- logger.error("Claude rate-limit/auth failure in %s: %s", source, output[:200])
87
- slack_alert(bot_token, channel, msg)
172
+ _open_or_repeat(bot_token, channel, source, output)
88
173
  return True
@@ -137,6 +137,7 @@ reply with a short summary grouped by category:
137
137
  *Fix management*
138
138
  • `get_fix_details` — full details of a specific fix — "show fix abc123"
139
139
  • `list_pending_prs` — all open Sentinel PRs awaiting review — "list open PRs"
140
+ • `check_auth_status` — Claude auth health, rate-limit circuit state, fix engine 24 h stats — "is Claude working?", "any rate limits?", "auth issues?"
140
141
 
141
142
  *Project & task delivery*
142
143
  • `list_projects` — all projects and repos Sentinel manages — "what projects do you manage?"
@@ -275,6 +276,16 @@ _TOOLS = [
275
276
  "description": "List all open Sentinel PRs awaiting admin review.",
276
277
  "input_schema": {"type": "object", "properties": {}},
277
278
  },
279
+ {
280
+ "name": "check_auth_status",
281
+ "description": (
282
+ "Check Claude authentication health, current rate-limit / usage-limit circuit state, "
283
+ "and fix engine stats for the last 24 h. "
284
+ "Use when someone asks: 'is Claude working?', 'any rate limits?', 'why aren't fixes running?', "
285
+ "'is the API key OK?', 'auth issues?', 'fix engine status'."
286
+ ),
287
+ "input_schema": {"type": "object", "properties": {}},
288
+ },
278
289
  {
279
290
  "name": "pause_sentinel",
280
291
  "description": (
@@ -798,6 +809,67 @@ async def _run_tool(name: str, inputs: dict, cfg_loader, store, slack_client=Non
798
809
  "sentinel_paused": Path("SENTINEL_PAUSE").exists(),
799
810
  })
800
811
 
812
+ if name == "check_auth_status":
813
+ import subprocess as _sp
814
+ from .notify import get_circuit_status
815
+ cfg = cfg_loader.sentinel
816
+
817
+ # Auth configuration
818
+ has_key = bool(cfg.anthropic_api_key)
819
+ pro_for_tasks = cfg.claude_pro_for_tasks
820
+ if pro_for_tasks and has_key:
821
+ primary, fallback = "claude_pro_oauth", "api_key"
822
+ elif pro_for_tasks:
823
+ primary, fallback = "claude_pro_oauth", None
824
+ else:
825
+ primary, fallback = "api_key", "claude_pro_oauth" if not has_key else "claude_pro_oauth"
826
+
827
+ # Claude CLI liveness check
828
+ cli_ok, cli_version = False, ""
829
+ try:
830
+ r = _sp.run(
831
+ [cfg.claude_code_bin, "--version"],
832
+ capture_output=True, text=True, timeout=10,
833
+ )
834
+ if r.returncode == 0:
835
+ cli_ok = True
836
+ cli_version = r.stdout.strip() or r.stderr.strip()
837
+ except Exception:
838
+ pass
839
+
840
+ # Circuit breaker snapshot — only open (unhealthy) circuits appear here
841
+ circuits = get_circuit_status()
842
+
843
+ # Fix engine stats (last 24 h)
844
+ recent = store.get_recent_fixes(hours=24)
845
+ counts = {"applied": 0, "failed": 0, "skipped": 0, "pending": 0}
846
+ last_success = None
847
+ for f in recent:
848
+ s = f.get("status", "")
849
+ if s in counts:
850
+ counts[s] += 1
851
+ if s == "applied" and not last_success:
852
+ last_success = f.get("timestamp", "")
853
+
854
+ overall = "healthy"
855
+ if circuits:
856
+ overall = "degraded — rate/auth limit active on: " + ", ".join(circuits)
857
+ elif not cli_ok:
858
+ overall = "warning — claude CLI not reachable"
859
+
860
+ return json.dumps({
861
+ "overall": overall,
862
+ "auth": {
863
+ "api_key_configured": has_key,
864
+ "claude_pro_for_tasks": pro_for_tasks,
865
+ "primary_method": primary,
866
+ "fallback_method": fallback,
867
+ },
868
+ "claude_cli": {"available": cli_ok, "version": cli_version},
869
+ "rate_limit_circuits": circuits,
870
+ "fix_engine_24h": {**counts, "last_successful_fix": last_success},
871
+ })
872
+
801
873
  if name == "create_issue":
802
874
  description = inputs["description"]
803
875
  target_repo = inputs.get("target_repo", "")
@@ -1938,7 +2010,7 @@ async def handle_message(
1938
2010
  from .notify import rate_limit_message
1939
2011
  alert_if_rate_limited(cfg.slack_bot_token, cfg.slack_channel,
1940
2012
  "sentinel_boss/api", err_str)
1941
- logger.warning("Boss: API key path failed (%s), trying CLI fallback", err_str[:80])
2013
+ logger.warning("Boss: API key path failed (%s), trying CLI fallback", err_str)
1942
2014
 
1943
2015
  # 2nd priority: Claude Pro / OAuth via CLI (limited tools but no API key needed)
1944
2016
  cli_reply, cli_done = await _handle_with_cli(