@misterhuydo/sentinel 1.1.3 → 1.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.cairn/.hint-lock CHANGED
@@ -1 +1 @@
1
- 2026-03-23T08:08:15.205Z
1
+ 2026-03-23T08:53:23.129Z
@@ -1,6 +1,6 @@
1
1
  {
2
- "message": "Auto-checkpoint at 2026-03-23T08:30:55.526Z",
3
- "checkpoint_at": "2026-03-23T08:30:55.527Z",
2
+ "message": "Auto-checkpoint at 2026-03-23T08:53:48.152Z",
3
+ "checkpoint_at": "2026-03-23T08:53:48.153Z",
4
4
  "active_files": [],
5
5
  "notes": [],
6
6
  "mtime_snapshot": {}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@misterhuydo/sentinel",
3
- "version": "1.1.3",
3
+ "version": "1.1.5",
4
4
  "description": "Sentinel — Autonomous DevOps Agent installer and manager",
5
5
  "bin": {
6
6
  "sentinel": "./bin/sentinel.js"
@@ -99,42 +99,109 @@ def _validate_patch(patch: str) -> tuple[bool, str]:
99
99
  return True, ""
100
100
 
101
101
 
102
# Lower-case substrings whose presence in Claude CLI output is treated as an
# authentication failure by _is_auth_error().
_AUTH_ERROR_HINTS = (
    "not logged in", "please run claude login", "authentication failed",
    "api key is not set", "invalid x-api-key", "unauthorized", "please authenticate",
    "unauthenticated", "auth_required", "no auth", "login required",
)


def _is_auth_error(output: str) -> bool:
    """Return True when *output* contains any known authentication-failure hint.

    Matching is a case-insensitive substring test against ``_AUTH_ERROR_HINTS``.
    Empty or ``None`` output is reported as "not an auth error" instead of
    raising ``AttributeError``.
    """
    if not output:
        return False
    # NOTE(review): plain substring matching means generic hints such as
    # "unauthorized" or "no auth" also fire on ordinary text that merely
    # contains them (e.g. "no author") — confirm that is acceptable.
    lowered = output.lower()
    return any(hint in lowered for hint in _AUTH_ERROR_HINTS)
112
+
113
+
114
def _claude_cmd(bin_path: str, prompt: str) -> list[str]:
    """Build the argv list used to invoke the Claude CLI in ``--print`` mode.

    ``--dangerously-skip-permissions`` is included unless the process runs
    with uid 0. On platforms without ``os.getuid`` (Windows) the flag is
    always included.
    """
    import os as _os

    getuid = getattr(_os, "getuid", None)
    # A missing getuid (Windows) counts as "not root", so the flag is passed.
    running_as_root = getuid is not None and getuid() == 0

    argv = [bin_path]
    if not running_as_root:
        argv.append("--dangerously-skip-permissions")
    argv.extend(["--print", prompt])
    return argv
123
+
124
+
125
def _run_claude_attempt(bin_path: str, prompt: str, env: dict) -> tuple[str, bool]:
    """Invoke the Claude CLI once with the supplied environment.

    Returns:
        ``(output, timed_out)`` — *output* is stdout followed by stderr.
        On timeout the pair is ``("", True)``.

    Raises:
        FileNotFoundError: if the binary is missing (propagated from
            ``subprocess.run``).
    """
    argv = _claude_cmd(bin_path, prompt)
    try:
        completed = subprocess.run(
            argv,
            capture_output=True,
            text=True,
            timeout=SUBPROCESS_TIMEOUT,
            env=env,
        )
    except subprocess.TimeoutExpired:
        return "", True

    # Either stream may be empty/None; normalise before concatenating.
    stdout = completed.stdout or ""
    stderr = completed.stderr or ""
    return stdout + stderr, False
138
+
139
+
102
140
  def generate_fix(
103
141
  event: ErrorEvent,
104
142
  repo: RepoConfig,
105
143
  cfg: SentinelConfig,
106
144
  patches_dir: Path,
107
- ) -> tuple[str, Path | None]:
145
+ ) -> tuple[str, Path | None, str]:
108
146
  """
109
147
  Generate a fix for the given error event.
110
148
 
111
149
  Returns:
112
- (status, patch_path)
150
+ (status, patch_path, marker)
113
151
  status: "patch" | "skip" | "error"
152
+
153
+ Auth strategy — API key and Claude Pro (OAuth) are interchangeable:
154
+ Primary : Claude Pro (OAuth) if claude_pro_for_tasks=True, else API key
155
+ Fallback : the other method, if primary fails with an auth error
156
+ On total auth failure: notify Slack admins + email report recipients
114
157
  """
115
- # Issues have source like "issues/filename" — no rolling log file exists
158
+ import os as _os
159
+
160
+ marker = f"sentinel-{event.fingerprint[:8]}"
116
161
  log_file = Path(cfg.workspace_dir) / "fetched" / f"{event.source}.log"
117
162
  if not log_file.exists():
118
163
  log_file = None
119
- prompt = _build_prompt(event, repo, log_file)
164
+ prompt = _build_prompt(event, repo, log_file, marker)
120
165
 
121
166
  logger.info("Invoking Claude Code for %s (fp=%s)", event.source, event.fingerprint)
122
- import os as _os
123
- env = _os.environ.copy()
124
- # Inject API key only when Claude Pro is NOT preferred for tasks
125
- # (when claude_pro_for_tasks=True and API key is set, let claude CLI use OAuth/Pro)
126
- if cfg.anthropic_api_key and not cfg.claude_pro_for_tasks:
127
- env["ANTHROPIC_API_KEY"] = cfg.anthropic_api_key
167
+
168
+ base_env = _os.environ.copy()
169
+ api_env = {**base_env, "ANTHROPIC_API_KEY": cfg.anthropic_api_key} if cfg.anthropic_api_key else None
170
+ oauth_env = base_env # relies on cached `claude login` session no key injected
171
+
172
+ # Choose primary/fallback order based on config
173
+ if cfg.claude_pro_for_tasks and cfg.anthropic_api_key:
174
+ attempts = [("Claude Pro (OAuth)", oauth_env), ("API key", api_env)]
175
+ elif cfg.claude_pro_for_tasks:
176
+ attempts = [("Claude Pro (OAuth)", oauth_env)]
177
+ elif cfg.anthropic_api_key:
178
+ attempts = [("API key", api_env), ("Claude Pro (OAuth)", oauth_env)]
179
+ else:
180
+ attempts = [("Claude Pro (OAuth)", oauth_env)]
181
+
182
+ output = ""
128
183
  try:
129
- result = subprocess.run(
130
- ([cfg.claude_code_bin, "--dangerously-skip-permissions", "--print", prompt]
131
- if os.getuid() != 0 else
132
- [cfg.claude_code_bin, "--print", prompt]),
133
- capture_output=True, text=True, timeout=SUBPROCESS_TIMEOUT, env=env,
134
- )
135
- except subprocess.TimeoutExpired:
136
- logger.error("Claude Code timed out for %s", event.fingerprint)
137
- return "error", None, ""
184
+ for label, env in attempts:
185
+ if env is None:
186
+ continue
187
+ logger.info("fix_engine: trying %s for %s", label, event.fingerprint)
188
+ output, timed_out = _run_claude_attempt(cfg.claude_code_bin, prompt, env)
189
+ if timed_out:
190
+ logger.error("Claude Code timed out for %s", event.fingerprint)
191
+ return "error", None, ""
192
+ if not _is_auth_error(output):
193
+ break
194
+ logger.warning("fix_engine: %s auth error for %s — trying next method", label, event.fingerprint)
195
+ else:
196
+ # All attempts failed with auth errors
197
+ msg = (
198
+ ":warning: *Sentinel — Fix Engine auth failure*\n"
199
+ f"Both API key and Claude Pro (OAuth) failed authentication for `{event.fingerprint}`.\n"
200
+ "• Check that `ANTHROPIC_API_KEY` is valid, or run `claude login` to refresh the OAuth session."
201
+ )
202
+ logger.error("fix_engine: all auth methods failed for %s", event.fingerprint)
203
+ slack_alert(cfg.slack_bot_token, cfg.slack_channel, msg)
204
+ return "error", None, ""
138
205
  except FileNotFoundError:
139
206
  msg = (
140
207
  f":warning: *Sentinel — Claude CLI not found*\n"
@@ -145,9 +212,7 @@ def generate_fix(
145
212
  slack_alert(cfg.slack_bot_token, cfg.slack_channel, msg)
146
213
  return "error", None, ""
147
214
 
148
- output = (result.stdout or "") + (result.stderr or "")
149
-
150
- # Alert Slack immediately on rate-limit / auth failure — never stay silent
215
+ # Alert Slack immediately on rate-limit — never stay silent
151
216
  alert_if_rate_limited(
152
217
  cfg.slack_bot_token,
153
218
  cfg.slack_channel,
@@ -549,13 +549,14 @@ def _log_auth_status(cfg: SentinelConfig) -> None:
549
549
 
550
550
  if has_api_key and pro_for_tasks:
551
551
  logger.info(
552
- "Claude auth: API key ✓ (Boss) + Claude Pro preferred for Fix Engine/Ask Codebase. "
552
+ "Claude auth: API key ✓ + Claude Pro (OAuth) "
553
+ "Fix Engine will try Claude Pro first, falls back to API key on auth error. "
553
554
  "Run `claude login` if not already authenticated."
554
555
  )
555
556
  elif has_api_key and not pro_for_tasks:
556
557
  logger.info(
557
- "Claude auth: API key ✓ (Boss + Fix Engine). "
558
- "CLAUDE_PRO_FOR_TASKS=false all tasks billed to API quota."
558
+ "Claude auth: API key ✓ Boss + Fix Engine use API key. "
559
+ "CLAUDE_PRO_FOR_TASKS=false; falls back to Claude Pro (OAuth) if key auth fails."
559
560
  )
560
561
  elif not has_api_key and has_claude_bin:
561
562
  logger.warning(
@@ -7,6 +7,7 @@ Calls never raise — failures are logged and silently dropped.
7
7
 
8
8
  import logging
9
9
  import re
10
+ import time
10
11
 
11
12
  import requests
12
13
 
@@ -28,8 +29,88 @@ def is_rate_limited(text: str) -> bool:
28
29
  return bool(_RATE_LIMIT_RE.search(text))
29
30
 
30
31
 
32
+ # ── Circuit breaker ────────────────────────────────────────────────────────────
33
+ #
34
+ # Prevents alert storms when Claude is persistently rate-limited.
35
+ # Each `source` string gets its own independent circuit:
36
+ # CLOSED → normal; alerts pass through immediately
37
+ # OPEN → suppressed; one re-alert every CIRCUIT_COOLDOWN_SECONDS
38
+ #
39
+ # On recovery (first non-rate-limited output after OPEN):
40
+ # → post "resolved" to Slack, close the circuit
41
+
42
CIRCUIT_COOLDOWN_SECONDS = 3600  # 1 h between repeat alerts while a circuit stays open

# Open circuits keyed by source; each value holds opened_at, last_alerted_at, count.
_circuits: dict[str, dict] = {}


def get_circuit_status() -> dict:
    """Snapshot every currently open circuit.

    Used by the `check_auth_status` Boss tool.

    Returns:
        A mapping ``{source: {state, opened_at, open_for_seconds, alert_count}}``.
        Only open circuits are tracked, so an empty dict means everything
        is healthy.
    """
    taken_at = time.time()
    snapshot = {}
    for source, entry in _circuits.items():
        opened_at = entry["opened_at"]
        snapshot[source] = {
            "state": "open",
            "opened_at": opened_at,
            "open_for_seconds": int(taken_at - opened_at),
            "alert_count": entry["count"],
        }
    return snapshot
67
+
68
+
69
def _open_or_repeat(bot_token: str, channel: str, source: str, output: str) -> None:
    """Record a rate-limit hit for *source* and alert Slack when appropriate.

    The first hit opens the circuit and alerts immediately. Later hits only
    bump the counter, and re-alert once ``CIRCUIT_COOLDOWN_SECONDS`` have
    passed since the previous alert.
    """
    now = time.time()
    state = _circuits.get(source)

    if state is None:
        # No circuit yet: open one and send the first-occurrence alert.
        _circuits[source] = {"opened_at": now, "last_alerted_at": now, "count": 1}
        logger.error("Circuit opened for %s: %s", source, output[:200])
        slack_alert(bot_token, channel, rate_limit_message(source, output))
        return

    state["count"] += 1
    if now - state["last_alerted_at"] < CIRCUIT_COOLDOWN_SECONDS:
        # Still inside the cooldown window — stay quiet.
        return

    # Cooldown elapsed and the problem persists: send a reminder.
    state["last_alerted_at"] = now
    minutes_open = int((now - state["opened_at"]) / 60)
    reminder = (
        f":warning: *Sentinel — Claude usage/auth problem still active ({source})*\n"
        f"Still failing after {minutes_open} minutes. Total occurrences: {state['count']}.\n"
        f"Last error:\n```{output.strip()[:300]}```\n"
        f"Run `check_auth_status` in Slack to see the full picture."
    )
    logger.error("Circuit still open for %s (count=%d)", source, state["count"])
    slack_alert(bot_token, channel, reminder)
96
+
97
+
98
def _close_if_open(bot_token: str, channel: str, source: str) -> None:
    """Close the circuit for *source*, if one is open, and announce recovery.

    Does nothing when no circuit is recorded for *source*.
    """
    closed = _circuits.pop(source, None)
    if closed is None:
        return

    failures = closed["count"]
    outage_mins = int((time.time() - closed["opened_at"]) / 60)
    logger.info("Circuit closed for %s after %d min, %d failures", source, outage_mins, failures)
    slack_alert(
        bot_token,
        channel,
        f":white_check_mark: *Sentinel — Claude auth restored ({source})*\n"
        f"Fixed after {outage_mins} min. Total failures during outage: {failures}.",
    )
110
+
111
+
31
112
  def rate_limit_message(source: str, raw: str) -> str:
32
- """Produce a human-readable Slack alert for a rate-limit event."""
113
+ """Produce a human-readable Slack alert for a rate-limit / auth event (first occurrence)."""
33
114
  snippet = raw.strip()[:300].replace("\n", " ")
34
115
  return (
35
116
  f":warning: *Sentinel — Claude usage/auth problem ({source})*\n"
@@ -37,9 +118,10 @@ def rate_limit_message(source: str, raw: str) -> str:
37
118
  f"```{snippet}```\n"
38
119
  f"*What to check:*\n"
39
120
  f"• API key: verify `ANTHROPIC_API_KEY` in `sentinel.properties` is valid and has credit\n"
40
- f"• Claude Pro: run `claude login` on the server to refresh OAuth\n"
41
- f"• Both: at least one auth method must be working\n"
42
- f"Sentinel will retry on the next poll cycle."
121
+ f"• Claude Pro: run `claude login` on the server to refresh the OAuth session\n"
122
+ f"• Both: Sentinel tries both methods — at least one must be working\n"
123
+ f"Repeat alerts will be suppressed for 1 hour. "
124
+ f"Run `check_auth_status` in Slack to see current state."
43
125
  )
44
126
 
45
127
 
@@ -77,12 +159,15 @@ def alert_if_rate_limited(
77
159
  output: str,
78
160
  ) -> bool:
79
161
  """
80
- Check output for rate-limit / auth signals.
81
- If found, post a Slack alert and return True.
162
+ Check output for rate-limit / auth signals and manage the circuit breaker.
163
+
164
+ - Rate limited → open/keep-open circuit, alert (with cooldown suppression)
165
+ - Not limited → close circuit if it was open (recovery alert), return False
166
+
167
+ Returns True if a rate-limit signal was found.
82
168
  """
83
169
  if not is_rate_limited(output):
170
+ _close_if_open(bot_token, channel, source)
84
171
  return False
85
- msg = rate_limit_message(source, output)
86
- logger.error("Claude rate-limit/auth failure in %s: %s", source, output[:200])
87
- slack_alert(bot_token, channel, msg)
172
+ _open_or_repeat(bot_token, channel, source, output)
88
173
  return True
@@ -137,6 +137,7 @@ reply with a short summary grouped by category:
137
137
  *Fix management*
138
138
  • `get_fix_details` — full details of a specific fix — "show fix abc123"
139
139
  • `list_pending_prs` — all open Sentinel PRs awaiting review — "list open PRs"
140
+ • `check_auth_status` — Claude auth health, rate-limit circuit state, fix engine 24 h stats — "is Claude working?", "any rate limits?", "auth issues?"
140
141
 
141
142
  *Project & task delivery*
142
143
  • `list_projects` — all projects and repos Sentinel manages — "what projects do you manage?"
@@ -275,6 +276,16 @@ _TOOLS = [
275
276
  "description": "List all open Sentinel PRs awaiting admin review.",
276
277
  "input_schema": {"type": "object", "properties": {}},
277
278
  },
279
+ {
280
+ "name": "check_auth_status",
281
+ "description": (
282
+ "Check Claude authentication health, current rate-limit / usage-limit circuit state, "
283
+ "and fix engine stats for the last 24 h. "
284
+ "Use when someone asks: 'is Claude working?', 'any rate limits?', 'why aren't fixes running?', "
285
+ "'is the API key OK?', 'auth issues?', 'fix engine status'."
286
+ ),
287
+ "input_schema": {"type": "object", "properties": {}},
288
+ },
278
289
  {
279
290
  "name": "pause_sentinel",
280
291
  "description": (
@@ -798,6 +809,67 @@ async def _run_tool(name: str, inputs: dict, cfg_loader, store, slack_client=Non
798
809
  "sentinel_paused": Path("SENTINEL_PAUSE").exists(),
799
810
  })
800
811
 
812
+ if name == "check_auth_status":
813
+ import subprocess as _sp
814
+ from .notify import get_circuit_status
815
+ cfg = cfg_loader.sentinel
816
+
817
+ # Auth configuration
818
+ has_key = bool(cfg.anthropic_api_key)
819
+ pro_for_tasks = cfg.claude_pro_for_tasks
820
+ if pro_for_tasks and has_key:
821
+ primary, fallback = "claude_pro_oauth", "api_key"
822
+ elif pro_for_tasks:
823
+ primary, fallback = "claude_pro_oauth", None
824
+ else:
825
+ primary, fallback = "api_key", "claude_pro_oauth" if not has_key else "claude_pro_oauth"
826
+
827
+ # Claude CLI liveness check
828
+ cli_ok, cli_version = False, ""
829
+ try:
830
+ r = _sp.run(
831
+ [cfg.claude_code_bin, "--version"],
832
+ capture_output=True, text=True, timeout=10,
833
+ )
834
+ if r.returncode == 0:
835
+ cli_ok = True
836
+ cli_version = r.stdout.strip() or r.stderr.strip()
837
+ except Exception:
838
+ pass
839
+
840
+ # Circuit breaker snapshot — only open (unhealthy) circuits appear here
841
+ circuits = get_circuit_status()
842
+
843
+ # Fix engine stats (last 24 h)
844
+ recent = store.get_recent_fixes(hours=24)
845
+ counts = {"applied": 0, "failed": 0, "skipped": 0, "pending": 0}
846
+ last_success = None
847
+ for f in recent:
848
+ s = f.get("status", "")
849
+ if s in counts:
850
+ counts[s] += 1
851
+ if s == "applied" and not last_success:
852
+ last_success = f.get("timestamp", "")
853
+
854
+ overall = "healthy"
855
+ if circuits:
856
+ overall = "degraded — rate/auth limit active on: " + ", ".join(circuits)
857
+ elif not cli_ok:
858
+ overall = "warning — claude CLI not reachable"
859
+
860
+ return json.dumps({
861
+ "overall": overall,
862
+ "auth": {
863
+ "api_key_configured": has_key,
864
+ "claude_pro_for_tasks": pro_for_tasks,
865
+ "primary_method": primary,
866
+ "fallback_method": fallback,
867
+ },
868
+ "claude_cli": {"available": cli_ok, "version": cli_version},
869
+ "rate_limit_circuits": circuits,
870
+ "fix_engine_24h": {**counts, "last_successful_fix": last_success},
871
+ })
872
+
801
873
  if name == "create_issue":
802
874
  description = inputs["description"]
803
875
  target_repo = inputs.get("target_repo", "")