@misterhuydo/sentinel 1.1.3 → 1.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.cairn/.hint-lock
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
2026-03-23T08:
|
|
1
|
+
2026-03-23T08:53:23.129Z
|
package/.cairn/session.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
|
-
"message": "Auto-checkpoint at 2026-03-23T08:
|
|
3
|
-
"checkpoint_at": "2026-03-23T08:
|
|
2
|
+
"message": "Auto-checkpoint at 2026-03-23T08:53:48.152Z",
|
|
3
|
+
"checkpoint_at": "2026-03-23T08:53:48.153Z",
|
|
4
4
|
"active_files": [],
|
|
5
5
|
"notes": [],
|
|
6
6
|
"mtime_snapshot": {}
|
package/package.json
CHANGED
|
@@ -99,42 +99,109 @@ def _validate_patch(patch: str) -> tuple[bool, str]:
|
|
|
99
99
|
return True, ""
|
|
100
100
|
|
|
101
101
|
|
|
102
|
+
_AUTH_ERROR_HINTS = (
|
|
103
|
+
"not logged in", "please run claude login", "authentication failed",
|
|
104
|
+
"api key is not set", "invalid x-api-key", "unauthorized", "please authenticate",
|
|
105
|
+
"unauthenticated", "auth_required", "no auth", "login required",
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _is_auth_error(output: str) -> bool:
|
|
110
|
+
low = output.lower()
|
|
111
|
+
return any(hint in low for hint in _AUTH_ERROR_HINTS)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _claude_cmd(bin_path: str, prompt: str) -> list[str]:
|
|
115
|
+
import os as _os
|
|
116
|
+
try:
|
|
117
|
+
skip = _os.getuid() != 0
|
|
118
|
+
except AttributeError:
|
|
119
|
+
skip = True # Windows — always pass flag
|
|
120
|
+
if skip:
|
|
121
|
+
return [bin_path, "--dangerously-skip-permissions", "--print", prompt]
|
|
122
|
+
return [bin_path, "--print", prompt]
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _run_claude_attempt(bin_path: str, prompt: str, env: dict) -> tuple[str, bool]:
|
|
126
|
+
"""
|
|
127
|
+
Run claude CLI with the given env. Returns (output, timed_out).
|
|
128
|
+
Raises FileNotFoundError if binary is missing.
|
|
129
|
+
"""
|
|
130
|
+
try:
|
|
131
|
+
result = subprocess.run(
|
|
132
|
+
_claude_cmd(bin_path, prompt),
|
|
133
|
+
capture_output=True, text=True, timeout=SUBPROCESS_TIMEOUT, env=env,
|
|
134
|
+
)
|
|
135
|
+
return (result.stdout or "") + (result.stderr or ""), False
|
|
136
|
+
except subprocess.TimeoutExpired:
|
|
137
|
+
return "", True
|
|
138
|
+
|
|
139
|
+
|
|
102
140
|
def generate_fix(
|
|
103
141
|
event: ErrorEvent,
|
|
104
142
|
repo: RepoConfig,
|
|
105
143
|
cfg: SentinelConfig,
|
|
106
144
|
patches_dir: Path,
|
|
107
|
-
) -> tuple[str, Path | None]:
|
|
145
|
+
) -> tuple[str, Path | None, str]:
|
|
108
146
|
"""
|
|
109
147
|
Generate a fix for the given error event.
|
|
110
148
|
|
|
111
149
|
Returns:
|
|
112
|
-
(status, patch_path)
|
|
150
|
+
(status, patch_path, marker)
|
|
113
151
|
status: "patch" | "skip" | "error"
|
|
152
|
+
|
|
153
|
+
Auth strategy — API key and Claude Pro (OAuth) are interchangeable:
|
|
154
|
+
Primary : Claude Pro (OAuth) if claude_pro_for_tasks=True, else API key
|
|
155
|
+
Fallback : the other method, if primary fails with an auth error
|
|
156
|
+
On total auth failure: notify Slack admins + email report recipients
|
|
114
157
|
"""
|
|
115
|
-
|
|
158
|
+
import os as _os
|
|
159
|
+
|
|
160
|
+
marker = f"sentinel-{event.fingerprint[:8]}"
|
|
116
161
|
log_file = Path(cfg.workspace_dir) / "fetched" / f"{event.source}.log"
|
|
117
162
|
if not log_file.exists():
|
|
118
163
|
log_file = None
|
|
119
|
-
prompt = _build_prompt(event, repo, log_file)
|
|
164
|
+
prompt = _build_prompt(event, repo, log_file, marker)
|
|
120
165
|
|
|
121
166
|
logger.info("Invoking Claude Code for %s (fp=%s)", event.source, event.fingerprint)
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
167
|
+
|
|
168
|
+
base_env = _os.environ.copy()
|
|
169
|
+
api_env = {**base_env, "ANTHROPIC_API_KEY": cfg.anthropic_api_key} if cfg.anthropic_api_key else None
|
|
170
|
+
oauth_env = base_env # relies on cached `claude login` session — no key injected
|
|
171
|
+
|
|
172
|
+
# Choose primary/fallback order based on config
|
|
173
|
+
if cfg.claude_pro_for_tasks and cfg.anthropic_api_key:
|
|
174
|
+
attempts = [("Claude Pro (OAuth)", oauth_env), ("API key", api_env)]
|
|
175
|
+
elif cfg.claude_pro_for_tasks:
|
|
176
|
+
attempts = [("Claude Pro (OAuth)", oauth_env)]
|
|
177
|
+
elif cfg.anthropic_api_key:
|
|
178
|
+
attempts = [("API key", api_env), ("Claude Pro (OAuth)", oauth_env)]
|
|
179
|
+
else:
|
|
180
|
+
attempts = [("Claude Pro (OAuth)", oauth_env)]
|
|
181
|
+
|
|
182
|
+
output = ""
|
|
128
183
|
try:
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
184
|
+
for label, env in attempts:
|
|
185
|
+
if env is None:
|
|
186
|
+
continue
|
|
187
|
+
logger.info("fix_engine: trying %s for %s", label, event.fingerprint)
|
|
188
|
+
output, timed_out = _run_claude_attempt(cfg.claude_code_bin, prompt, env)
|
|
189
|
+
if timed_out:
|
|
190
|
+
logger.error("Claude Code timed out for %s", event.fingerprint)
|
|
191
|
+
return "error", None, ""
|
|
192
|
+
if not _is_auth_error(output):
|
|
193
|
+
break
|
|
194
|
+
logger.warning("fix_engine: %s auth error for %s — trying next method", label, event.fingerprint)
|
|
195
|
+
else:
|
|
196
|
+
# All attempts failed with auth errors
|
|
197
|
+
msg = (
|
|
198
|
+
":warning: *Sentinel — Fix Engine auth failure*\n"
|
|
199
|
+
f"Both API key and Claude Pro (OAuth) failed authentication for `{event.fingerprint}`.\n"
|
|
200
|
+
"• Check that `ANTHROPIC_API_KEY` is valid, or run `claude login` to refresh the OAuth session."
|
|
201
|
+
)
|
|
202
|
+
logger.error("fix_engine: all auth methods failed for %s", event.fingerprint)
|
|
203
|
+
slack_alert(cfg.slack_bot_token, cfg.slack_channel, msg)
|
|
204
|
+
return "error", None, ""
|
|
138
205
|
except FileNotFoundError:
|
|
139
206
|
msg = (
|
|
140
207
|
f":warning: *Sentinel — Claude CLI not found*\n"
|
|
@@ -145,9 +212,7 @@ def generate_fix(
|
|
|
145
212
|
slack_alert(cfg.slack_bot_token, cfg.slack_channel, msg)
|
|
146
213
|
return "error", None, ""
|
|
147
214
|
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
# Alert Slack immediately on rate-limit / auth failure — never stay silent
|
|
215
|
+
# Alert Slack immediately on rate-limit — never stay silent
|
|
151
216
|
alert_if_rate_limited(
|
|
152
217
|
cfg.slack_bot_token,
|
|
153
218
|
cfg.slack_channel,
|
package/python/sentinel/main.py
CHANGED
|
@@ -549,13 +549,14 @@ def _log_auth_status(cfg: SentinelConfig) -> None:
|
|
|
549
549
|
|
|
550
550
|
if has_api_key and pro_for_tasks:
|
|
551
551
|
logger.info(
|
|
552
|
-
"Claude auth: API key ✓
|
|
552
|
+
"Claude auth: API key ✓ + Claude Pro (OAuth) ✓ — "
|
|
553
|
+
"Fix Engine will try Claude Pro first, falls back to API key on auth error. "
|
|
553
554
|
"Run `claude login` if not already authenticated."
|
|
554
555
|
)
|
|
555
556
|
elif has_api_key and not pro_for_tasks:
|
|
556
557
|
logger.info(
|
|
557
|
-
"Claude auth: API key ✓
|
|
558
|
-
"CLAUDE_PRO_FOR_TASKS=false
|
|
558
|
+
"Claude auth: API key ✓ — Boss + Fix Engine use API key. "
|
|
559
|
+
"CLAUDE_PRO_FOR_TASKS=false; falls back to Claude Pro (OAuth) if key auth fails."
|
|
559
560
|
)
|
|
560
561
|
elif not has_api_key and has_claude_bin:
|
|
561
562
|
logger.warning(
|
|
@@ -7,6 +7,7 @@ Calls never raise — failures are logged and silently dropped.
|
|
|
7
7
|
|
|
8
8
|
import logging
|
|
9
9
|
import re
|
|
10
|
+
import time
|
|
10
11
|
|
|
11
12
|
import requests
|
|
12
13
|
|
|
@@ -28,8 +29,88 @@ def is_rate_limited(text: str) -> bool:
|
|
|
28
29
|
return bool(_RATE_LIMIT_RE.search(text))
|
|
29
30
|
|
|
30
31
|
|
|
32
|
+
# ── Circuit breaker ────────────────────────────────────────────────────────────
|
|
33
|
+
#
|
|
34
|
+
# Prevents alert storms when Claude is persistently rate-limited.
|
|
35
|
+
# Each `source` string gets its own independent circuit:
|
|
36
|
+
# CLOSED → normal; alerts pass through immediately
|
|
37
|
+
# OPEN → suppressed; one re-alert every CIRCUIT_COOLDOWN_SECONDS
|
|
38
|
+
#
|
|
39
|
+
# On recovery (first non-rate-limited output after OPEN):
|
|
40
|
+
# → post "resolved" to Slack, close the circuit
|
|
41
|
+
|
|
42
|
+
CIRCUIT_COOLDOWN_SECONDS = 3600 # 1 h between repeat alerts while open
|
|
43
|
+
|
|
44
|
+
# source → {opened_at, last_alerted_at, count}
|
|
45
|
+
_circuits: dict[str, dict] = {}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def get_circuit_status() -> dict:
|
|
49
|
+
"""
|
|
50
|
+
Return a snapshot of all open circuits.
|
|
51
|
+
Used by the `check_auth_status` Boss tool.
|
|
52
|
+
|
|
53
|
+
Returns:
|
|
54
|
+
{ source: { state, opened_at, open_for_seconds, alert_count } }
|
|
55
|
+
Only open circuits are included; an empty dict means everything is healthy.
|
|
56
|
+
"""
|
|
57
|
+
now = time.time()
|
|
58
|
+
return {
|
|
59
|
+
src: {
|
|
60
|
+
"state": "open",
|
|
61
|
+
"opened_at": c["opened_at"],
|
|
62
|
+
"open_for_seconds": int(now - c["opened_at"]),
|
|
63
|
+
"alert_count": c["count"],
|
|
64
|
+
}
|
|
65
|
+
for src, c in _circuits.items()
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _open_or_repeat(bot_token: str, channel: str, source: str, output: str) -> None:
|
|
70
|
+
"""Open circuit on first hit; re-alert after cooldown if still failing."""
|
|
71
|
+
now = time.time()
|
|
72
|
+
circuit = _circuits.get(source)
|
|
73
|
+
|
|
74
|
+
if circuit is None:
|
|
75
|
+
# First occurrence — open and alert immediately
|
|
76
|
+
_circuits[source] = {"opened_at": now, "last_alerted_at": now, "count": 1}
|
|
77
|
+
logger.error("Circuit opened for %s: %s", source, output[:200])
|
|
78
|
+
slack_alert(bot_token, channel, rate_limit_message(source, output))
|
|
79
|
+
return
|
|
80
|
+
|
|
81
|
+
circuit["count"] += 1
|
|
82
|
+
elapsed = now - circuit["last_alerted_at"]
|
|
83
|
+
if elapsed >= CIRCUIT_COOLDOWN_SECONDS:
|
|
84
|
+
# Still failing after cooldown — remind admins once per hour
|
|
85
|
+
circuit["last_alerted_at"] = now
|
|
86
|
+
open_mins = int((now - circuit["opened_at"]) / 60)
|
|
87
|
+
msg = (
|
|
88
|
+
f":warning: *Sentinel — Claude usage/auth problem still active ({source})*\n"
|
|
89
|
+
f"Still failing after {open_mins} minutes. Total occurrences: {circuit['count']}.\n"
|
|
90
|
+
f"Last error:\n```{output.strip()[:300]}```\n"
|
|
91
|
+
f"Run `check_auth_status` in Slack to see the full picture."
|
|
92
|
+
)
|
|
93
|
+
logger.error("Circuit still open for %s (count=%d)", source, circuit["count"])
|
|
94
|
+
slack_alert(bot_token, channel, msg)
|
|
95
|
+
# else: within cooldown window — suppress
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _close_if_open(bot_token: str, channel: str, source: str) -> None:
|
|
99
|
+
"""If circuit was open, close it and post a recovery alert."""
|
|
100
|
+
circuit = _circuits.pop(source, None)
|
|
101
|
+
if circuit is None:
|
|
102
|
+
return
|
|
103
|
+
duration_mins = int((time.time() - circuit["opened_at"]) / 60)
|
|
104
|
+
msg = (
|
|
105
|
+
f":white_check_mark: *Sentinel — Claude auth restored ({source})*\n"
|
|
106
|
+
f"Fixed after {duration_mins} min. Total failures during outage: {circuit['count']}."
|
|
107
|
+
)
|
|
108
|
+
logger.info("Circuit closed for %s after %d min, %d failures", source, duration_mins, circuit["count"])
|
|
109
|
+
slack_alert(bot_token, channel, msg)
|
|
110
|
+
|
|
111
|
+
|
|
31
112
|
def rate_limit_message(source: str, raw: str) -> str:
|
|
32
|
-
"""Produce a human-readable Slack alert for a rate-limit event."""
|
|
113
|
+
"""Produce a human-readable Slack alert for a rate-limit / auth event (first occurrence)."""
|
|
33
114
|
snippet = raw.strip()[:300].replace("\n", " ")
|
|
34
115
|
return (
|
|
35
116
|
f":warning: *Sentinel — Claude usage/auth problem ({source})*\n"
|
|
@@ -37,9 +118,10 @@ def rate_limit_message(source: str, raw: str) -> str:
|
|
|
37
118
|
f"```{snippet}```\n"
|
|
38
119
|
f"*What to check:*\n"
|
|
39
120
|
f"• API key: verify `ANTHROPIC_API_KEY` in `sentinel.properties` is valid and has credit\n"
|
|
40
|
-
f"• Claude Pro: run `claude login` on the server to refresh OAuth\n"
|
|
41
|
-
f"• Both: at least one
|
|
42
|
-
f"
|
|
121
|
+
f"• Claude Pro: run `claude login` on the server to refresh the OAuth session\n"
|
|
122
|
+
f"• Both: Sentinel tries both methods — at least one must be working\n"
|
|
123
|
+
f"Repeat alerts will be suppressed for 1 hour. "
|
|
124
|
+
f"Run `check_auth_status` in Slack to see current state."
|
|
43
125
|
)
|
|
44
126
|
|
|
45
127
|
|
|
@@ -77,12 +159,15 @@ def alert_if_rate_limited(
|
|
|
77
159
|
output: str,
|
|
78
160
|
) -> bool:
|
|
79
161
|
"""
|
|
80
|
-
Check output for rate-limit / auth signals.
|
|
81
|
-
|
|
162
|
+
Check output for rate-limit / auth signals and manage the circuit breaker.
|
|
163
|
+
|
|
164
|
+
- Rate limited → open/keep-open circuit, alert (with cooldown suppression)
|
|
165
|
+
- Not limited → close circuit if it was open (recovery alert), return False
|
|
166
|
+
|
|
167
|
+
Returns True if a rate-limit signal was found.
|
|
82
168
|
"""
|
|
83
169
|
if not is_rate_limited(output):
|
|
170
|
+
_close_if_open(bot_token, channel, source)
|
|
84
171
|
return False
|
|
85
|
-
|
|
86
|
-
logger.error("Claude rate-limit/auth failure in %s: %s", source, output[:200])
|
|
87
|
-
slack_alert(bot_token, channel, msg)
|
|
172
|
+
_open_or_repeat(bot_token, channel, source, output)
|
|
88
173
|
return True
|
|
@@ -137,6 +137,7 @@ reply with a short summary grouped by category:
|
|
|
137
137
|
*Fix management*
|
|
138
138
|
• `get_fix_details` — full details of a specific fix — "show fix abc123"
|
|
139
139
|
• `list_pending_prs` — all open Sentinel PRs awaiting review — "list open PRs"
|
|
140
|
+
• `check_auth_status` — Claude auth health, rate-limit circuit state, fix engine 24 h stats — "is Claude working?", "any rate limits?", "auth issues?"
|
|
140
141
|
|
|
141
142
|
*Project & task delivery*
|
|
142
143
|
• `list_projects` — all projects and repos Sentinel manages — "what projects do you manage?"
|
|
@@ -275,6 +276,16 @@ _TOOLS = [
|
|
|
275
276
|
"description": "List all open Sentinel PRs awaiting admin review.",
|
|
276
277
|
"input_schema": {"type": "object", "properties": {}},
|
|
277
278
|
},
|
|
279
|
+
{
|
|
280
|
+
"name": "check_auth_status",
|
|
281
|
+
"description": (
|
|
282
|
+
"Check Claude authentication health, current rate-limit / usage-limit circuit state, "
|
|
283
|
+
"and fix engine stats for the last 24 h. "
|
|
284
|
+
"Use when someone asks: 'is Claude working?', 'any rate limits?', 'why aren't fixes running?', "
|
|
285
|
+
"'is the API key OK?', 'auth issues?', 'fix engine status'."
|
|
286
|
+
),
|
|
287
|
+
"input_schema": {"type": "object", "properties": {}},
|
|
288
|
+
},
|
|
278
289
|
{
|
|
279
290
|
"name": "pause_sentinel",
|
|
280
291
|
"description": (
|
|
@@ -798,6 +809,67 @@ async def _run_tool(name: str, inputs: dict, cfg_loader, store, slack_client=Non
|
|
|
798
809
|
"sentinel_paused": Path("SENTINEL_PAUSE").exists(),
|
|
799
810
|
})
|
|
800
811
|
|
|
812
|
+
if name == "check_auth_status":
|
|
813
|
+
import subprocess as _sp
|
|
814
|
+
from .notify import get_circuit_status
|
|
815
|
+
cfg = cfg_loader.sentinel
|
|
816
|
+
|
|
817
|
+
# Auth configuration
|
|
818
|
+
has_key = bool(cfg.anthropic_api_key)
|
|
819
|
+
pro_for_tasks = cfg.claude_pro_for_tasks
|
|
820
|
+
if pro_for_tasks and has_key:
|
|
821
|
+
primary, fallback = "claude_pro_oauth", "api_key"
|
|
822
|
+
elif pro_for_tasks:
|
|
823
|
+
primary, fallback = "claude_pro_oauth", None
|
|
824
|
+
else:
|
|
825
|
+
primary, fallback = "api_key", "claude_pro_oauth" if not has_key else "claude_pro_oauth"
|
|
826
|
+
|
|
827
|
+
# Claude CLI liveness check
|
|
828
|
+
cli_ok, cli_version = False, ""
|
|
829
|
+
try:
|
|
830
|
+
r = _sp.run(
|
|
831
|
+
[cfg.claude_code_bin, "--version"],
|
|
832
|
+
capture_output=True, text=True, timeout=10,
|
|
833
|
+
)
|
|
834
|
+
if r.returncode == 0:
|
|
835
|
+
cli_ok = True
|
|
836
|
+
cli_version = r.stdout.strip() or r.stderr.strip()
|
|
837
|
+
except Exception:
|
|
838
|
+
pass
|
|
839
|
+
|
|
840
|
+
# Circuit breaker snapshot — only open (unhealthy) circuits appear here
|
|
841
|
+
circuits = get_circuit_status()
|
|
842
|
+
|
|
843
|
+
# Fix engine stats (last 24 h)
|
|
844
|
+
recent = store.get_recent_fixes(hours=24)
|
|
845
|
+
counts = {"applied": 0, "failed": 0, "skipped": 0, "pending": 0}
|
|
846
|
+
last_success = None
|
|
847
|
+
for f in recent:
|
|
848
|
+
s = f.get("status", "")
|
|
849
|
+
if s in counts:
|
|
850
|
+
counts[s] += 1
|
|
851
|
+
if s == "applied" and not last_success:
|
|
852
|
+
last_success = f.get("timestamp", "")
|
|
853
|
+
|
|
854
|
+
overall = "healthy"
|
|
855
|
+
if circuits:
|
|
856
|
+
overall = "degraded — rate/auth limit active on: " + ", ".join(circuits)
|
|
857
|
+
elif not cli_ok:
|
|
858
|
+
overall = "warning — claude CLI not reachable"
|
|
859
|
+
|
|
860
|
+
return json.dumps({
|
|
861
|
+
"overall": overall,
|
|
862
|
+
"auth": {
|
|
863
|
+
"api_key_configured": has_key,
|
|
864
|
+
"claude_pro_for_tasks": pro_for_tasks,
|
|
865
|
+
"primary_method": primary,
|
|
866
|
+
"fallback_method": fallback,
|
|
867
|
+
},
|
|
868
|
+
"claude_cli": {"available": cli_ok, "version": cli_version},
|
|
869
|
+
"rate_limit_circuits": circuits,
|
|
870
|
+
"fix_engine_24h": {**counts, "last_successful_fix": last_success},
|
|
871
|
+
})
|
|
872
|
+
|
|
801
873
|
if name == "create_issue":
|
|
802
874
|
description = inputs["description"]
|
|
803
875
|
target_repo = inputs.get("target_repo", "")
|