@misterhuydo/sentinel 1.2.7 → 1.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.cairn/session.json +2 -2
- package/package.json +1 -1
- package/python/sentinel/fix_engine.py +24 -5
- package/python/sentinel/issue_watcher.py +8 -0
- package/python/sentinel/main.py +46 -32
- package/python/sentinel/notify.py +249 -173
package/.cairn/session.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
|
-
"message": "Auto-checkpoint at 2026-03-
|
|
3
|
-
"checkpoint_at": "2026-03-
|
|
2
|
+
"message": "Auto-checkpoint at 2026-03-23T12:00:29.548Z",
|
|
3
|
+
"checkpoint_at": "2026-03-23T12:00:29.550Z",
|
|
4
4
|
"active_files": [],
|
|
5
5
|
"notes": [],
|
|
6
6
|
"mtime_snapshot": {}
|
package/package.json
CHANGED
|
package/python/sentinel/fix_engine.py
CHANGED
|
@@ -80,9 +80,22 @@ def _build_prompt(event, repo: RepoConfig, log_file, marker: str, stale_markers:
|
|
|
80
80
|
f"1. {step1}",
|
|
81
81
|
"2. Use your available tools to explore the codebase and identify the root cause.",
|
|
82
82
|
f"3. {marker_instruction}",
|
|
83
|
-
"4.
|
|
84
|
-
"
|
|
85
|
-
"
|
|
83
|
+
"4. Consider all possible fix approaches. For each, weigh:",
|
|
84
|
+
" - Confidence: is this definitely the root cause?",
|
|
85
|
+
" - Safety: could this break other functionality?",
|
|
86
|
+
" - Scope: is it minimal and targeted?",
|
|
87
|
+
" Choose the safest minimal approach. If multiple valid options exist, pick the one",
|
|
88
|
+
" with highest confidence and lowest blast radius.",
|
|
89
|
+
"5. Output ONLY a unified diff patch (git diff format) for the chosen fix.",
|
|
90
|
+
"6. Do not explain. Output only the patch.",
|
|
91
|
+
"7. Only if you truly cannot produce a safe fix — e.g. the root cause requires a",
|
|
92
|
+
" DB schema change, infrastructure update, business logic decision, or is inside",
|
|
93
|
+
" a third-party library — output exactly:",
|
|
94
|
+
" NEEDS_HUMAN: <explanation>",
|
|
95
|
+
" Include: (a) root cause identified, (b) approaches you considered and why each",
|
|
96
|
+
" was insufficient or unsafe, (c) exactly what a human needs to do or decide.",
|
|
97
|
+
" Do NOT output NEEDS_HUMAN just because the fix is complex — only when human",
|
|
98
|
+
" judgement or access is genuinely required.",
|
|
86
99
|
]
|
|
87
100
|
return "\n".join(lines_out)
|
|
88
101
|
|
|
@@ -142,13 +155,14 @@ def generate_fix(
|
|
|
142
155
|
repo: RepoConfig,
|
|
143
156
|
cfg: SentinelConfig,
|
|
144
157
|
patches_dir: Path,
|
|
158
|
+
store=None,
|
|
145
159
|
) -> tuple[str, Path | None, str]:
|
|
146
160
|
"""
|
|
147
161
|
Generate a fix for the given error event.
|
|
148
162
|
|
|
149
163
|
Returns:
|
|
150
164
|
(status, patch_path, marker)
|
|
151
|
-
status: "patch" | "skip" | "error"
|
|
165
|
+
status: "patch" | "skip" | "needs_human" | "error"
|
|
152
166
|
|
|
153
167
|
Auth strategy — API key and Claude Pro (OAuth) are interchangeable:
|
|
154
168
|
Primary : Claude Pro (OAuth) if claude_pro_for_tasks=True, else API key
|
|
@@ -237,10 +251,15 @@ def generate_fix(
|
|
|
237
251
|
output=output,
|
|
238
252
|
)
|
|
239
253
|
|
|
254
|
+
if output.strip().upper().startswith("NEEDS_HUMAN:"):
|
|
255
|
+
reason = output.strip()[len("NEEDS_HUMAN:"):].strip()
|
|
256
|
+
logger.info("Claude needs human for %s: %s", event.fingerprint, reason[:200])
|
|
257
|
+
return "needs_human", None, reason
|
|
258
|
+
|
|
240
259
|
if output.strip().upper().startswith("SKIP:"):
|
|
241
260
|
reason = output.strip()[5:].strip()
|
|
242
261
|
logger.info("Claude skipped fix for %s: %s", event.fingerprint, reason)
|
|
243
|
-
return "skip", None,
|
|
262
|
+
return "skip", None, reason
|
|
244
263
|
|
|
245
264
|
patch = _extract_patch(output)
|
|
246
265
|
if not patch:
|
|
package/python/sentinel/issue_watcher.py
CHANGED
|
@@ -41,6 +41,7 @@ class IssueEvent:
|
|
|
41
41
|
fingerprint: str = ""
|
|
42
42
|
severity: str = "ERROR"
|
|
43
43
|
timestamp: str = ""
|
|
44
|
+
submitter_user_id: str = "" # Slack user ID who raised this via Boss, if known
|
|
44
45
|
|
|
45
46
|
# Compatibility fields matching ErrorEvent interface
|
|
46
47
|
level: str = "ERROR"
|
|
@@ -53,6 +54,13 @@ class IssueEvent:
|
|
|
53
54
|
if not self.fingerprint:
|
|
54
55
|
raw = f"issue:{self.source}:{self.message[:200]}"
|
|
55
56
|
self.fingerprint = hashlib.sha1(raw.encode()).hexdigest()[:16]
|
|
57
|
+
if not self.submitter_user_id:
|
|
58
|
+
import re as _re
|
|
59
|
+
for _line in self.body.splitlines():
|
|
60
|
+
_m = _re.match(r'SUBMITTED_BY:.*\(([UW][A-Z0-9]+)\)', _line.strip())
|
|
61
|
+
if _m:
|
|
62
|
+
self.submitter_user_id = _m.group(1)
|
|
63
|
+
break
|
|
56
64
|
if not self.timestamp:
|
|
57
65
|
self.timestamp = datetime.now(timezone.utc).isoformat()
|
|
58
66
|
if not self.stack_trace:
|
package/python/sentinel/main.py
CHANGED
|
@@ -28,6 +28,7 @@ from .log_parser import parse_all, scan_all_for_markers, ErrorEvent
|
|
|
28
28
|
from .issue_watcher import scan_issues, mark_done, IssueEvent
|
|
29
29
|
from .repo_router import route
|
|
30
30
|
from .reporter import build_and_send, send_fix_notification, send_failure_notification, send_confirmed_notification, send_regression_notification, send_startup_notification, send_upgrade_notification
|
|
31
|
+
from .notify import notify_fix_blocked
|
|
31
32
|
from .health_checker import evaluate_repos
|
|
32
33
|
from .state_store import StateStore
|
|
33
34
|
|
|
@@ -87,15 +88,21 @@ async def _handle_error(event: ErrorEvent, cfg_loader: ConfigLoader, store: Stat
|
|
|
87
88
|
status, patch_path, marker = generate_fix(event, repo, sentinel, patches_dir, store)
|
|
88
89
|
|
|
89
90
|
if status != "patch" or patch_path is None:
|
|
90
|
-
outcome = "skipped" if status
|
|
91
|
+
outcome = "skipped" if status in ("skip", "needs_human") else "failed"
|
|
91
92
|
store.record_fix(event.fingerprint, outcome, repo_name=repo.repo_name)
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
93
|
+
# For log-detected errors: NEEDS_HUMAN -> DM/channel; SKIP -> email only (not spam)
|
|
94
|
+
if status == "needs_human":
|
|
95
|
+
notify_fix_blocked(sentinel, event.source, event.message,
|
|
96
|
+
reason=marker, repo_name=repo.repo_name,
|
|
97
|
+
submitter_user_id="")
|
|
98
|
+
else:
|
|
99
|
+
send_failure_notification(sentinel, {
|
|
100
|
+
"source": event.source,
|
|
101
|
+
"message": event.message,
|
|
102
|
+
"repo_name": repo.repo_name,
|
|
103
|
+
"reason": f"Claude Code returned {status.upper()}",
|
|
104
|
+
"body": event.full_text()[:500],
|
|
105
|
+
})
|
|
99
106
|
return
|
|
100
107
|
|
|
101
108
|
commit_status, commit_hash = apply_and_commit(event, patch_path, repo, sentinel)
|
|
@@ -105,7 +112,7 @@ async def _handle_error(event: ErrorEvent, cfg_loader: ConfigLoader, store: Stat
|
|
|
105
112
|
"source": event.source,
|
|
106
113
|
"message": event.message,
|
|
107
114
|
"repo_name": repo.repo_name,
|
|
108
|
-
"reason": "
|
|
115
|
+
"reason": "Patch was generated but commit/tests failed",
|
|
109
116
|
"body": event.full_text()[:500],
|
|
110
117
|
})
|
|
111
118
|
return
|
|
@@ -179,28 +186,25 @@ async def _handle_issue(event: IssueEvent, cfg_loader: ConfigLoader, store: Stat
|
|
|
179
186
|
status, patch_path, marker = generate_fix(event, repo, sentinel, patches_dir, store)
|
|
180
187
|
|
|
181
188
|
if status != "patch" or patch_path is None:
|
|
182
|
-
store.record_fix(event.fingerprint, "skipped" if status
|
|
189
|
+
store.record_fix(event.fingerprint, "skipped" if status in ("skip", "needs_human") else "failed",
|
|
183
190
|
repo_name=repo.repo_name)
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
})
|
|
191
|
+
# For user-submitted issues: always notify (person is waiting)
|
|
192
|
+
submitter_uid = getattr(event, "submitter_user_id", "")
|
|
193
|
+
reason_text = marker if status == "needs_human" else f"Claude Code returned {status.upper()}"
|
|
194
|
+
notify_fix_blocked(sentinel, event.source, event.message,
|
|
195
|
+
reason=reason_text, repo_name=repo.repo_name,
|
|
196
|
+
submitter_user_id=submitter_uid)
|
|
191
197
|
mark_done(event.issue_file)
|
|
192
198
|
return
|
|
193
199
|
|
|
194
200
|
commit_status, commit_hash = apply_and_commit(event, patch_path, repo, sentinel)
|
|
195
201
|
if commit_status != "committed":
|
|
196
202
|
store.record_fix(event.fingerprint, "failed", repo_name=repo.repo_name)
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
"body": event.body[:500],
|
|
203
|
-
})
|
|
203
|
+
submitter_uid = getattr(event, "submitter_user_id", "")
|
|
204
|
+
notify_fix_blocked(sentinel, event.source, event.message,
|
|
205
|
+
reason="Patch was generated but commit/tests failed",
|
|
206
|
+
repo_name=repo.repo_name,
|
|
207
|
+
submitter_user_id=submitter_uid)
|
|
204
208
|
mark_done(event.issue_file)
|
|
205
209
|
return
|
|
206
210
|
|
|
@@ -303,21 +307,31 @@ async def poll_cycle(cfg_loader: ConfigLoader, store: StateStore):
|
|
|
303
307
|
|
|
304
308
|
# -- Health URL checks -------------------------------------------------------
|
|
305
309
|
if cfg_loader.repos:
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
310
|
+
import asyncio as _asyncio
|
|
311
|
+
_loop = _asyncio.get_event_loop()
|
|
312
|
+
health_results = await _loop.run_in_executor(
|
|
313
|
+
None,
|
|
314
|
+
lambda: evaluate_repos(
|
|
315
|
+
cfg_loader.repos, cfg_loader.log_sources,
|
|
316
|
+
cfg_loader.sentinel.workspace_dir, store=store,
|
|
317
|
+
)
|
|
309
318
|
)
|
|
310
319
|
for hr in health_results:
|
|
311
320
|
if hr["action"] == "fix":
|
|
312
321
|
fp = f"health-{hr['repo_name']}"
|
|
313
322
|
store.record_error(fp, f"health_checker/{hr['repo_name']}", hr["message"])
|
|
314
323
|
if not store.fix_attempted_recently(fp, hours=6):
|
|
315
|
-
|
|
324
|
+
from .log_parser import ErrorEvent as _EE
|
|
325
|
+
from datetime import datetime, timezone as _tz
|
|
326
|
+
synth = _EE(
|
|
316
327
|
source=f"health_checker/{hr['repo_name']}",
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
328
|
+
log_file="",
|
|
329
|
+
timestamp=datetime.now(_tz.utc).isoformat(),
|
|
330
|
+
level="ERROR",
|
|
331
|
+
thread="health_checker",
|
|
332
|
+
logger_name="health_checker",
|
|
333
|
+
message=f"App startup failure detected: {hr['message']}",
|
|
334
|
+
stack_trace=[hr["startup_failure_line"]] if hr["startup_failure_line"] else [],
|
|
321
335
|
)
|
|
322
336
|
synth.fingerprint = fp
|
|
323
337
|
await _handle_error(synth, cfg_loader, store)
|
|
package/python/sentinel/notify.py
CHANGED
|
@@ -1,173 +1,249 @@
|
|
|
1
|
-
"""
|
|
2
|
-
notify.py — Best-effort Slack alerts from any Sentinel module.
|
|
3
|
-
|
|
4
|
-
Uses the Slack Web API directly (no Bolt / Socket Mode required).
|
|
5
|
-
Calls never raise — failures are logged and silently dropped.
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
import logging
|
|
9
|
-
import re
|
|
10
|
-
import time
|
|
11
|
-
|
|
12
|
-
import requests
|
|
13
|
-
|
|
14
|
-
logger = logging.getLogger(__name__)
|
|
15
|
-
|
|
16
|
-
# ── Rate-limit / auth-failure detector ────────────────────────────────────────
|
|
17
|
-
|
|
18
|
-
_RATE_LIMIT_RE = re.compile(
|
|
19
|
-
r"rate.?limit|usage.?limit|too many requests|quota.?exceeded"
|
|
20
|
-
r"|overloaded|credit.?balance|billing|529"
|
|
21
|
-
r"|not.?authenticated|invalid.?api.?key|authentication.?fail"
|
|
22
|
-
r"|claude\.ai subscription|pro.?plan|login required",
|
|
23
|
-
re.IGNORECASE,
|
|
24
|
-
)
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
def is_rate_limited(text: str) -> bool:
|
|
28
|
-
"""Return True if the text contains a rate-limit or auth-failure signal."""
|
|
29
|
-
return bool(_RATE_LIMIT_RE.search(text))
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
# ── Circuit breaker ────────────────────────────────────────────────────────────
|
|
33
|
-
#
|
|
34
|
-
# Prevents alert storms when Claude is persistently rate-limited.
|
|
35
|
-
# Each `source` string gets its own independent circuit:
|
|
36
|
-
# CLOSED → normal; alerts pass through immediately
|
|
37
|
-
# OPEN → suppressed; one re-alert every CIRCUIT_COOLDOWN_SECONDS
|
|
38
|
-
#
|
|
39
|
-
# On recovery (first non-rate-limited output after OPEN):
|
|
40
|
-
# → post "resolved" to Slack, close the circuit
|
|
41
|
-
|
|
42
|
-
CIRCUIT_COOLDOWN_SECONDS = 3600 # 1 h between repeat alerts while open
|
|
43
|
-
|
|
44
|
-
# source → {opened_at, last_alerted_at, count}
|
|
45
|
-
_circuits: dict[str, dict] = {}
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
def get_circuit_status() -> dict:
|
|
49
|
-
"""
|
|
50
|
-
Return a snapshot of all open circuits.
|
|
51
|
-
Used by the `check_auth_status` Boss tool.
|
|
52
|
-
|
|
53
|
-
Returns:
|
|
54
|
-
{ source: { state, opened_at, open_for_seconds, alert_count } }
|
|
55
|
-
Only open circuits are included; an empty dict means everything is healthy.
|
|
56
|
-
"""
|
|
57
|
-
now = time.time()
|
|
58
|
-
return {
|
|
59
|
-
src: {
|
|
60
|
-
"state": "open",
|
|
61
|
-
"opened_at": c["opened_at"],
|
|
62
|
-
"open_for_seconds": int(now - c["opened_at"]),
|
|
63
|
-
"alert_count": c["count"],
|
|
64
|
-
}
|
|
65
|
-
for src, c in _circuits.items()
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
def _open_or_repeat(bot_token: str, channel: str, source: str, output: str) -> None:
|
|
70
|
-
"""Open circuit on first hit; re-alert after cooldown if still failing."""
|
|
71
|
-
now = time.time()
|
|
72
|
-
circuit = _circuits.get(source)
|
|
73
|
-
|
|
74
|
-
if circuit is None:
|
|
75
|
-
# First occurrence — open and alert immediately
|
|
76
|
-
_circuits[source] = {"opened_at": now, "last_alerted_at": now, "count": 1}
|
|
77
|
-
logger.error("Circuit opened for %s: %s", source, output[:200])
|
|
78
|
-
slack_alert(bot_token, channel, rate_limit_message(source, output))
|
|
79
|
-
return
|
|
80
|
-
|
|
81
|
-
circuit["count"] += 1
|
|
82
|
-
elapsed = now - circuit["last_alerted_at"]
|
|
83
|
-
if elapsed >= CIRCUIT_COOLDOWN_SECONDS:
|
|
84
|
-
# Still failing after cooldown — remind admins once per hour
|
|
85
|
-
circuit["last_alerted_at"] = now
|
|
86
|
-
open_mins = int((now - circuit["opened_at"]) / 60)
|
|
87
|
-
msg = (
|
|
88
|
-
f":warning: *Sentinel — Claude usage/auth problem still active ({source})*\n"
|
|
89
|
-
f"Still failing after {open_mins} minutes. Total occurrences: {circuit['count']}.\n"
|
|
90
|
-
f"Last error:\n```{output.strip()[:300]}```\n"
|
|
91
|
-
f"Run `check_auth_status` in Slack to see the full picture."
|
|
92
|
-
)
|
|
93
|
-
logger.error("Circuit still open for %s (count=%d)", source, circuit["count"])
|
|
94
|
-
slack_alert(bot_token, channel, msg)
|
|
95
|
-
# else: within cooldown window — suppress
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
def _close_if_open(bot_token: str, channel: str, source: str) -> None:
|
|
99
|
-
"""If circuit was open, close it and post a recovery alert."""
|
|
100
|
-
circuit = _circuits.pop(source, None)
|
|
101
|
-
if circuit is None:
|
|
102
|
-
return
|
|
103
|
-
duration_mins = int((time.time() - circuit["opened_at"]) / 60)
|
|
104
|
-
msg = (
|
|
105
|
-
f":white_check_mark: *Sentinel — Claude auth restored ({source})*\n"
|
|
106
|
-
f"Fixed after {duration_mins} min. Total failures during outage: {circuit['count']}."
|
|
107
|
-
)
|
|
108
|
-
logger.info("Circuit closed for %s after %d min, %d failures", source, duration_mins, circuit["count"])
|
|
109
|
-
slack_alert(bot_token, channel, msg)
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
def rate_limit_message(source: str, raw: str) -> str:
|
|
113
|
-
"""Produce a human-readable Slack alert for a rate-limit / auth event (first occurrence)."""
|
|
114
|
-
snippet = raw.strip()[:300].replace("\n", " ")
|
|
115
|
-
return (
|
|
116
|
-
f":warning: *Sentinel — Claude usage/auth problem ({source})*\n"
|
|
117
|
-
f"Claude returned an error that requires admin attention:\n"
|
|
118
|
-
f"```{snippet}```\n"
|
|
119
|
-
f"*What to check:*\n"
|
|
120
|
-
f"• API key: verify `ANTHROPIC_API_KEY` in `sentinel.properties` is valid and has credit\n"
|
|
121
|
-
f"• Claude Pro: run `claude login` on the server to refresh the OAuth session\n"
|
|
122
|
-
f"• Both: Sentinel tries both methods — at least one must be working\n"
|
|
123
|
-
f"Repeat alerts will be suppressed for 1 hour. "
|
|
124
|
-
f"Run `check_auth_status` in Slack to see current state."
|
|
125
|
-
)
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
# ── Alert dispatcher ──────────────────────────────────────────────────────────
|
|
129
|
-
|
|
130
|
-
def slack_alert(bot_token: str, channel: str, text: str) -> None:
|
|
131
|
-
"""
|
|
132
|
-
Post a plain-text alert to a Slack channel.
|
|
133
|
-
Best-effort: logs on failure, never raises.
|
|
134
|
-
"""
|
|
135
|
-
if not bot_token or not channel:
|
|
136
|
-
logger.debug("slack_alert: no token/channel configured — logging only: %s", text[:120])
|
|
137
|
-
return
|
|
138
|
-
try:
|
|
139
|
-
resp = requests.post(
|
|
140
|
-
"https://slack.com/api/chat.postMessage",
|
|
141
|
-
headers={
|
|
142
|
-
"Authorization": f"Bearer {bot_token}",
|
|
143
|
-
"Content-Type": "application/json",
|
|
144
|
-
},
|
|
145
|
-
json={"channel": channel, "text": text},
|
|
146
|
-
timeout=10,
|
|
147
|
-
)
|
|
148
|
-
data = resp.json()
|
|
149
|
-
if not data.get("ok"):
|
|
150
|
-
logger.warning("slack_alert: Slack API error: %s", data.get("error"))
|
|
151
|
-
except Exception as exc:
|
|
152
|
-
logger.warning("slack_alert: failed to post: %s", exc)
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
"""
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
1
|
+
"""
|
|
2
|
+
notify.py — Best-effort Slack alerts from any Sentinel module.
|
|
3
|
+
|
|
4
|
+
Uses the Slack Web API directly (no Bolt / Socket Mode required).
|
|
5
|
+
Calls never raise — failures are logged and silently dropped.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
import re
|
|
10
|
+
import time
|
|
11
|
+
|
|
12
|
+
import requests
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
# ── Rate-limit / auth-failure detector ────────────────────────────────────────
|
|
17
|
+
|
|
18
|
+
_RATE_LIMIT_RE = re.compile(
|
|
19
|
+
r"rate.?limit|usage.?limit|too many requests|quota.?exceeded"
|
|
20
|
+
r"|overloaded|credit.?balance|billing|529"
|
|
21
|
+
r"|not.?authenticated|invalid.?api.?key|authentication.?fail"
|
|
22
|
+
r"|claude\.ai subscription|pro.?plan|login required",
|
|
23
|
+
re.IGNORECASE,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def is_rate_limited(text: str) -> bool:
|
|
28
|
+
"""Return True if the text contains a rate-limit or auth-failure signal."""
|
|
29
|
+
return bool(_RATE_LIMIT_RE.search(text))
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# ── Circuit breaker ────────────────────────────────────────────────────────────
|
|
33
|
+
#
|
|
34
|
+
# Prevents alert storms when Claude is persistently rate-limited.
|
|
35
|
+
# Each `source` string gets its own independent circuit:
|
|
36
|
+
# CLOSED → normal; alerts pass through immediately
|
|
37
|
+
# OPEN → suppressed; one re-alert every CIRCUIT_COOLDOWN_SECONDS
|
|
38
|
+
#
|
|
39
|
+
# On recovery (first non-rate-limited output after OPEN):
|
|
40
|
+
# → post "resolved" to Slack, close the circuit
|
|
41
|
+
|
|
42
|
+
CIRCUIT_COOLDOWN_SECONDS = 3600 # 1 h between repeat alerts while open
|
|
43
|
+
|
|
44
|
+
# source → {opened_at, last_alerted_at, count}
|
|
45
|
+
_circuits: dict[str, dict] = {}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def get_circuit_status() -> dict:
|
|
49
|
+
"""
|
|
50
|
+
Return a snapshot of all open circuits.
|
|
51
|
+
Used by the `check_auth_status` Boss tool.
|
|
52
|
+
|
|
53
|
+
Returns:
|
|
54
|
+
{ source: { state, opened_at, open_for_seconds, alert_count } }
|
|
55
|
+
Only open circuits are included; an empty dict means everything is healthy.
|
|
56
|
+
"""
|
|
57
|
+
now = time.time()
|
|
58
|
+
return {
|
|
59
|
+
src: {
|
|
60
|
+
"state": "open",
|
|
61
|
+
"opened_at": c["opened_at"],
|
|
62
|
+
"open_for_seconds": int(now - c["opened_at"]),
|
|
63
|
+
"alert_count": c["count"],
|
|
64
|
+
}
|
|
65
|
+
for src, c in _circuits.items()
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _open_or_repeat(bot_token: str, channel: str, source: str, output: str) -> None:
|
|
70
|
+
"""Open circuit on first hit; re-alert after cooldown if still failing."""
|
|
71
|
+
now = time.time()
|
|
72
|
+
circuit = _circuits.get(source)
|
|
73
|
+
|
|
74
|
+
if circuit is None:
|
|
75
|
+
# First occurrence — open and alert immediately
|
|
76
|
+
_circuits[source] = {"opened_at": now, "last_alerted_at": now, "count": 1}
|
|
77
|
+
logger.error("Circuit opened for %s: %s", source, output[:200])
|
|
78
|
+
slack_alert(bot_token, channel, rate_limit_message(source, output))
|
|
79
|
+
return
|
|
80
|
+
|
|
81
|
+
circuit["count"] += 1
|
|
82
|
+
elapsed = now - circuit["last_alerted_at"]
|
|
83
|
+
if elapsed >= CIRCUIT_COOLDOWN_SECONDS:
|
|
84
|
+
# Still failing after cooldown — remind admins once per hour
|
|
85
|
+
circuit["last_alerted_at"] = now
|
|
86
|
+
open_mins = int((now - circuit["opened_at"]) / 60)
|
|
87
|
+
msg = (
|
|
88
|
+
f":warning: *Sentinel — Claude usage/auth problem still active ({source})*\n"
|
|
89
|
+
f"Still failing after {open_mins} minutes. Total occurrences: {circuit['count']}.\n"
|
|
90
|
+
f"Last error:\n```{output.strip()[:300]}```\n"
|
|
91
|
+
f"Run `check_auth_status` in Slack to see the full picture."
|
|
92
|
+
)
|
|
93
|
+
logger.error("Circuit still open for %s (count=%d)", source, circuit["count"])
|
|
94
|
+
slack_alert(bot_token, channel, msg)
|
|
95
|
+
# else: within cooldown window — suppress
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _close_if_open(bot_token: str, channel: str, source: str) -> None:
|
|
99
|
+
"""If circuit was open, close it and post a recovery alert."""
|
|
100
|
+
circuit = _circuits.pop(source, None)
|
|
101
|
+
if circuit is None:
|
|
102
|
+
return
|
|
103
|
+
duration_mins = int((time.time() - circuit["opened_at"]) / 60)
|
|
104
|
+
msg = (
|
|
105
|
+
f":white_check_mark: *Sentinel — Claude auth restored ({source})*\n"
|
|
106
|
+
f"Fixed after {duration_mins} min. Total failures during outage: {circuit['count']}."
|
|
107
|
+
)
|
|
108
|
+
logger.info("Circuit closed for %s after %d min, %d failures", source, duration_mins, circuit["count"])
|
|
109
|
+
slack_alert(bot_token, channel, msg)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def rate_limit_message(source: str, raw: str) -> str:
|
|
113
|
+
"""Produce a human-readable Slack alert for a rate-limit / auth event (first occurrence)."""
|
|
114
|
+
snippet = raw.strip()[:300].replace("\n", " ")
|
|
115
|
+
return (
|
|
116
|
+
f":warning: *Sentinel — Claude usage/auth problem ({source})*\n"
|
|
117
|
+
f"Claude returned an error that requires admin attention:\n"
|
|
118
|
+
f"```{snippet}```\n"
|
|
119
|
+
f"*What to check:*\n"
|
|
120
|
+
f"• API key: verify `ANTHROPIC_API_KEY` in `sentinel.properties` is valid and has credit\n"
|
|
121
|
+
f"• Claude Pro: run `claude login` on the server to refresh the OAuth session\n"
|
|
122
|
+
f"• Both: Sentinel tries both methods — at least one must be working\n"
|
|
123
|
+
f"Repeat alerts will be suppressed for 1 hour. "
|
|
124
|
+
f"Run `check_auth_status` in Slack to see current state."
|
|
125
|
+
)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
# ── Alert dispatcher ──────────────────────────────────────────────────────────
|
|
129
|
+
|
|
130
|
+
def slack_alert(bot_token: str, channel: str, text: str) -> None:
|
|
131
|
+
"""
|
|
132
|
+
Post a plain-text alert to a Slack channel.
|
|
133
|
+
Best-effort: logs on failure, never raises.
|
|
134
|
+
"""
|
|
135
|
+
if not bot_token or not channel:
|
|
136
|
+
logger.debug("slack_alert: no token/channel configured — logging only: %s", text[:120])
|
|
137
|
+
return
|
|
138
|
+
try:
|
|
139
|
+
resp = requests.post(
|
|
140
|
+
"https://slack.com/api/chat.postMessage",
|
|
141
|
+
headers={
|
|
142
|
+
"Authorization": f"Bearer {bot_token}",
|
|
143
|
+
"Content-Type": "application/json",
|
|
144
|
+
},
|
|
145
|
+
json={"channel": channel, "text": text},
|
|
146
|
+
timeout=10,
|
|
147
|
+
)
|
|
148
|
+
data = resp.json()
|
|
149
|
+
if not data.get("ok"):
|
|
150
|
+
logger.warning("slack_alert: Slack API error: %s", data.get("error"))
|
|
151
|
+
except Exception as exc:
|
|
152
|
+
logger.warning("slack_alert: failed to post: %s", exc)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def slack_dm(bot_token: str, user_id: str, text: str) -> None:
|
|
157
|
+
"""
|
|
158
|
+
Send a direct message to a specific Slack user.
|
|
159
|
+
Opens a DM channel via conversations.open, then posts.
|
|
160
|
+
Best-effort: logs on failure, never raises.
|
|
161
|
+
"""
|
|
162
|
+
if not bot_token or not user_id:
|
|
163
|
+
logger.debug("slack_dm: no token/user_id — skipping DM")
|
|
164
|
+
return
|
|
165
|
+
try:
|
|
166
|
+
resp = requests.post(
|
|
167
|
+
"https://slack.com/api/conversations.open",
|
|
168
|
+
headers={"Authorization": f"Bearer {bot_token}", "Content-Type": "application/json"},
|
|
169
|
+
json={"users": user_id},
|
|
170
|
+
timeout=10,
|
|
171
|
+
)
|
|
172
|
+
data = resp.json()
|
|
173
|
+
if not data.get("ok"):
|
|
174
|
+
logger.warning("slack_dm: conversations.open failed: %s", data.get("error"))
|
|
175
|
+
return
|
|
176
|
+
dm_channel = data["channel"]["id"]
|
|
177
|
+
slack_alert(bot_token, dm_channel, text)
|
|
178
|
+
except Exception as exc:
|
|
179
|
+
logger.warning("slack_dm: failed to DM %s: %s", user_id, exc)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def notify_fix_blocked(
|
|
183
|
+
cfg,
|
|
184
|
+
source: str,
|
|
185
|
+
message: str,
|
|
186
|
+
reason: str,
|
|
187
|
+
repo_name: str = "",
|
|
188
|
+
submitter_user_id: str = "",
|
|
189
|
+
) -> None:
|
|
190
|
+
"""
|
|
191
|
+
Notify that a fix needs human intervention.
|
|
192
|
+
|
|
193
|
+
- If submitter_user_id is known: DM that person directly.
|
|
194
|
+
- Otherwise: @channel in the configured Slack channel.
|
|
195
|
+
- Always: email admins via reporter.send_failure_notification.
|
|
196
|
+
"""
|
|
197
|
+
short_reason = (reason or "Claude could not determine a safe fix.")[:600]
|
|
198
|
+
repo_line = f"\n*Repo:* {repo_name}" if repo_name else ""
|
|
199
|
+
|
|
200
|
+
slack_text = (
|
|
201
|
+
f":hand: *Fix blocked — human intervention needed*\n"
|
|
202
|
+
f"*Source:* {source}\n"
|
|
203
|
+
f"*Issue:* {message[:200]}{repo_line}\n"
|
|
204
|
+
f"*Reason:*\n{short_reason}"
|
|
205
|
+
)
|
|
206
|
+
|
|
207
|
+
if submitter_user_id:
|
|
208
|
+
slack_dm(cfg.slack_bot_token, submitter_user_id, slack_text)
|
|
209
|
+
else:
|
|
210
|
+
# No known submitter — broadcast to the whole channel
|
|
211
|
+
slack_alert(
|
|
212
|
+
cfg.slack_bot_token,
|
|
213
|
+
cfg.slack_channel,
|
|
214
|
+
f"<!channel> {slack_text}",
|
|
215
|
+
)
|
|
216
|
+
|
|
217
|
+
# Always email admins
|
|
218
|
+
try:
|
|
219
|
+
from .reporter import send_failure_notification
|
|
220
|
+
send_failure_notification(cfg, {
|
|
221
|
+
"source": source,
|
|
222
|
+
"message": message,
|
|
223
|
+
"repo_name": repo_name,
|
|
224
|
+
"reason": f"Needs human intervention: {short_reason[:200]}",
|
|
225
|
+
"body": reason,
|
|
226
|
+
})
|
|
227
|
+
except Exception as exc:
|
|
228
|
+
logger.warning("notify_fix_blocked: email notification failed: %s", exc)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def alert_if_rate_limited(
|
|
232
|
+
bot_token: str,
|
|
233
|
+
channel: str,
|
|
234
|
+
source: str,
|
|
235
|
+
output: str,
|
|
236
|
+
) -> bool:
|
|
237
|
+
"""
|
|
238
|
+
Check output for rate-limit / auth signals and manage the circuit breaker.
|
|
239
|
+
|
|
240
|
+
- Rate limited → open/keep-open circuit, alert (with cooldown suppression)
|
|
241
|
+
- Not limited → close circuit if it was open (recovery alert), return False
|
|
242
|
+
|
|
243
|
+
Returns True if a rate-limit signal was found.
|
|
244
|
+
"""
|
|
245
|
+
if not is_rate_limited(output):
|
|
246
|
+
_close_if_open(bot_token, channel, source)
|
|
247
|
+
return False
|
|
248
|
+
_open_or_repeat(bot_token, channel, source, output)
|
|
249
|
+
return True
|