@misterhuydo/sentinel 1.1.4 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.cairn/.hint-lock
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
2026-03-23T08:
|
|
1
|
+
2026-03-23T08:53:23.129Z
|
package/.cairn/session.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
|
-
"message": "Auto-checkpoint at 2026-03-23T08:
|
|
3
|
-
"checkpoint_at": "2026-03-23T08:
|
|
2
|
+
"message": "Auto-checkpoint at 2026-03-23T08:59:04.141Z",
|
|
3
|
+
"checkpoint_at": "2026-03-23T08:59:04.142Z",
|
|
4
4
|
"active_files": [],
|
|
5
5
|
"notes": [],
|
|
6
6
|
"mtime_snapshot": {}
|
package/package.json
CHANGED
|
@@ -7,6 +7,7 @@ Calls never raise — failures are logged and silently dropped.
|
|
|
7
7
|
|
|
8
8
|
import logging
|
|
9
9
|
import re
|
|
10
|
+
import time
|
|
10
11
|
|
|
11
12
|
import requests
|
|
12
13
|
|
|
@@ -28,8 +29,88 @@ def is_rate_limited(text: str) -> bool:
|
|
|
28
29
|
return bool(_RATE_LIMIT_RE.search(text))
|
|
29
30
|
|
|
30
31
|
|
|
32
|
+
# ── Circuit breaker ────────────────────────────────────────────────────────────
|
|
33
|
+
#
|
|
34
|
+
# Prevents alert storms when Claude is persistently rate-limited.
|
|
35
|
+
# Each `source` string gets its own independent circuit:
|
|
36
|
+
# CLOSED → normal; alerts pass through immediately
|
|
37
|
+
# OPEN → suppressed; one re-alert every CIRCUIT_COOLDOWN_SECONDS
|
|
38
|
+
#
|
|
39
|
+
# On recovery (first non-rate-limited output after OPEN):
|
|
40
|
+
# → post "resolved" to Slack, close the circuit
|
|
41
|
+
|
|
42
|
+
CIRCUIT_COOLDOWN_SECONDS = 3600 # 1 h between repeat alerts while open
|
|
43
|
+
|
|
44
|
+
# source → {opened_at, last_alerted_at, count}
|
|
45
|
+
_circuits: dict[str, dict] = {}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def get_circuit_status() -> dict:
|
|
49
|
+
"""
|
|
50
|
+
Return a snapshot of all open circuits.
|
|
51
|
+
Used by the `check_auth_status` Boss tool.
|
|
52
|
+
|
|
53
|
+
Returns:
|
|
54
|
+
{ source: { state, opened_at, open_for_seconds, alert_count } }
|
|
55
|
+
Only open circuits are included; an empty dict means everything is healthy.
|
|
56
|
+
"""
|
|
57
|
+
now = time.time()
|
|
58
|
+
return {
|
|
59
|
+
src: {
|
|
60
|
+
"state": "open",
|
|
61
|
+
"opened_at": c["opened_at"],
|
|
62
|
+
"open_for_seconds": int(now - c["opened_at"]),
|
|
63
|
+
"alert_count": c["count"],
|
|
64
|
+
}
|
|
65
|
+
for src, c in _circuits.items()
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _open_or_repeat(bot_token: str, channel: str, source: str, output: str) -> None:
|
|
70
|
+
"""Open circuit on first hit; re-alert after cooldown if still failing."""
|
|
71
|
+
now = time.time()
|
|
72
|
+
circuit = _circuits.get(source)
|
|
73
|
+
|
|
74
|
+
if circuit is None:
|
|
75
|
+
# First occurrence — open and alert immediately
|
|
76
|
+
_circuits[source] = {"opened_at": now, "last_alerted_at": now, "count": 1}
|
|
77
|
+
logger.error("Circuit opened for %s: %s", source, output[:200])
|
|
78
|
+
slack_alert(bot_token, channel, rate_limit_message(source, output))
|
|
79
|
+
return
|
|
80
|
+
|
|
81
|
+
circuit["count"] += 1
|
|
82
|
+
elapsed = now - circuit["last_alerted_at"]
|
|
83
|
+
if elapsed >= CIRCUIT_COOLDOWN_SECONDS:
|
|
84
|
+
# Still failing after cooldown — remind admins once per hour
|
|
85
|
+
circuit["last_alerted_at"] = now
|
|
86
|
+
open_mins = int((now - circuit["opened_at"]) / 60)
|
|
87
|
+
msg = (
|
|
88
|
+
f":warning: *Sentinel — Claude usage/auth problem still active ({source})*\n"
|
|
89
|
+
f"Still failing after {open_mins} minutes. Total occurrences: {circuit['count']}.\n"
|
|
90
|
+
f"Last error:\n```{output.strip()[:300]}```\n"
|
|
91
|
+
f"Run `check_auth_status` in Slack to see the full picture."
|
|
92
|
+
)
|
|
93
|
+
logger.error("Circuit still open for %s (count=%d)", source, circuit["count"])
|
|
94
|
+
slack_alert(bot_token, channel, msg)
|
|
95
|
+
# else: within cooldown window — suppress
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _close_if_open(bot_token: str, channel: str, source: str) -> None:
|
|
99
|
+
"""If circuit was open, close it and post a recovery alert."""
|
|
100
|
+
circuit = _circuits.pop(source, None)
|
|
101
|
+
if circuit is None:
|
|
102
|
+
return
|
|
103
|
+
duration_mins = int((time.time() - circuit["opened_at"]) / 60)
|
|
104
|
+
msg = (
|
|
105
|
+
f":white_check_mark: *Sentinel — Claude auth restored ({source})*\n"
|
|
106
|
+
f"Fixed after {duration_mins} min. Total failures during outage: {circuit['count']}."
|
|
107
|
+
)
|
|
108
|
+
logger.info("Circuit closed for %s after %d min, %d failures", source, duration_mins, circuit["count"])
|
|
109
|
+
slack_alert(bot_token, channel, msg)
|
|
110
|
+
|
|
111
|
+
|
|
31
112
|
def rate_limit_message(source: str, raw: str) -> str:
|
|
32
|
-
"""Produce a human-readable Slack alert for a rate-limit event."""
|
|
113
|
+
"""Produce a human-readable Slack alert for a rate-limit / auth event (first occurrence)."""
|
|
33
114
|
snippet = raw.strip()[:300].replace("\n", " ")
|
|
34
115
|
return (
|
|
35
116
|
f":warning: *Sentinel — Claude usage/auth problem ({source})*\n"
|
|
@@ -37,9 +118,10 @@ def rate_limit_message(source: str, raw: str) -> str:
|
|
|
37
118
|
f"```{snippet}```\n"
|
|
38
119
|
f"*What to check:*\n"
|
|
39
120
|
f"• API key: verify `ANTHROPIC_API_KEY` in `sentinel.properties` is valid and has credit\n"
|
|
40
|
-
f"• Claude Pro: run `claude login` on the server to refresh OAuth\n"
|
|
41
|
-
f"• Both: at least one
|
|
42
|
-
f"
|
|
121
|
+
f"• Claude Pro: run `claude login` on the server to refresh the OAuth session\n"
|
|
122
|
+
f"• Both: Sentinel tries both methods — at least one must be working\n"
|
|
123
|
+
f"Repeat alerts will be suppressed for 1 hour. "
|
|
124
|
+
f"Run `check_auth_status` in Slack to see current state."
|
|
43
125
|
)
|
|
44
126
|
|
|
45
127
|
|
|
@@ -77,12 +159,15 @@ def alert_if_rate_limited(
|
|
|
77
159
|
output: str,
|
|
78
160
|
) -> bool:
|
|
79
161
|
"""
|
|
80
|
-
Check output for rate-limit / auth signals.
|
|
81
|
-
|
|
162
|
+
Check output for rate-limit / auth signals and manage the circuit breaker.
|
|
163
|
+
|
|
164
|
+
- Rate limited → open/keep-open circuit, alert (with cooldown suppression)
|
|
165
|
+
- Not limited → close circuit if it was open (recovery alert), return False
|
|
166
|
+
|
|
167
|
+
Returns True if a rate-limit signal was found.
|
|
82
168
|
"""
|
|
83
169
|
if not is_rate_limited(output):
|
|
170
|
+
_close_if_open(bot_token, channel, source)
|
|
84
171
|
return False
|
|
85
|
-
|
|
86
|
-
logger.error("Claude rate-limit/auth failure in %s: %s", source, output[:200])
|
|
87
|
-
slack_alert(bot_token, channel, msg)
|
|
172
|
+
_open_or_repeat(bot_token, channel, source, output)
|
|
88
173
|
return True
|
|
@@ -137,6 +137,7 @@ reply with a short summary grouped by category:
|
|
|
137
137
|
*Fix management*
|
|
138
138
|
• `get_fix_details` — full details of a specific fix — "show fix abc123"
|
|
139
139
|
• `list_pending_prs` — all open Sentinel PRs awaiting review — "list open PRs"
|
|
140
|
+
• `check_auth_status` — Claude auth health, rate-limit circuit state, fix engine 24 h stats — "is Claude working?", "any rate limits?", "auth issues?"
|
|
140
141
|
|
|
141
142
|
*Project & task delivery*
|
|
142
143
|
• `list_projects` — all projects and repos Sentinel manages — "what projects do you manage?"
|
|
@@ -275,6 +276,16 @@ _TOOLS = [
|
|
|
275
276
|
"description": "List all open Sentinel PRs awaiting admin review.",
|
|
276
277
|
"input_schema": {"type": "object", "properties": {}},
|
|
277
278
|
},
|
|
279
|
+
{
|
|
280
|
+
"name": "check_auth_status",
|
|
281
|
+
"description": (
|
|
282
|
+
"Check Claude authentication health, current rate-limit / usage-limit circuit state, "
|
|
283
|
+
"and fix engine stats for the last 24 h. "
|
|
284
|
+
"Use when someone asks: 'is Claude working?', 'any rate limits?', 'why aren't fixes running?', "
|
|
285
|
+
"'is the API key OK?', 'auth issues?', 'fix engine status'."
|
|
286
|
+
),
|
|
287
|
+
"input_schema": {"type": "object", "properties": {}},
|
|
288
|
+
},
|
|
278
289
|
{
|
|
279
290
|
"name": "pause_sentinel",
|
|
280
291
|
"description": (
|
|
@@ -798,6 +809,67 @@ async def _run_tool(name: str, inputs: dict, cfg_loader, store, slack_client=Non
|
|
|
798
809
|
"sentinel_paused": Path("SENTINEL_PAUSE").exists(),
|
|
799
810
|
})
|
|
800
811
|
|
|
812
|
+
if name == "check_auth_status":
|
|
813
|
+
import subprocess as _sp
|
|
814
|
+
from .notify import get_circuit_status
|
|
815
|
+
cfg = cfg_loader.sentinel
|
|
816
|
+
|
|
817
|
+
# Auth configuration
|
|
818
|
+
has_key = bool(cfg.anthropic_api_key)
|
|
819
|
+
pro_for_tasks = cfg.claude_pro_for_tasks
|
|
820
|
+
if pro_for_tasks and has_key:
|
|
821
|
+
primary, fallback = "claude_pro_oauth", "api_key"
|
|
822
|
+
elif pro_for_tasks:
|
|
823
|
+
primary, fallback = "claude_pro_oauth", None
|
|
824
|
+
else:
|
|
825
|
+
primary, fallback = "api_key", "claude_pro_oauth" if not has_key else "claude_pro_oauth"
|
|
826
|
+
|
|
827
|
+
# Claude CLI liveness check
|
|
828
|
+
cli_ok, cli_version = False, ""
|
|
829
|
+
try:
|
|
830
|
+
r = _sp.run(
|
|
831
|
+
[cfg.claude_code_bin, "--version"],
|
|
832
|
+
capture_output=True, text=True, timeout=10,
|
|
833
|
+
)
|
|
834
|
+
if r.returncode == 0:
|
|
835
|
+
cli_ok = True
|
|
836
|
+
cli_version = r.stdout.strip() or r.stderr.strip()
|
|
837
|
+
except Exception:
|
|
838
|
+
pass
|
|
839
|
+
|
|
840
|
+
# Circuit breaker snapshot — only open (unhealthy) circuits appear here
|
|
841
|
+
circuits = get_circuit_status()
|
|
842
|
+
|
|
843
|
+
# Fix engine stats (last 24 h)
|
|
844
|
+
recent = store.get_recent_fixes(hours=24)
|
|
845
|
+
counts = {"applied": 0, "failed": 0, "skipped": 0, "pending": 0}
|
|
846
|
+
last_success = None
|
|
847
|
+
for f in recent:
|
|
848
|
+
s = f.get("status", "")
|
|
849
|
+
if s in counts:
|
|
850
|
+
counts[s] += 1
|
|
851
|
+
if s == "applied" and not last_success:
|
|
852
|
+
last_success = f.get("timestamp", "")
|
|
853
|
+
|
|
854
|
+
overall = "healthy"
|
|
855
|
+
if circuits:
|
|
856
|
+
overall = "degraded — rate/auth limit active on: " + ", ".join(circuits)
|
|
857
|
+
elif not cli_ok:
|
|
858
|
+
overall = "warning — claude CLI not reachable"
|
|
859
|
+
|
|
860
|
+
return json.dumps({
|
|
861
|
+
"overall": overall,
|
|
862
|
+
"auth": {
|
|
863
|
+
"api_key_configured": has_key,
|
|
864
|
+
"claude_pro_for_tasks": pro_for_tasks,
|
|
865
|
+
"primary_method": primary,
|
|
866
|
+
"fallback_method": fallback,
|
|
867
|
+
},
|
|
868
|
+
"claude_cli": {"available": cli_ok, "version": cli_version},
|
|
869
|
+
"rate_limit_circuits": circuits,
|
|
870
|
+
"fix_engine_24h": {**counts, "last_successful_fix": last_success},
|
|
871
|
+
})
|
|
872
|
+
|
|
801
873
|
if name == "create_issue":
|
|
802
874
|
description = inputs["description"]
|
|
803
875
|
target_repo = inputs.get("target_repo", "")
|
|
@@ -1938,7 +2010,7 @@ async def handle_message(
|
|
|
1938
2010
|
from .notify import rate_limit_message
|
|
1939
2011
|
alert_if_rate_limited(cfg.slack_bot_token, cfg.slack_channel,
|
|
1940
2012
|
"sentinel_boss/api", err_str)
|
|
1941
|
-
logger.warning("Boss: API key path failed (%s), trying CLI fallback", err_str
|
|
2013
|
+
logger.warning("Boss: API key path failed (%s), trying CLI fallback", err_str)
|
|
1942
2014
|
|
|
1943
2015
|
# 2nd priority: Claude Pro / OAuth via CLI (limited tools but no API key needed)
|
|
1944
2016
|
cli_reply, cli_done = await _handle_with_cli(
|