@misterhuydo/sentinel 1.2.6 → 1.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.cairn/.hint-lock +1 -1
- package/.cairn/session.json +2 -2
- package/package.json +1 -1
- package/python/sentinel/fix_engine.py +23 -5
- package/python/sentinel/issue_watcher.py +8 -0
- package/python/sentinel/main.py +34 -30
- package/python/sentinel/notify.py +249 -173
- package/python/sentinel/sentinel_boss.py +151 -5
package/.cairn/.hint-lock
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
2026-03-23T11:
|
|
1
|
+
2026-03-23T11:43:23.881Z
|
package/.cairn/session.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
|
-
"message": "Auto-checkpoint at 2026-03-23T11:
|
|
3
|
-
"checkpoint_at": "2026-03-23T11:
|
|
2
|
+
"message": "Auto-checkpoint at 2026-03-23T11:46:43.946Z",
|
|
3
|
+
"checkpoint_at": "2026-03-23T11:46:43.948Z",
|
|
4
4
|
"active_files": [],
|
|
5
5
|
"notes": [],
|
|
6
6
|
"mtime_snapshot": {}
|
package/package.json
CHANGED
package/python/sentinel/fix_engine.py
CHANGED
|
@@ -80,9 +80,22 @@ def _build_prompt(event, repo: RepoConfig, log_file, marker: str, stale_markers:
|
|
|
80
80
|
f"1. {step1}",
|
|
81
81
|
"2. Use your available tools to explore the codebase and identify the root cause.",
|
|
82
82
|
f"3. {marker_instruction}",
|
|
83
|
-
"4.
|
|
84
|
-
"
|
|
85
|
-
"
|
|
83
|
+
"4. Consider all possible fix approaches. For each, weigh:",
|
|
84
|
+
" - Confidence: is this definitely the root cause?",
|
|
85
|
+
" - Safety: could this break other functionality?",
|
|
86
|
+
" - Scope: is it minimal and targeted?",
|
|
87
|
+
" Choose the safest minimal approach. If multiple valid options exist, pick the one",
|
|
88
|
+
" with highest confidence and lowest blast radius.",
|
|
89
|
+
"5. Output ONLY a unified diff patch (git diff format) for the chosen fix.",
|
|
90
|
+
"6. Do not explain. Output only the patch.",
|
|
91
|
+
"7. Only if you truly cannot produce a safe fix — e.g. the root cause requires a",
|
|
92
|
+
" DB schema change, infrastructure update, business logic decision, or is inside",
|
|
93
|
+
" a third-party library — output exactly:",
|
|
94
|
+
" NEEDS_HUMAN: <explanation>",
|
|
95
|
+
" Include: (a) root cause identified, (b) approaches you considered and why each",
|
|
96
|
+
" was insufficient or unsafe, (c) exactly what a human needs to do or decide.",
|
|
97
|
+
" Do NOT output NEEDS_HUMAN just because the fix is complex — only when human",
|
|
98
|
+
" judgement or access is genuinely required.",
|
|
86
99
|
]
|
|
87
100
|
return "\n".join(lines_out)
|
|
88
101
|
|
|
@@ -148,7 +161,7 @@ def generate_fix(
|
|
|
148
161
|
|
|
149
162
|
Returns:
|
|
150
163
|
(status, patch_path, marker)
|
|
151
|
-
status: "patch" | "skip" | "error"
|
|
164
|
+
status: "patch" | "skip" | "needs_human" | "error"
|
|
152
165
|
|
|
153
166
|
Auth strategy — API key and Claude Pro (OAuth) are interchangeable:
|
|
154
167
|
Primary : Claude Pro (OAuth) if claude_pro_for_tasks=True, else API key
|
|
@@ -237,10 +250,15 @@ def generate_fix(
|
|
|
237
250
|
output=output,
|
|
238
251
|
)
|
|
239
252
|
|
|
253
|
+
if output.strip().upper().startswith("NEEDS_HUMAN:"):
|
|
254
|
+
reason = output.strip()[len("NEEDS_HUMAN:"):].strip()
|
|
255
|
+
logger.info("Claude needs human for %s: %s", event.fingerprint, reason[:200])
|
|
256
|
+
return "needs_human", None, reason
|
|
257
|
+
|
|
240
258
|
if output.strip().upper().startswith("SKIP:"):
|
|
241
259
|
reason = output.strip()[5:].strip()
|
|
242
260
|
logger.info("Claude skipped fix for %s: %s", event.fingerprint, reason)
|
|
243
|
-
return "skip", None,
|
|
261
|
+
return "skip", None, reason
|
|
244
262
|
|
|
245
263
|
patch = _extract_patch(output)
|
|
246
264
|
if not patch:
|
|
package/python/sentinel/issue_watcher.py
CHANGED
|
@@ -41,6 +41,7 @@ class IssueEvent:
|
|
|
41
41
|
fingerprint: str = ""
|
|
42
42
|
severity: str = "ERROR"
|
|
43
43
|
timestamp: str = ""
|
|
44
|
+
submitter_user_id: str = "" # Slack user ID who raised this via Boss, if known
|
|
44
45
|
|
|
45
46
|
# Compatibility fields matching ErrorEvent interface
|
|
46
47
|
level: str = "ERROR"
|
|
@@ -53,6 +54,13 @@ class IssueEvent:
|
|
|
53
54
|
if not self.fingerprint:
|
|
54
55
|
raw = f"issue:{self.source}:{self.message[:200]}"
|
|
55
56
|
self.fingerprint = hashlib.sha1(raw.encode()).hexdigest()[:16]
|
|
57
|
+
if not self.submitter_user_id:
|
|
58
|
+
import re as _re
|
|
59
|
+
for _line in self.body.splitlines():
|
|
60
|
+
_m = _re.match(r'SUBMITTED_BY:.*\(([UW][A-Z0-9]+)\)', _line.strip())
|
|
61
|
+
if _m:
|
|
62
|
+
self.submitter_user_id = _m.group(1)
|
|
63
|
+
break
|
|
56
64
|
if not self.timestamp:
|
|
57
65
|
self.timestamp = datetime.now(timezone.utc).isoformat()
|
|
58
66
|
if not self.stack_trace:
|
package/python/sentinel/main.py
CHANGED
|
@@ -28,6 +28,7 @@ from .log_parser import parse_all, scan_all_for_markers, ErrorEvent
|
|
|
28
28
|
from .issue_watcher import scan_issues, mark_done, IssueEvent
|
|
29
29
|
from .repo_router import route
|
|
30
30
|
from .reporter import build_and_send, send_fix_notification, send_failure_notification, send_confirmed_notification, send_regression_notification, send_startup_notification, send_upgrade_notification
|
|
31
|
+
from .notify import notify_fix_blocked
|
|
31
32
|
from .health_checker import evaluate_repos
|
|
32
33
|
from .state_store import StateStore
|
|
33
34
|
|
|
@@ -87,27 +88,29 @@ async def _handle_error(event: ErrorEvent, cfg_loader: ConfigLoader, store: Stat
|
|
|
87
88
|
status, patch_path, marker = generate_fix(event, repo, sentinel, patches_dir, store)
|
|
88
89
|
|
|
89
90
|
if status != "patch" or patch_path is None:
|
|
90
|
-
outcome = "skipped" if status
|
|
91
|
+
outcome = "skipped" if status in ("skip", "needs_human") else "failed"
|
|
91
92
|
store.record_fix(event.fingerprint, outcome, repo_name=repo.repo_name)
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
93
|
+
submitter_uid = getattr(event, "submitter_user_id", "")
|
|
94
|
+
if status == "needs_human":
|
|
95
|
+
# marker holds the reason string for needs_human
|
|
96
|
+
notify_fix_blocked(sentinel, event.source, event.message,
|
|
97
|
+
reason=marker, repo_name=repo.repo_name,
|
|
98
|
+
submitter_user_id=submitter_uid)
|
|
99
|
+
else:
|
|
100
|
+
notify_fix_blocked(sentinel, event.source, event.message,
|
|
101
|
+
reason=f"Claude Code returned {status.upper()}",
|
|
102
|
+
repo_name=repo.repo_name,
|
|
103
|
+
submitter_user_id=submitter_uid)
|
|
99
104
|
return
|
|
100
105
|
|
|
101
106
|
commit_status, commit_hash = apply_and_commit(event, patch_path, repo, sentinel)
|
|
102
107
|
if commit_status != "committed":
|
|
103
108
|
store.record_fix(event.fingerprint, "failed", repo_name=repo.repo_name)
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
"body": event.full_text()[:500],
|
|
110
|
-
})
|
|
109
|
+
submitter_uid = getattr(event, "submitter_user_id", "")
|
|
110
|
+
notify_fix_blocked(sentinel, event.source, event.message,
|
|
111
|
+
reason="Patch was generated but commit/tests failed",
|
|
112
|
+
repo_name=repo.repo_name,
|
|
113
|
+
submitter_user_id=submitter_uid)
|
|
111
114
|
return
|
|
112
115
|
|
|
113
116
|
branch, pr_url = publish(event, repo, sentinel, commit_hash)
|
|
@@ -179,28 +182,29 @@ async def _handle_issue(event: IssueEvent, cfg_loader: ConfigLoader, store: Stat
|
|
|
179
182
|
status, patch_path, marker = generate_fix(event, repo, sentinel, patches_dir, store)
|
|
180
183
|
|
|
181
184
|
if status != "patch" or patch_path is None:
|
|
182
|
-
store.record_fix(event.fingerprint, "skipped" if status
|
|
185
|
+
store.record_fix(event.fingerprint, "skipped" if status in ("skip", "needs_human") else "failed",
|
|
183
186
|
repo_name=repo.repo_name)
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
187
|
+
submitter_uid = getattr(event, "submitter_user_id", "")
|
|
188
|
+
if status == "needs_human":
|
|
189
|
+
notify_fix_blocked(sentinel, event.source, event.message,
|
|
190
|
+
reason=marker, repo_name=repo.repo_name,
|
|
191
|
+
submitter_user_id=submitter_uid)
|
|
192
|
+
else:
|
|
193
|
+
notify_fix_blocked(sentinel, event.source, event.message,
|
|
194
|
+
reason=f"Claude Code returned {status.upper()}",
|
|
195
|
+
repo_name=repo.repo_name,
|
|
196
|
+
submitter_user_id=submitter_uid)
|
|
191
197
|
mark_done(event.issue_file)
|
|
192
198
|
return
|
|
193
199
|
|
|
194
200
|
commit_status, commit_hash = apply_and_commit(event, patch_path, repo, sentinel)
|
|
195
201
|
if commit_status != "committed":
|
|
196
202
|
store.record_fix(event.fingerprint, "failed", repo_name=repo.repo_name)
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
"body": event.body[:500],
|
|
203
|
-
})
|
|
203
|
+
submitter_uid = getattr(event, "submitter_user_id", "")
|
|
204
|
+
notify_fix_blocked(sentinel, event.source, event.message,
|
|
205
|
+
reason="Patch was generated but commit/tests failed",
|
|
206
|
+
repo_name=repo.repo_name,
|
|
207
|
+
submitter_user_id=submitter_uid)
|
|
204
208
|
mark_done(event.issue_file)
|
|
205
209
|
return
|
|
206
210
|
|
|
package/python/sentinel/notify.py
CHANGED
|
@@ -1,173 +1,249 @@
|
|
|
1
|
-
"""
|
|
2
|
-
notify.py — Best-effort Slack alerts from any Sentinel module.
|
|
3
|
-
|
|
4
|
-
Uses the Slack Web API directly (no Bolt / Socket Mode required).
|
|
5
|
-
Calls never raise — failures are logged and silently dropped.
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
import logging
|
|
9
|
-
import re
|
|
10
|
-
import time
|
|
11
|
-
|
|
12
|
-
import requests
|
|
13
|
-
|
|
14
|
-
logger = logging.getLogger(__name__)
|
|
15
|
-
|
|
16
|
-
# ── Rate-limit / auth-failure detector ────────────────────────────────────────
|
|
17
|
-
|
|
18
|
-
_RATE_LIMIT_RE = re.compile(
|
|
19
|
-
r"rate.?limit|usage.?limit|too many requests|quota.?exceeded"
|
|
20
|
-
r"|overloaded|credit.?balance|billing|529"
|
|
21
|
-
r"|not.?authenticated|invalid.?api.?key|authentication.?fail"
|
|
22
|
-
r"|claude\.ai subscription|pro.?plan|login required",
|
|
23
|
-
re.IGNORECASE,
|
|
24
|
-
)
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
def is_rate_limited(text: str) -> bool:
|
|
28
|
-
"""Return True if the text contains a rate-limit or auth-failure signal."""
|
|
29
|
-
return bool(_RATE_LIMIT_RE.search(text))
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
# ── Circuit breaker ────────────────────────────────────────────────────────────
|
|
33
|
-
#
|
|
34
|
-
# Prevents alert storms when Claude is persistently rate-limited.
|
|
35
|
-
# Each `source` string gets its own independent circuit:
|
|
36
|
-
# CLOSED → normal; alerts pass through immediately
|
|
37
|
-
# OPEN → suppressed; one re-alert every CIRCUIT_COOLDOWN_SECONDS
|
|
38
|
-
#
|
|
39
|
-
# On recovery (first non-rate-limited output after OPEN):
|
|
40
|
-
# → post "resolved" to Slack, close the circuit
|
|
41
|
-
|
|
42
|
-
CIRCUIT_COOLDOWN_SECONDS = 3600 # 1 h between repeat alerts while open
|
|
43
|
-
|
|
44
|
-
# source → {opened_at, last_alerted_at, count}
|
|
45
|
-
_circuits: dict[str, dict] = {}
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
def get_circuit_status() -> dict:
|
|
49
|
-
"""
|
|
50
|
-
Return a snapshot of all open circuits.
|
|
51
|
-
Used by the `check_auth_status` Boss tool.
|
|
52
|
-
|
|
53
|
-
Returns:
|
|
54
|
-
{ source: { state, opened_at, open_for_seconds, alert_count } }
|
|
55
|
-
Only open circuits are included; an empty dict means everything is healthy.
|
|
56
|
-
"""
|
|
57
|
-
now = time.time()
|
|
58
|
-
return {
|
|
59
|
-
src: {
|
|
60
|
-
"state": "open",
|
|
61
|
-
"opened_at": c["opened_at"],
|
|
62
|
-
"open_for_seconds": int(now - c["opened_at"]),
|
|
63
|
-
"alert_count": c["count"],
|
|
64
|
-
}
|
|
65
|
-
for src, c in _circuits.items()
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
def _open_or_repeat(bot_token: str, channel: str, source: str, output: str) -> None:
|
|
70
|
-
"""Open circuit on first hit; re-alert after cooldown if still failing."""
|
|
71
|
-
now = time.time()
|
|
72
|
-
circuit = _circuits.get(source)
|
|
73
|
-
|
|
74
|
-
if circuit is None:
|
|
75
|
-
# First occurrence — open and alert immediately
|
|
76
|
-
_circuits[source] = {"opened_at": now, "last_alerted_at": now, "count": 1}
|
|
77
|
-
logger.error("Circuit opened for %s: %s", source, output[:200])
|
|
78
|
-
slack_alert(bot_token, channel, rate_limit_message(source, output))
|
|
79
|
-
return
|
|
80
|
-
|
|
81
|
-
circuit["count"] += 1
|
|
82
|
-
elapsed = now - circuit["last_alerted_at"]
|
|
83
|
-
if elapsed >= CIRCUIT_COOLDOWN_SECONDS:
|
|
84
|
-
# Still failing after cooldown — remind admins once per hour
|
|
85
|
-
circuit["last_alerted_at"] = now
|
|
86
|
-
open_mins = int((now - circuit["opened_at"]) / 60)
|
|
87
|
-
msg = (
|
|
88
|
-
f":warning: *Sentinel — Claude usage/auth problem still active ({source})*\n"
|
|
89
|
-
f"Still failing after {open_mins} minutes. Total occurrences: {circuit['count']}.\n"
|
|
90
|
-
f"Last error:\n```{output.strip()[:300]}```\n"
|
|
91
|
-
f"Run `check_auth_status` in Slack to see the full picture."
|
|
92
|
-
)
|
|
93
|
-
logger.error("Circuit still open for %s (count=%d)", source, circuit["count"])
|
|
94
|
-
slack_alert(bot_token, channel, msg)
|
|
95
|
-
# else: within cooldown window — suppress
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
def _close_if_open(bot_token: str, channel: str, source: str) -> None:
|
|
99
|
-
"""If circuit was open, close it and post a recovery alert."""
|
|
100
|
-
circuit = _circuits.pop(source, None)
|
|
101
|
-
if circuit is None:
|
|
102
|
-
return
|
|
103
|
-
duration_mins = int((time.time() - circuit["opened_at"]) / 60)
|
|
104
|
-
msg = (
|
|
105
|
-
f":white_check_mark: *Sentinel — Claude auth restored ({source})*\n"
|
|
106
|
-
f"Fixed after {duration_mins} min. Total failures during outage: {circuit['count']}."
|
|
107
|
-
)
|
|
108
|
-
logger.info("Circuit closed for %s after %d min, %d failures", source, duration_mins, circuit["count"])
|
|
109
|
-
slack_alert(bot_token, channel, msg)
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
def rate_limit_message(source: str, raw: str) -> str:
|
|
113
|
-
"""Produce a human-readable Slack alert for a rate-limit / auth event (first occurrence)."""
|
|
114
|
-
snippet = raw.strip()[:300].replace("\n", " ")
|
|
115
|
-
return (
|
|
116
|
-
f":warning: *Sentinel — Claude usage/auth problem ({source})*\n"
|
|
117
|
-
f"Claude returned an error that requires admin attention:\n"
|
|
118
|
-
f"```{snippet}```\n"
|
|
119
|
-
f"*What to check:*\n"
|
|
120
|
-
f"• API key: verify `ANTHROPIC_API_KEY` in `sentinel.properties` is valid and has credit\n"
|
|
121
|
-
f"• Claude Pro: run `claude login` on the server to refresh the OAuth session\n"
|
|
122
|
-
f"• Both: Sentinel tries both methods — at least one must be working\n"
|
|
123
|
-
f"Repeat alerts will be suppressed for 1 hour. "
|
|
124
|
-
f"Run `check_auth_status` in Slack to see current state."
|
|
125
|
-
)
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
# ── Alert dispatcher ──────────────────────────────────────────────────────────
|
|
129
|
-
|
|
130
|
-
def slack_alert(bot_token: str, channel: str, text: str) -> None:
|
|
131
|
-
"""
|
|
132
|
-
Post a plain-text alert to a Slack channel.
|
|
133
|
-
Best-effort: logs on failure, never raises.
|
|
134
|
-
"""
|
|
135
|
-
if not bot_token or not channel:
|
|
136
|
-
logger.debug("slack_alert: no token/channel configured — logging only: %s", text[:120])
|
|
137
|
-
return
|
|
138
|
-
try:
|
|
139
|
-
resp = requests.post(
|
|
140
|
-
"https://slack.com/api/chat.postMessage",
|
|
141
|
-
headers={
|
|
142
|
-
"Authorization": f"Bearer {bot_token}",
|
|
143
|
-
"Content-Type": "application/json",
|
|
144
|
-
},
|
|
145
|
-
json={"channel": channel, "text": text},
|
|
146
|
-
timeout=10,
|
|
147
|
-
)
|
|
148
|
-
data = resp.json()
|
|
149
|
-
if not data.get("ok"):
|
|
150
|
-
logger.warning("slack_alert: Slack API error: %s", data.get("error"))
|
|
151
|
-
except Exception as exc:
|
|
152
|
-
logger.warning("slack_alert: failed to post: %s", exc)
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
"""
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
1
|
+
"""
|
|
2
|
+
notify.py — Best-effort Slack alerts from any Sentinel module.
|
|
3
|
+
|
|
4
|
+
Uses the Slack Web API directly (no Bolt / Socket Mode required).
|
|
5
|
+
Calls never raise — failures are logged and silently dropped.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
import re
|
|
10
|
+
import time
|
|
11
|
+
|
|
12
|
+
import requests
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
# ── Rate-limit / auth-failure detector ────────────────────────────────────────
|
|
17
|
+
|
|
18
|
+
_RATE_LIMIT_RE = re.compile(
|
|
19
|
+
r"rate.?limit|usage.?limit|too many requests|quota.?exceeded"
|
|
20
|
+
r"|overloaded|credit.?balance|billing|529"
|
|
21
|
+
r"|not.?authenticated|invalid.?api.?key|authentication.?fail"
|
|
22
|
+
r"|claude\.ai subscription|pro.?plan|login required",
|
|
23
|
+
re.IGNORECASE,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def is_rate_limited(text: str) -> bool:
|
|
28
|
+
"""Return True if the text contains a rate-limit or auth-failure signal."""
|
|
29
|
+
return bool(_RATE_LIMIT_RE.search(text))
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# ── Circuit breaker ────────────────────────────────────────────────────────────
|
|
33
|
+
#
|
|
34
|
+
# Prevents alert storms when Claude is persistently rate-limited.
|
|
35
|
+
# Each `source` string gets its own independent circuit:
|
|
36
|
+
# CLOSED → normal; alerts pass through immediately
|
|
37
|
+
# OPEN → suppressed; one re-alert every CIRCUIT_COOLDOWN_SECONDS
|
|
38
|
+
#
|
|
39
|
+
# On recovery (first non-rate-limited output after OPEN):
|
|
40
|
+
# → post "resolved" to Slack, close the circuit
|
|
41
|
+
|
|
42
|
+
CIRCUIT_COOLDOWN_SECONDS = 3600 # 1 h between repeat alerts while open
|
|
43
|
+
|
|
44
|
+
# source → {opened_at, last_alerted_at, count}
|
|
45
|
+
_circuits: dict[str, dict] = {}
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def get_circuit_status() -> dict:
|
|
49
|
+
"""
|
|
50
|
+
Return a snapshot of all open circuits.
|
|
51
|
+
Used by the `check_auth_status` Boss tool.
|
|
52
|
+
|
|
53
|
+
Returns:
|
|
54
|
+
{ source: { state, opened_at, open_for_seconds, alert_count } }
|
|
55
|
+
Only open circuits are included; an empty dict means everything is healthy.
|
|
56
|
+
"""
|
|
57
|
+
now = time.time()
|
|
58
|
+
return {
|
|
59
|
+
src: {
|
|
60
|
+
"state": "open",
|
|
61
|
+
"opened_at": c["opened_at"],
|
|
62
|
+
"open_for_seconds": int(now - c["opened_at"]),
|
|
63
|
+
"alert_count": c["count"],
|
|
64
|
+
}
|
|
65
|
+
for src, c in _circuits.items()
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _open_or_repeat(bot_token: str, channel: str, source: str, output: str) -> None:
|
|
70
|
+
"""Open circuit on first hit; re-alert after cooldown if still failing."""
|
|
71
|
+
now = time.time()
|
|
72
|
+
circuit = _circuits.get(source)
|
|
73
|
+
|
|
74
|
+
if circuit is None:
|
|
75
|
+
# First occurrence — open and alert immediately
|
|
76
|
+
_circuits[source] = {"opened_at": now, "last_alerted_at": now, "count": 1}
|
|
77
|
+
logger.error("Circuit opened for %s: %s", source, output[:200])
|
|
78
|
+
slack_alert(bot_token, channel, rate_limit_message(source, output))
|
|
79
|
+
return
|
|
80
|
+
|
|
81
|
+
circuit["count"] += 1
|
|
82
|
+
elapsed = now - circuit["last_alerted_at"]
|
|
83
|
+
if elapsed >= CIRCUIT_COOLDOWN_SECONDS:
|
|
84
|
+
# Still failing after cooldown — remind admins once per hour
|
|
85
|
+
circuit["last_alerted_at"] = now
|
|
86
|
+
open_mins = int((now - circuit["opened_at"]) / 60)
|
|
87
|
+
msg = (
|
|
88
|
+
f":warning: *Sentinel — Claude usage/auth problem still active ({source})*\n"
|
|
89
|
+
f"Still failing after {open_mins} minutes. Total occurrences: {circuit['count']}.\n"
|
|
90
|
+
f"Last error:\n```{output.strip()[:300]}```\n"
|
|
91
|
+
f"Run `check_auth_status` in Slack to see the full picture."
|
|
92
|
+
)
|
|
93
|
+
logger.error("Circuit still open for %s (count=%d)", source, circuit["count"])
|
|
94
|
+
slack_alert(bot_token, channel, msg)
|
|
95
|
+
# else: within cooldown window — suppress
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _close_if_open(bot_token: str, channel: str, source: str) -> None:
|
|
99
|
+
"""If circuit was open, close it and post a recovery alert."""
|
|
100
|
+
circuit = _circuits.pop(source, None)
|
|
101
|
+
if circuit is None:
|
|
102
|
+
return
|
|
103
|
+
duration_mins = int((time.time() - circuit["opened_at"]) / 60)
|
|
104
|
+
msg = (
|
|
105
|
+
f":white_check_mark: *Sentinel — Claude auth restored ({source})*\n"
|
|
106
|
+
f"Fixed after {duration_mins} min. Total failures during outage: {circuit['count']}."
|
|
107
|
+
)
|
|
108
|
+
logger.info("Circuit closed for %s after %d min, %d failures", source, duration_mins, circuit["count"])
|
|
109
|
+
slack_alert(bot_token, channel, msg)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def rate_limit_message(source: str, raw: str) -> str:
|
|
113
|
+
"""Produce a human-readable Slack alert for a rate-limit / auth event (first occurrence)."""
|
|
114
|
+
snippet = raw.strip()[:300].replace("\n", " ")
|
|
115
|
+
return (
|
|
116
|
+
f":warning: *Sentinel — Claude usage/auth problem ({source})*\n"
|
|
117
|
+
f"Claude returned an error that requires admin attention:\n"
|
|
118
|
+
f"```{snippet}```\n"
|
|
119
|
+
f"*What to check:*\n"
|
|
120
|
+
f"• API key: verify `ANTHROPIC_API_KEY` in `sentinel.properties` is valid and has credit\n"
|
|
121
|
+
f"• Claude Pro: run `claude login` on the server to refresh the OAuth session\n"
|
|
122
|
+
f"• Both: Sentinel tries both methods — at least one must be working\n"
|
|
123
|
+
f"Repeat alerts will be suppressed for 1 hour. "
|
|
124
|
+
f"Run `check_auth_status` in Slack to see current state."
|
|
125
|
+
)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
# ── Alert dispatcher ──────────────────────────────────────────────────────────
|
|
129
|
+
|
|
130
|
+
def slack_alert(bot_token: str, channel: str, text: str) -> None:
|
|
131
|
+
"""
|
|
132
|
+
Post a plain-text alert to a Slack channel.
|
|
133
|
+
Best-effort: logs on failure, never raises.
|
|
134
|
+
"""
|
|
135
|
+
if not bot_token or not channel:
|
|
136
|
+
logger.debug("slack_alert: no token/channel configured — logging only: %s", text[:120])
|
|
137
|
+
return
|
|
138
|
+
try:
|
|
139
|
+
resp = requests.post(
|
|
140
|
+
"https://slack.com/api/chat.postMessage",
|
|
141
|
+
headers={
|
|
142
|
+
"Authorization": f"Bearer {bot_token}",
|
|
143
|
+
"Content-Type": "application/json",
|
|
144
|
+
},
|
|
145
|
+
json={"channel": channel, "text": text},
|
|
146
|
+
timeout=10,
|
|
147
|
+
)
|
|
148
|
+
data = resp.json()
|
|
149
|
+
if not data.get("ok"):
|
|
150
|
+
logger.warning("slack_alert: Slack API error: %s", data.get("error"))
|
|
151
|
+
except Exception as exc:
|
|
152
|
+
logger.warning("slack_alert: failed to post: %s", exc)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def slack_dm(bot_token: str, user_id: str, text: str) -> None:
|
|
157
|
+
"""
|
|
158
|
+
Send a direct message to a specific Slack user.
|
|
159
|
+
Opens a DM channel via conversations.open, then posts.
|
|
160
|
+
Best-effort: logs on failure, never raises.
|
|
161
|
+
"""
|
|
162
|
+
if not bot_token or not user_id:
|
|
163
|
+
logger.debug("slack_dm: no token/user_id — skipping DM")
|
|
164
|
+
return
|
|
165
|
+
try:
|
|
166
|
+
resp = requests.post(
|
|
167
|
+
"https://slack.com/api/conversations.open",
|
|
168
|
+
headers={"Authorization": f"Bearer {bot_token}", "Content-Type": "application/json"},
|
|
169
|
+
json={"users": user_id},
|
|
170
|
+
timeout=10,
|
|
171
|
+
)
|
|
172
|
+
data = resp.json()
|
|
173
|
+
if not data.get("ok"):
|
|
174
|
+
logger.warning("slack_dm: conversations.open failed: %s", data.get("error"))
|
|
175
|
+
return
|
|
176
|
+
dm_channel = data["channel"]["id"]
|
|
177
|
+
slack_alert(bot_token, dm_channel, text)
|
|
178
|
+
except Exception as exc:
|
|
179
|
+
logger.warning("slack_dm: failed to DM %s: %s", user_id, exc)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def notify_fix_blocked(
|
|
183
|
+
cfg,
|
|
184
|
+
source: str,
|
|
185
|
+
message: str,
|
|
186
|
+
reason: str,
|
|
187
|
+
repo_name: str = "",
|
|
188
|
+
submitter_user_id: str = "",
|
|
189
|
+
) -> None:
|
|
190
|
+
"""
|
|
191
|
+
Notify that a fix needs human intervention.
|
|
192
|
+
|
|
193
|
+
- If submitter_user_id is known: DM that person directly.
|
|
194
|
+
- Otherwise: @channel in the configured Slack channel.
|
|
195
|
+
- Always: email admins via reporter.send_failure_notification.
|
|
196
|
+
"""
|
|
197
|
+
short_reason = (reason or "Claude could not determine a safe fix.")[:600]
|
|
198
|
+
repo_line = f"\n*Repo:* {repo_name}" if repo_name else ""
|
|
199
|
+
|
|
200
|
+
slack_text = (
|
|
201
|
+
f":hand: *Fix blocked — human intervention needed*\n"
|
|
202
|
+
f"*Source:* {source}\n"
|
|
203
|
+
f"*Issue:* {message[:200]}{repo_line}\n"
|
|
204
|
+
f"*Reason:*\n{short_reason}"
|
|
205
|
+
)
|
|
206
|
+
|
|
207
|
+
if submitter_user_id:
|
|
208
|
+
slack_dm(cfg.slack_bot_token, submitter_user_id, slack_text)
|
|
209
|
+
else:
|
|
210
|
+
# No known submitter — broadcast to the whole channel
|
|
211
|
+
slack_alert(
|
|
212
|
+
cfg.slack_bot_token,
|
|
213
|
+
cfg.slack_channel,
|
|
214
|
+
f"<!channel> {slack_text}",
|
|
215
|
+
)
|
|
216
|
+
|
|
217
|
+
# Always email admins
|
|
218
|
+
try:
|
|
219
|
+
from .reporter import send_failure_notification
|
|
220
|
+
send_failure_notification(cfg, {
|
|
221
|
+
"source": source,
|
|
222
|
+
"message": message,
|
|
223
|
+
"repo_name": repo_name,
|
|
224
|
+
"reason": f"Needs human intervention: {short_reason[:200]}",
|
|
225
|
+
"body": reason,
|
|
226
|
+
})
|
|
227
|
+
except Exception as exc:
|
|
228
|
+
logger.warning("notify_fix_blocked: email notification failed: %s", exc)
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def alert_if_rate_limited(
|
|
232
|
+
bot_token: str,
|
|
233
|
+
channel: str,
|
|
234
|
+
source: str,
|
|
235
|
+
output: str,
|
|
236
|
+
) -> bool:
|
|
237
|
+
"""
|
|
238
|
+
Check output for rate-limit / auth signals and manage the circuit breaker.
|
|
239
|
+
|
|
240
|
+
- Rate limited → open/keep-open circuit, alert (with cooldown suppression)
|
|
241
|
+
- Not limited → close circuit if it was open (recovery alert), return False
|
|
242
|
+
|
|
243
|
+
Returns True if a rate-limit signal was found.
|
|
244
|
+
"""
|
|
245
|
+
if not is_rate_limited(output):
|
|
246
|
+
_close_if_open(bot_token, channel, source)
|
|
247
|
+
return False
|
|
248
|
+
_open_or_repeat(bot_token, channel, source, output)
|
|
249
|
+
return True
|
|
package/python/sentinel/sentinel_boss.py
CHANGED
|
@@ -240,6 +240,29 @@ When to act vs. when to ask:
|
|
|
240
240
|
- If a tool call will take a moment (search, fetch, pull), prefix your reply with a brief "working" line ending in "..." before the results, e.g. "Searching SSOLWA for TryDig activity..." then the actual output.
|
|
241
241
|
Never just say a working line and stop — always follow it with the results in the same message.
|
|
242
242
|
|
|
243
|
+
Search reasoning — always do this before calling filter_logs or search_logs:
|
|
244
|
+
1. Interpret intent: what is the user actually looking for? Don't pass the raw message as the query.
|
|
245
|
+
Examples:
|
|
246
|
+
- "TryDig errors" → query="TryDig" (component name; look for it in any context)
|
|
247
|
+
- "payment failures last hour" → query="pay|payment|transaction", since_hours=1
|
|
248
|
+
- "why is the app crashing" → query="Exception|Error|FAILED|crash", look for stack traces
|
|
249
|
+
- "login issues today" → query="login|auth|401|403|session", since_hours=24
|
|
250
|
+
- "slow requests" → query="timeout|slow|latency|took [0-9]+ms|duration"
|
|
251
|
+
- "startup problems" → query="APPLICATION FAILED|BeanCreation|NoSuchMethod|ClassNotFound"
|
|
252
|
+
Use | in the regex to cover synonyms and related terms. Keep it focused — not too broad.
|
|
253
|
+
2. Choose since_hours if a time window is implied ("last hour", "today", "this morning").
|
|
254
|
+
3. Pick source if the user mentioned a specific service (SSOLWA, STS, etc.) or server.
|
|
255
|
+
|
|
256
|
+
After getting filter_logs results, always synthesize — never dump raw output:
|
|
257
|
+
- Lead with 1-2 sentences: total count, affected sources, dominant pattern.
|
|
258
|
+
e.g. "Found 47 matches across SSOLWA and STS — mostly NullPointerException in DigService (31 hits)."
|
|
259
|
+
- List the top 3-5 patterns with counts in plain language.
|
|
260
|
+
- Call out any notable time clustering (e.g. "spike between 10:23–10:47 UTC").
|
|
261
|
+
- Show 2-3 example lines at most — only the most informative ones.
|
|
262
|
+
- End with a recommendation if the pattern suggests something actionable:
|
|
263
|
+
e.g. "Looks like a dependency resolution issue — create an issue?" or "Pattern consistent with a null config value at startup."
|
|
264
|
+
- If total_matches=0, say so plainly and suggest what else to try.
|
|
265
|
+
|
|
243
266
|
Session context — critical rules:
|
|
244
267
|
- Loaded conversation history is prior-session background only. It may be hours or days old.
|
|
245
268
|
- NEVER say "the previous search", "I already fetched", "as I found earlier", or any phrase implying you already did part of the current task — unless a tool result appears in THIS response's tool calls.
|
|
@@ -1341,11 +1364,29 @@ async def _run_tool(name: str, inputs: dict, cfg_loader, store, slack_client=Non
|
|
|
1341
1364
|
|
|
1342
1365
|
if name == "filter_logs":
|
|
1343
1366
|
import re as _re
|
|
1367
|
+
from collections import Counter as _Counter
|
|
1344
1368
|
from datetime import datetime, timedelta, timezone as _tz
|
|
1369
|
+
|
|
1370
|
+
# Extract a short grouping key from a log line for pattern analysis
|
|
1371
|
+
_EXC_PAT = _re.compile(r'([A-Z][a-zA-Z]+(?:Exception|Error|Failure|Fault|Warning))')
|
|
1372
|
+
_LVL_PAT = _re.compile(r'\b(ERROR|WARN(?:ING)?|CRITICAL|FATAL|SEVERE)\b', _re.IGNORECASE)
|
|
1373
|
+
|
|
1374
|
+
def _signature(line):
|
|
1375
|
+
exc = _EXC_PAT.search(line)
|
|
1376
|
+
if exc:
|
|
1377
|
+
return exc.group(1)
|
|
1378
|
+
m = _LVL_PAT.search(line)
|
|
1379
|
+
if m:
|
|
1380
|
+
after = line[m.end():].strip()
|
|
1381
|
+
token = after.split()[0].rstrip(':.,') if after.split() else ''
|
|
1382
|
+
if token and len(token) > 2:
|
|
1383
|
+
return m.group(1).upper() + ' ' + token[:40]
|
|
1384
|
+
return line.strip()[:40]
|
|
1385
|
+
|
|
1345
1386
|
query_f = inputs.get("query", "")
|
|
1346
1387
|
source_f = inputs.get("source", "").lower()
|
|
1347
1388
|
since_hours = inputs.get("since_hours")
|
|
1348
|
-
max_matches = int(inputs.get("max_matches",
|
|
1389
|
+
max_matches = int(inputs.get("max_matches", 300))
|
|
1349
1390
|
case_flag = 0 if inputs.get("case_sensitive") else _re.IGNORECASE
|
|
1350
1391
|
try:
|
|
1351
1392
|
pat = _re.compile(query_f, case_flag)
|
|
@@ -1416,12 +1457,117 @@ async def _run_tool(name: str, inputs: dict, cfg_loader, store, slack_client=Non
|
|
|
1416
1457
|
"note": "No matches found in synced logs.",
|
|
1417
1458
|
})
|
|
1418
1459
|
|
|
1460
|
+
|
|
1461
|
+
try:
|
|
1462
|
+
pat = _re.compile(query_f, case_flag)
|
|
1463
|
+
except _re.error as e:
|
|
1464
|
+
return json.dumps({"error": f"Invalid regex: {e}"})
|
|
1465
|
+
|
|
1466
|
+
synced_base = Path("workspace/synced")
|
|
1467
|
+
if not synced_base.exists():
|
|
1468
|
+
return json.dumps({
|
|
1469
|
+
"error": "No synced logs found.",
|
|
1470
|
+
"hint": "Log sync runs every SYNC_INTERVAL_SECONDS (default 300s). "
|
|
1471
|
+
"If just started, wait a minute then try again.",
|
|
1472
|
+
})
|
|
1473
|
+
|
|
1474
|
+
cutoff = None
|
|
1475
|
+
if since_hours:
|
|
1476
|
+
cutoff = datetime.now(_tz.utc) - timedelta(hours=int(since_hours))
|
|
1477
|
+
|
|
1478
|
+
if source_f:
|
|
1479
|
+
src_dirs = [d for d in sorted(synced_base.iterdir())
|
|
1480
|
+
if d.is_dir() and source_f in d.name.lower()]
|
|
1481
|
+
else:
|
|
1482
|
+
src_dirs = [d for d in sorted(synced_base.iterdir()) if d.is_dir()]
|
|
1483
|
+
|
|
1484
|
+
if not src_dirs:
|
|
1485
|
+
available = [d.name for d in synced_base.iterdir() if d.is_dir()]
|
|
1486
|
+
return json.dumps({
|
|
1487
|
+
"error": f"No synced source matching '{source_f}'",
|
|
1488
|
+
"available_sources": available,
|
|
1489
|
+
})
|
|
1490
|
+
|
|
1491
|
+
all_matches = [] # list of (source_name, line)
|
|
1492
|
+
sources_hit = set()
|
|
1493
|
+
for src_dir in src_dirs:
|
|
1494
|
+
for log_file in sorted(src_dir.glob("*")):
|
|
1495
|
+
try:
|
|
1496
|
+
lines = log_file.read_text(encoding="utf-8", errors="replace").splitlines()
|
|
1497
|
+
for line in lines:
|
|
1498
|
+
if not pat.search(line):
|
|
1499
|
+
continue
|
|
1500
|
+
if cutoff:
|
|
1501
|
+
from .log_fetcher import _parse_line_ts
|
|
1502
|
+
ts = _parse_line_ts(line)
|
|
1503
|
+
if ts and ts < cutoff:
|
|
1504
|
+
continue
|
|
1505
|
+
all_matches.append((src_dir.name, line[:300]))
|
|
1506
|
+
sources_hit.add(src_dir.name)
|
|
1507
|
+
if len(all_matches) >= max_matches:
|
|
1508
|
+
break
|
|
1509
|
+
except Exception:
|
|
1510
|
+
pass
|
|
1511
|
+
if len(all_matches) >= max_matches:
|
|
1512
|
+
break
|
|
1513
|
+
|
|
1514
|
+
total = len(all_matches)
|
|
1515
|
+
if total == 0:
|
|
1516
|
+
return json.dumps({
|
|
1517
|
+
"query": query_f,
|
|
1518
|
+
"total_matches": 0,
|
|
1519
|
+
"sources_searched": [d.name for d in src_dirs],
|
|
1520
|
+
"note": "No matches found in synced logs.",
|
|
1521
|
+
})
|
|
1522
|
+
|
|
1523
|
+
# Pattern grouping: count occurrences of each error signature
|
|
1524
|
+
sig_counter = _Counter()
|
|
1525
|
+
sig_examples = {}
|
|
1526
|
+
for src, line in all_matches:
|
|
1527
|
+
sig = _signature(line)
|
|
1528
|
+
sig_counter[sig] += 1
|
|
1529
|
+
if sig not in sig_examples:
|
|
1530
|
+
sig_examples[sig] = f"[{src}] {line}"
|
|
1531
|
+
|
|
1532
|
+
top_patterns = [
|
|
1533
|
+
{"pattern": sig, "count": cnt, "example": sig_examples[sig][:250]}
|
|
1534
|
+
for sig, cnt in sig_counter.most_common(10)
|
|
1535
|
+
]
|
|
1536
|
+
|
|
1537
|
+
# Sample: first unique-signature line from each source
|
|
1538
|
+
sample_lines = []
|
|
1539
|
+
seen_sigs = set()
|
|
1540
|
+
for src, line in all_matches:
|
|
1541
|
+
sig = _signature(line)
|
|
1542
|
+
if sig not in seen_sigs:
|
|
1543
|
+
sample_lines.append(f"[{src}] {line}")
|
|
1544
|
+
seen_sigs.add(sig)
|
|
1545
|
+
if len(sample_lines) >= 10:
|
|
1546
|
+
break
|
|
1547
|
+
|
|
1548
|
+
# Time span
|
|
1549
|
+
time_span = {}
|
|
1550
|
+
try:
|
|
1551
|
+
from .log_fetcher import _parse_line_ts
|
|
1552
|
+
timestamps = [_parse_line_ts(ln) for _, ln in all_matches]
|
|
1553
|
+
timestamps = [t for t in timestamps if t]
|
|
1554
|
+
if timestamps:
|
|
1555
|
+
time_span = {
|
|
1556
|
+
"earliest": min(timestamps).strftime("%Y-%m-%d %H:%M:%S UTC"),
|
|
1557
|
+
"latest": max(timestamps).strftime("%Y-%m-%d %H:%M:%S UTC"),
|
|
1558
|
+
}
|
|
1559
|
+
except Exception:
|
|
1560
|
+
pass
|
|
1561
|
+
|
|
1419
1562
|
return json.dumps({
|
|
1420
|
-
"query":
|
|
1421
|
-
"
|
|
1422
|
-
"
|
|
1563
|
+
"query": query_f,
|
|
1564
|
+
"total_matches": total,
|
|
1565
|
+
"sources_hit": sorted(sources_hit),
|
|
1423
1566
|
"sources_searched": [d.name for d in src_dirs],
|
|
1424
|
-
"
|
|
1567
|
+
"top_patterns": top_patterns,
|
|
1568
|
+
"sample_lines": sample_lines,
|
|
1569
|
+
"time_span": time_span,
|
|
1570
|
+
"capped": total >= max_matches,
|
|
1425
1571
|
})
|
|
1426
1572
|
|
|
1427
1573
|
if name == "trigger_poll":
|