@misterhuydo/sentinel 1.2.4 → 1.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,2143 +1,2406 @@
1
- """
2
- sentinel_boss.py — Claude-backed Sentinel Boss.
3
-
4
- Claude acts as the boss: reads project state, decides on actions,
5
- executes them via tool use, and responds naturally. One agentic loop
6
- per turn — Claude may call multiple tools before replying.
7
- """
8
-
9
- import json
10
- import logging
11
- import os
12
- import re
13
- import subprocess
14
- import uuid
15
- from datetime import datetime, timezone
16
- from pathlib import Path
17
- from typing import Optional
18
-
19
- from .notify import alert_if_rate_limited, slack_alert, is_rate_limited
20
-
21
- logger = logging.getLogger(__name__)
22
-
23
- # ── System prompt ────────────────────────────────────────────────────────────
24
-
25
- _SYSTEM = """\
26
- You are Sentinel Boss — the AI interface for Sentinel, a 24/7 autonomous DevOps agent.
27
-
28
- Sentinel watches production logs, detects errors, generates code fixes via Claude Code,
29
- and opens GitHub PRs for admin review (or pushes directly if AUTO_PUBLISH=true).
30
-
31
- Your job:
32
- - Understand what the DevOps engineer needs in natural language
33
- - Query Sentinel's live state (errors, fixes, open PRs) on their behalf
34
- - Deliver tasks/issues to the right project — you know all projects in this workspace
35
- - Control Sentinel (pause/resume) when asked
36
- - Give honest, concise answers — you know this system inside out
37
- - If a project name is unclear or ambiguous, ask the engineer to clarify — never guess
38
-
39
- What you can do (tools available):
40
-
41
- 1. get_status — Show recent errors detected, fixes applied/pending, open PRs.
42
- e.g. "what happened today?", "any issues?", "show open PRs"
43
-
44
- 2. create_issue — Deliver a fix/task to any project in this workspace by short name.
45
- You know all project names — use list_projects if you're unsure.
46
- If the project name is ambiguous or not found, ask to clarify.
47
- e.g. "tell 1881 to fix X", "look into Y in elprint", "investigate Z"
48
-
49
- 3. pause_sentinel — Create SENTINEL_PAUSE file to halt all auto-fix activity.
50
- e.g. "pause sentinel", "stop auto-fixing"
51
-
52
- 4. resume_sentinel — Remove SENTINEL_PAUSE file to resume normal operation.
53
- e.g. "resume sentinel", "unpause"
54
-
55
- 5. list_projects — List all configured repos and log sources in this Sentinel instance.
56
- e.g. "what projects are you watching?", "list all repos"
57
-
58
- 6. search_logs — SSH live to servers and grep logs in real time (uses fetch_log.sh with
59
- the query as GREP_FILTER). Falls back to cached files if unavailable.
60
- e.g. "search logs for illegal PIN in 1881", "find X in SSOLWA", "grep logs for Z"
61
-
62
- 7. trigger_poll — Trigger an immediate poll cycle without waiting for the schedule.
63
- e.g. "check now", "poll immediately", "don't wait, run now"
64
-
65
- 8. get_repo_status — Show the current git branch, last commit, and recent fix branches
66
- for a specific repository.
67
- e.g. "status of repo X", "what branch is cairn on?"
68
-
69
- 9. list_recent_commits — List the most recent commits in a repo (including Sentinel's auto-fixes).
70
- e.g. "show me recent commits in elprint-sales", "what did sentinel commit?"
71
-
72
- 10. get_fix_detail — Get full details of a specific fix: error, patch path, PR URL, status.
73
- e.g. "show fix abc123", "details on that fix"
74
-
75
- 11. list_errors — List recent errors from the state store, optionally filtered by repo or source.
76
- e.g. "show all errors today", "what errors hit elprint this week?"
77
-
78
- 12. pull_repo — Run git pull on one or all managed application repos.
79
- e.g. "pull changes", "git pull all repos", "update the code"
80
-
81
- 13. pull_config — Run git pull on one or all Sentinel project config dirs.
82
- e.g. "pull config for 1881", "update sentinel config", "pull all configs"
83
-
84
- 14. fetch_logs — Run fetch_log.sh on demand to pull fresh logs from remote servers right now.
85
- Supports --debug mode and parameter overrides (tail count, grep filter).
86
- e.g. "fetch logs", "try fetch_log.sh for SSOLWA", "fetch logs with debug",
87
- "grab latest logs from STS", "fetch logs without filter"
88
-
89
- 15. watch_bot — Register a Slack bot for passive monitoring. Every message it posts is
90
- auto-queued as an issue in the bot's registered project.
91
- ALWAYS requires a project — infer from context or ask the user first.
92
- e.g. "listen to @alertbot", "watch @bot1 @bot2 for project 1881", "monitor @errorbot"
93
-
94
- 16. unwatch_bot — Remove a Slack bot from the passive watch list.
95
- e.g. "stop watching @alertbot", "unwatch @errorbot"
96
-
97
- 17. list_watched_bots — Show all Slack bots currently being passively monitored and which projects
98
- they are delivering to.
99
- e.g. "which bots are you watching?", "list monitored bots"
100
-
101
- 18. upgrade_sentinel — Pull the latest Sentinel agent code, update Python deps, and restart the
102
- process. Safe to run at any time — no restart if already up to date.
103
- e.g. "upgrade sentinel", "update sentinel", "upgrade yourself"
104
-
105
- 19. ask_codebase — Ask any natural-language question about a managed repo's codebase.
106
- Claude Code answers using its full knowledge of the code.
107
- e.g. "what does the 1881 backend do?", "find PIN validation in elprint",
108
- "any TODOs in cairn?", "are there security issues in elprint-sales?"
109
-
110
- 20. restart_project — Stop and restart a specific Sentinel monitoring instance (stop.sh + start.sh).
111
- This restarts the Sentinel agent for that project, NOT the application itself.
112
- e.g. "restart sentinel for 1881", "restart the 1881 monitor", "reload elprint sentinel"
113
-
114
- 21. tail_log — Fetch the last N lines of a log source live, without a grep filter.
115
- e.g. "show recent SSOLWA logs", "tail STS", "last 200 lines from 1881 logs"
116
-
117
- 22. post_file — Upload a text file to the Slack conversation (diff, log excerpt, report, CSV).
118
- Use when output is too large for chat, or the user asks to download/export something.
119
- e.g. "give me that as a file", "export the log", "send me the diff"
120
-
121
- When someone asks what you can do, what you support, what your capabilities are, or how you can help,
122
- reply with a short summary grouped by category:
123
-
124
- *Monitoring & status*
125
- • `get_status` — errors detected, fixes applied/pending/failed, open PRs — "what happened today?"
126
- • `get_repo_status` — per-repo breakdown of errors and fixes — "how is elprint doing?"
127
- • `list_recent_commits` — recent Sentinel auto-fix commits — "what did Sentinel commit?"
128
-
129
- *Log management*
130
- • `fetch_logs` — pull fresh logs from servers right now — "fetch logs for SSOLWA"
131
- • `search_logs` — live SSH grep on production servers — "search logs for illegal PIN in 1881"
132
- • `tail_log` — last N lines of a log source, no filter — "show recent SSOLWA logs"
133
-
134
- *Codebase questions*
135
- • `ask_codebase` — any question about a repo's code — "what does 1881 do?", "find PIN validation", "any TODOs?", "security issues?"
136
-
137
- *Fix management*
138
- • `get_fix_details` — full details of a specific fix — "show fix abc123"
139
- • `list_pending_prs` — all open Sentinel PRs awaiting review — "list open PRs"
140
- • `check_auth_status` — Claude auth health, rate-limit circuit state, fix engine 24 h stats — "is Claude working?", "any rate limits?", "auth issues?"
141
-
142
- *Project & task delivery*
143
- • `list_projects` — all projects and repos Sentinel manages — "what projects do you manage?"
144
- • `create_issue` — deliver a task to any project by name — "tell 1881 to fix X"
145
- • `trigger_poll` — run a log-fetch + fix cycle right now — "check now"
146
- • `pause_sentinel` / `resume_sentinel` — halt or resume all auto-fix activity — "pause Sentinel"
147
-
148
- *Repo & config sync*
149
- • `pull_repo` — git pull on managed application repos — "pull latest code"
150
- • `pull_config` — git pull on Sentinel config dirs — "pull config for elprint"
151
-
152
- *File sharing*
153
- • `post_file` — upload a file to Slack — "give me that as a file", "export the log", "send me the diff"
154
-
155
- *Personal*
156
- • `my_stats` — your activity: issues submitted, fixes, conversation history — "my stats"
157
- • `clear_my_history` — wipe your conversation history and start fresh — "clear my history"
158
-
159
- *Slack bot watching*
160
- • `list_watched_bots` — show all bots currently being monitored — "which bots are you watching?"
161
-
162
- *Admin* (SLACK_ADMIN_USERS if configured, otherwise all allowed users)
163
- • `watch_bot` — register a Slack bot for passive monitoring; its messages become issues — "listen to @alertbot"
164
- • `unwatch_bot` — stop monitoring a bot — "stop watching @errorbot"
165
- • `restart_project` — stop + restart a Sentinel monitoring instance (not the app) — "restart sentinel for 1881"
166
- • `upgrade_sentinel` — pull latest Sentinel release and restart — "upgrade sentinel"
167
- • `list_all_users` — all Slack users who have talked to Sentinel + activity summary
168
- • `clear_user_history` — wipe a specific user's conversation history
169
- • `reset_fingerprint` — clear the 24h fix lock so Sentinel retries an error
170
- • `list_all_errors` — full unfiltered error database
171
- • `export_db` — dump full Sentinel state as a downloadable file
172
-
173
- Tone: direct, professional, like a senior engineer who owns the system.
174
- Don't pad responses. Don't say "Great question!" or "Certainly!".
175
- If you don't know something, use a tool to find out before saying you don't know.
176
-
177
- When to act vs. when to ask:
178
- - Clear command ("check status", "fetch logs", "pause sentinel") → call the tool immediately, reply with results.
179
- - Ambiguous or exploratory ("what does get_repo_status do?", "tell me about search_logs") → explain the tool naturally, then ask: "Want me to run it?"
180
- - Unclear intent (could be either) → use judgment: brief explanation + "Want me to run this now?"
181
- - If a tool call will take a moment (search, fetch, pull), prefix your reply with a brief "working" line ending in "..." before the results, e.g. "Searching SSOLWA for TryDig activity..." then the actual output.
182
- Never just say a working line and stop — always follow it with the results in the same message.
183
-
184
- Session context critical rules:
185
- - Loaded conversation history is prior-session background only. It may be hours or days old.
186
- - NEVER say "the previous search", "I already fetched", "as I found earlier", or any phrase implying you already did part of the current task unless a tool result appears in THIS response's tool calls.
187
- - When handling a new request, call the tools fresh. Do not assume any prior tool result is still current or that any prior step "counts" toward the current task.
188
- - The only exception: if the user explicitly asks about something from the history ("what did you find earlier?"), you may reference it — but note it is from a prior session.
189
-
190
- Trust your tool results — never contradict them:
191
- - If any search_logs call in this response returned total_matches > 0, you HAVE results. Report them.
192
- - Never say "no results found" or "nothing was found" when a tool result shows total_matches > 0.
193
- - If one source-specific call returns 0 but a broader call returned matches, use the broader results.
194
- - A cached result with files_searched=0 is a source-name lookup failure, NOT an absence of log data.
195
- Treat it as "source not recognised" and fall back to the broad search results you already have.
196
-
197
- Avoid redundant tool calls (within a single response only — always run tools fresh for new requests):
198
- - If a broad search (e.g. search_logs with no source filter) already returned results in THIS response, do NOT repeat the same search with a source filter to "refine" — use what you already fetched.
199
- - If a tool call fails in THIS response, do NOT retry the entire search from scratch. Continue with what succeeded and note the failure.
200
- - One pass per task: gather all needed data in a single round of tool calls, then produce the final answer.
201
-
202
- Issue identification — before calling create_issue:
203
- 1. Determine if the message is a REAL issue/task (bug report, feature request, investigation ask)
204
- vs. a status question, tool query, or casual chat. If not an issue, just answer normally.
205
- 2. If it IS an issue, gather what's needed before creating:
206
- - Project: which project? If unclear, ask. Use list_projects if you need to check names.
207
- - Context: what's the problem? Include everything: description, error text, steps to reproduce.
208
- - Attachments: summarise any files/screenshots the user shared.
209
- - Support URL: note any ticket/doc/link the user mentioned.
210
- - Identity: always captured automatically from the Slack session.
211
- 3. Populate `findings` with curated evidence only when relevant and concise:
212
- - If you ran search_logs, tail_log, ask_codebase, or get_status before creating the issue,
213
- summarise only the findings directly related to this specific issue.
214
- - Do NOT paste raw tool output. Summarise: which services, how often, key pattern, 1-3 example lines.
215
- - If the search returned nothing relevant, or the issue is purely user-described with no log evidence, leave `findings` empty.
216
- - The fix engine reads only the issue file. Give it signal, not noise — 500 words max.
217
- 4. Before calling the tool, confirm with the user in natural language:
218
- e.g. "I'll create an issue for project *1881* — here's what I have: [summary]. Look right?"
219
- Wait for their confirmation before proceeding.
220
- EXCEPTION: if the user's message already contains a clear project + unambiguous description,
221
- skip the confirmation and create immediately — don't ask when nothing is unclear.
222
- 5. After creating, tell them the issue was queued and Sentinel will pick it up on the next poll.
223
-
224
- When the engineer's request is fully handled, end your LAST message with the token: [DONE]
225
- IMPORTANT: Always write your actual reply text FIRST, then append [DONE] at the end. Example: "Hello! I'm Sentinel. [DONE]". Never output [DONE] as your only content.
226
- For greetings like "hello" or empty messages, introduce yourself briefly and offer help, then end with [DONE].
227
- If you need a follow-up from them, do NOT include [DONE] — wait for their next message.
228
- """
229
-
230
- # ── Tool definitions ─────────────────────────────────────────────────────────
231
-
232
- _TOOLS = [
233
- {
234
- "name": "get_status",
235
- "description": (
236
- "Get recent errors, fixes applied, fixes pending review, and open PRs. "
237
- "Use for: 'what happened today?', 'any issues?', 'how are things?', "
238
- "'what are the open PRs?', 'did sentinel fix anything?'"
239
- ),
240
- "input_schema": {
241
- "type": "object",
242
- "properties": {
243
- "hours": {
244
- "type": "integer",
245
- "description": "Look-back window in hours (default 24)",
246
- "default": 24,
247
- },
248
- },
249
- },
250
- },
251
- {
252
- "name": "create_issue",
253
- "description": (
254
- "Deliver a confirmed issue/task to a Sentinel project instance. "
255
- "Only call this after you have: (1) confirmed the message is a real issue or task, "
256
- "(2) identified the target project, (3) gathered enough context, and "
257
- "(4) confirmed with the user ('I'll create this issue for project X — does that look right?'). "
258
- "Do NOT call this for status questions, tool queries, or casual chat."
259
- ),
260
- "input_schema": {
261
- "type": "object",
262
- "properties": {
263
- "description": {
264
- "type": "string",
265
- "description": "Full problem/task description — include all context the user gave you",
266
- },
267
- "project": {
268
- "type": "string",
269
- "description": "Project short name (e.g. '1881', 'elprint'). Ask if unclear.",
270
- },
271
- "target_repo": {
272
- "type": "string",
273
- "description": "Specific repo within the project (omit to let Sentinel auto-route)",
274
- },
275
- "support_url": {
276
- "type": "string",
277
- "description": "Any URL the user shared (ticket, doc, screenshot link, etc.)",
278
- },
279
- "attachments_summary": {
280
- "type": "string",
281
- "description": "Summary of any files/screenshots the user attached",
282
- },
283
- "findings": {
284
- "type": "string",
285
- "description": (
286
- "A concise, curated summary of evidence directly relevant to this issue — "
287
- "NOT raw tool output. Include only what the fix engine needs: "
288
- "key error patterns, affected services, approximate frequency/timestamps, "
289
- "and 1-3 representative log lines. Omit unrelated results. "
290
- "Keep under 500 words. Leave empty if no tool results are relevant."
291
- ),
292
- },
293
- },
294
- "required": ["description"],
295
- },
296
- },
297
- {
298
- "name": "get_fix_details",
299
- "description": "Get full details of a specific fix by fingerprint (8+ hex chars).",
300
- "input_schema": {
301
- "type": "object",
302
- "properties": {
303
- "fingerprint": {"type": "string"},
304
- },
305
- "required": ["fingerprint"],
306
- },
307
- },
308
- {
309
- "name": "list_pending_prs",
310
- "description": "List all open Sentinel PRs awaiting admin review.",
311
- "input_schema": {"type": "object", "properties": {}},
312
- },
313
- {
314
- "name": "check_auth_status",
315
- "description": (
316
- "Check Claude authentication health, current rate-limit / usage-limit circuit state, "
317
- "and fix engine stats for the last 24 h. "
318
- "Use when someone asks: 'is Claude working?', 'any rate limits?', 'why aren't fixes running?', "
319
- "'is the API key OK?', 'auth issues?', 'fix engine status'."
320
- ),
321
- "input_schema": {"type": "object", "properties": {}},
322
- },
323
- {
324
- "name": "pause_sentinel",
325
- "description": (
326
- "Pause ALL Sentinel fix activity immediately. "
327
- "Use when the engineer says 'pause', 'stop', 'freeze', or 'hold off'."
328
- ),
329
- "input_schema": {"type": "object", "properties": {}},
330
- },
331
- {
332
- "name": "resume_sentinel",
333
- "description": "Resume Sentinel fix activity after a pause.",
334
- "input_schema": {"type": "object", "properties": {}},
335
- },
336
- {
337
- "name": "list_projects",
338
- "description": (
339
- "List all projects (Sentinel instances) in this workspace and the repos "
340
- "each one manages. Use for: 'what projects do you manage?', 'list projects', "
341
- "'what repos are configured?', 'show me all projects'."
342
- ),
343
- "input_schema": {"type": "object", "properties": {}},
344
- },
345
- {
346
- "name": "search_logs",
347
- "description": (
348
- "Search production logs for a keyword or pattern. "
349
- "When a project or source is specified (or can be inferred), performs a LIVE fetch "
350
- "via fetch_log.sh with the query as the grep filter — SSHes directly to the server. "
351
- "Falls back to searching locally-cached log files when no source can be determined. "
352
- "Use for: 'search logs for illegal PIN in 1881', 'find X in SSOLWA logs', "
353
- "'what did user Y do?', 'show entries for appid=Z', 'grep logs for X'."
354
- ),
355
- "input_schema": {
356
- "type": "object",
357
- "properties": {
358
- "query": {
359
- "type": "string",
360
- "description": "Keyword or regex to grep for",
361
- },
362
- "source": {
363
- "type": "string",
364
- "description": "Log source name to search (partial match against log-config filenames, e.g. 'SSOLWA', '1881'). Leave empty to search all sources.",
365
- },
366
- "max_matches": {
367
- "type": "integer",
368
- "description": "Max matching lines to return per source (default 30)",
369
- "default": 30,
370
- },
371
- "tail": {
372
- "type": "integer",
373
- "description": (
374
- "Number of log lines to fetch from the server before grepping (default: config value, typically 500). "
375
- "Increase when the user asks for a longer time window — e.g. 'yesterday up to now' → use 5000-10000. "
376
- "Higher values take longer but cover more history."
377
- ),
378
- },
379
- },
380
- "required": ["query"],
381
- },
382
- },
383
- {
384
- "name": "trigger_poll",
385
- "description": (
386
- "Trigger an immediate log-fetch and error-detection cycle without waiting "
387
- "for the next scheduled interval. Use when: 'check now', 'run now', "
388
- "'poll immediately', 'don't wait'."
389
- ),
390
- "input_schema": {"type": "object", "properties": {}},
391
- },
392
- {
393
- "name": "get_repo_status",
394
- "description": (
395
- "Per-repository breakdown of errors detected and fixes applied. "
396
- "Use for: 'how is repo X doing?', 'which repo has the most issues?', "
397
- "'break down by repo'."
398
- ),
399
- "input_schema": {
400
- "type": "object",
401
- "properties": {
402
- "hours": {
403
- "type": "integer",
404
- "description": "Look-back window in hours (default 24)",
405
- "default": 24,
406
- },
407
- },
408
- },
409
- },
410
- {
411
- "name": "list_recent_commits",
412
- "description": (
413
- "List recent commits made by Sentinel across all managed repos. "
414
- "Use for: 'what did Sentinel commit?', 'show recent auto-fixes', 'what was changed?'."
415
- ),
416
- "input_schema": {
417
- "type": "object",
418
- "properties": {
419
- "limit": {
420
- "type": "integer",
421
- "description": "Max commits per repo (default 5)",
422
- "default": 5,
423
- },
424
- },
425
- },
426
- },
427
- {
428
- "name": "pull_repo",
429
- "description": (
430
- "Run git pull on one or all managed repos to fetch latest changes from GitHub. "
431
- "Use for: 'pull changes', 'git pull', 'update repo X', 'fetch latest code'."
432
- ),
433
- "input_schema": {
434
- "type": "object",
435
- "properties": {
436
- "repo": {
437
- "type": "string",
438
- "description": "Repo name to pull (omit to pull all configured repos)",
439
- },
440
- },
441
- },
442
- },
443
- {
444
- "name": "pull_config",
445
- "description": (
446
- "Run git pull on one or all Sentinel project config directories. "
447
- "Projects are matched by short name ('1881', 'elprint') or full dir name ('sentinel-1881'). "
448
- "Use for: 'pull config for 1881', 'update sentinel config', 'pull all configs'."
449
- ),
450
- "input_schema": {
451
- "type": "object",
452
- "properties": {
453
- "project": {
454
- "type": "string",
455
- "description": "Project short name or dir name to pull (omit for all projects)",
456
- },
457
- },
458
- },
459
- },
460
- {
461
- "name": "fetch_logs",
462
- "description": (
463
- "Run fetch_log.sh for one or all configured log sources to pull the latest logs "
464
- "from remote servers right now. Use for: 'fetch logs', 'run fetch_log.sh', "
465
- "'grab latest logs from SSOLWA', 'try fetch_log.sh for STS', "
466
- "'pull logs from server', 'get fresh logs'."
467
- ),
468
- "input_schema": {
469
- "type": "object",
470
- "properties": {
471
- "source": {
472
- "type": "string",
473
- "description": "Log source name to fetch (partial match, e.g. 'SSOLWA'). Omit to fetch all.",
474
- },
475
- "debug": {
476
- "type": "boolean",
477
- "description": "Run fetch_log.sh with --debug flag to show SSH/grep details",
478
- "default": False,
479
- },
480
- "tail": {
481
- "type": "integer",
482
- "description": "Override TAIL lines (how many log lines to fetch)",
483
- },
484
- "grep_filter": {
485
- "type": "string",
486
- "description": "Override GREP_FILTER (regex). Pass 'none' to disable filtering.",
487
- },
488
- },
489
- },
490
- },
491
- {
492
- "name": "watch_bot",
493
- "description": (
494
- "Tell Sentinel to passively monitor a Slack bot — queuing its messages as issues. "
495
- "Extract all <@UXXXXXX> user IDs from the message and pass them here. "
496
- "Sentinel verifies each is actually a bot (not a human) before adding to the watch list. "
497
- "IMPORTANT: a bot watcher is only useful if its issues can be delivered to a project. "
498
- "Try to infer the project from context (bot name, prior messages, available projects). "
499
- "If it cannot be determined, do NOT call this tool — instead ask the user which project "
500
- "the bot's alerts belong to, then call this tool with the project filled in. "
501
- "Use for: 'listen to @alertbot', 'watch @bot1 @bot2', 'monitor @errorbot'."
502
- ),
503
- "input_schema": {
504
- "type": "object",
505
- "properties": {
506
- "user_ids": {
507
- "type": "array",
508
- "items": {"type": "string"},
509
- "description": "Slack user IDs to watch — extract from <@UXXXXXX> patterns in the message",
510
- },
511
- "project": {
512
- "type": "string",
513
- "description": "Project short name this bot's issues should be routed to (e.g. '1881', 'elprint'). Infer from context or ask user before calling.",
514
- },
515
- },
516
- "required": ["user_ids"],
517
- },
518
- },
519
- {
520
- "name": "unwatch_bot",
521
- "description": (
522
- "Stop Sentinel from monitoring a Slack bot. "
523
- "Use for: 'stop watching @alertbot', 'unwatch @bot', 'remove @errorbot from watchers'."
524
- ),
525
- "input_schema": {
526
- "type": "object",
527
- "properties": {
528
- "user_ids": {
529
- "type": "array",
530
- "items": {"type": "string"},
531
- "description": "Slack user IDs to remove from the watch list",
532
- },
533
- },
534
- "required": ["user_ids"],
535
- },
536
- },
537
- {
538
- "name": "list_watched_bots",
539
- "description": (
540
- "List all Slack bots Sentinel is currently monitoring passively. "
541
- "Use for: 'who are you watching?', 'which bots are you monitoring?', 'list watched bots'."
542
- ),
543
- "input_schema": {"type": "object", "properties": {}},
544
- },
545
- {
546
- "name": "upgrade_sentinel",
547
- "description": (
548
- "Upgrade the Sentinel agent itself: git pull the latest code, update Python deps, "
549
- "then restart the process. Safe to call at any time — if already up to date, "
550
- "no restart is triggered. "
551
- "Use for: 'upgrade sentinel', 'update sentinel', 'upgrade yourself', "
552
- "'pull latest sentinel code', 'restart sentinel after upgrade'."
553
- ),
554
- "input_schema": {"type": "object", "properties": {}},
555
- },
556
- {
557
- "name": "ask_codebase",
558
- "description": (
559
- "Ask any natural-language question about a managed codebase. "
560
- "Accepts a repo name (e.g. 'STS', 'elprint-sales') OR a project name (e.g. '1881', 'elprint') "
561
- "— if a project name is given and it has multiple repos, all are queried. "
562
- "Claude Code answers using its full codebase knowledge — no need to specify how. "
563
- "Use for: 'what does 1881 do?', 'TODOs in 1881', 'find PIN validation in STS', "
564
- "'security issues in elprint-sales?', 'summarize the cairn repo'."
565
- ),
566
- "input_schema": {
567
- "type": "object",
568
- "properties": {
569
- "repo": {
570
- "type": "string",
571
- "description": "Repo name (e.g. 'STS', 'elprint-sales') OR project name (e.g. '1881', 'elprint') — project name queries all its repos",
572
- },
573
- "question": {
574
- "type": "string",
575
- "description": "Natural language question about the codebase",
576
- },
577
- },
578
- "required": ["repo", "question"],
579
- },
580
- },
581
- {
582
- "name": "restart_project",
583
- "description": (
584
- "Stop and restart a specific Sentinel monitoring instance (runs stop.sh then start.sh). "
585
- "This restarts the Sentinel agent process for that project — it does NOT restart the application itself. "
586
- "Use when: 'restart sentinel for 1881', 'reload the 1881 monitor', 'restart elprint sentinel'. "
587
- "Safer than restarting all projects at once."
588
- ),
589
- "input_schema": {
590
- "type": "object",
591
- "properties": {
592
- "project": {
593
- "type": "string",
594
- "description": "Project short name or dir name (e.g. '1881', 'elprint')",
595
- },
596
- },
597
- "required": ["project"],
598
- },
599
- },
600
- {
601
- "name": "my_stats",
602
- "description": (
603
- "Show the current user's personal Sentinel dashboard: "
604
- "conversation history length, issues they submitted, and "
605
- "a summary of Sentinel fix activity (errors caught, fixes applied, "
606
- "fixes pending PR review, fixes confirmed live, fixes failed). "
607
- "Use for: 'what have you done for me?', 'show my stats', "
608
- "'how many issues have been fixed?', 'my history', 'summary', "
609
- "'what did sentinel fix this week?', 'pending fixes', 'open PRs'."
610
- ),
611
- "input_schema": {
612
- "type": "object",
613
- "properties": {
614
- "hours": {
615
- "type": "integer",
616
- "description": "Look-back window in hours (default 168 = 7 days)",
617
- "default": 168,
618
- },
619
- },
620
- },
621
- },
622
- {
623
- "name": "clear_my_history",
624
- "description": (
625
- "Clear the current user's conversation history with Sentinel. "
626
- "After clearing, future sessions start with no memory of past conversations. "
627
- "Use for: 'clear my history', 'forget our conversation', "
628
- "'start fresh', 'reset my context', 'wipe my history'."
629
- ),
630
- "input_schema": {"type": "object", "properties": {}},
631
- },
632
- {
633
- "name": "tail_log",
634
- "description": (
635
- "Fetch the last N lines of a log source's live production logs without any grep filter. "
636
- "Use when: 'show me recent SSOLWA logs', 'tail STS', 'what's happening in 1881 logs right now', "
637
- "'show last 100 lines from SSOLWA'. Different from search_logs — no pattern required."
638
- ),
639
- "input_schema": {
640
- "type": "object",
641
- "properties": {
642
- "source": {
643
- "type": "string",
644
- "description": "Log source name (partial match against log-config filenames, e.g. 'SSOLWA', 'STS')",
645
- },
646
- "lines": {
647
- "type": "integer",
648
- "description": "Number of recent lines to fetch (default 100)",
649
- "default": 100,
650
- },
651
- },
652
- "required": ["source"],
653
- },
654
- },
655
- {
656
- "name": "post_file",
657
- "description": (
658
- "Upload a text file directly to the Slack conversation so the user can read or download it. "
659
- "Use when: output is too large for a chat message, the user asks to 'download', 'export', or "
660
- "'send as a file', or when formatted content (diffs, logs, CSVs, reports) is clearer as a file. "
661
- "e.g. 'give me that as a file', 'export the log', 'send me the diff for PR #41', "
662
- "'download the health report', 'export recent errors as CSV'"
663
- ),
664
- "input_schema": {
665
- "type": "object",
666
- "properties": {
667
- "content": {
668
- "type": "string",
669
- "description": "The full text content of the file to upload",
670
- },
671
- "filename": {
672
- "type": "string",
673
- "description": "Filename with extension, e.g. 'fix-ab12.diff', 'sentinel-report.txt', 'errors.csv', 'ssolwa.log'",
674
- },
675
- "title": {
676
- "type": "string",
677
- "description": "Optional display title shown above the file in Slack (defaults to filename)",
678
- },
679
- },
680
- "required": ["content", "filename"],
681
- },
682
- },
683
- {
684
- "name": "list_all_users",
685
- "description": (
686
- "ADMIN ONLY. List all Slack users who have ever talked to Sentinel, "
687
- "with their issue count and conversation message count. "
688
- "e.g. 'list all users', 'who has talked to you?', 'show user activity'"
689
- ),
690
- "input_schema": {"type": "object", "properties": {}},
691
- },
692
- {
693
- "name": "clear_user_history",
694
- "description": (
695
- "ADMIN ONLY. Clear the conversation history for a specific Slack user. "
696
- "e.g. 'clear history for huy', 'reset bob's conversation'"
697
- ),
698
- "input_schema": {
699
- "type": "object",
700
- "properties": {
701
- "user_id": {
702
- "type": "string",
703
- "description": "Slack user ID to clear (e.g. U01AB2CD3EF)",
704
- },
705
- },
706
- "required": ["user_id"],
707
- },
708
- },
709
- {
710
- "name": "reset_fingerprint",
711
- "description": (
712
- "ADMIN ONLY. Remove the 24h fix lock for an error fingerprint so Sentinel will retry it "
713
- "on the next poll cycle. Use when a fix attempt failed and you want to force a retry. "
714
- "e.g. 'retry fix abc123', 'reset fingerprint abc123de', 'let Sentinel try that error again'"
715
- ),
716
- "input_schema": {
717
- "type": "object",
718
- "properties": {
719
- "fingerprint": {
720
- "type": "string",
721
- "description": "Error fingerprint hash (8+ hex chars, from get_fix_details or list_all_errors)",
722
- },
723
- },
724
- "required": ["fingerprint"],
725
- },
726
- },
727
- {
728
- "name": "list_all_errors",
729
- "description": (
730
- "ADMIN ONLY. Return the full unfiltered error database — all fingerprints, counts, "
731
- "sources, and last-seen times. "
732
- "e.g. 'show all errors', 'full error list', 'dump the error DB'"
733
- ),
734
- "input_schema": {
735
- "type": "object",
736
- "properties": {
737
- "hours": {
738
- "type": "integer",
739
- "description": "Limit to errors seen in the last N hours (0 = all time)",
740
- "default": 0,
741
- },
742
- },
743
- },
744
- },
745
- {
746
- "name": "export_db",
747
- "description": (
748
- "ADMIN ONLY. Export the full Sentinel state (errors, fixes, PRs, users) as a "
749
- "downloadable text file posted to Slack. "
750
- "e.g. 'export the DB', 'download state', 'give me a full report file'"
751
- ),
752
- "input_schema": {"type": "object", "properties": {}},
753
- },
754
- ]
755
-
756
-
757
- # ── Workspace helpers ─────────────────────────────────────────────────────────
758
-
759
- def _workspace_dir() -> Path:
760
- return Path(".").resolve().parent
761
-
762
- def _short_name(dir_name: str) -> str:
763
- """'sentinel-1881' → '1881', 'sentinel-elprint' → 'elprint', others unchanged."""
764
- if dir_name.startswith("sentinel-"):
765
- return dir_name[len("sentinel-"):]
766
- return dir_name
767
-
768
- def _read_project_name(project_dir: Path) -> str:
769
- """Return PROJECT_NAME from sentinel.properties if set, else fall back to _short_name(dir)."""
770
- props = project_dir / "config" / "sentinel.properties"
771
- if props.exists():
772
- try:
773
- for line in props.read_text(encoding="utf-8", errors="ignore").splitlines():
774
- line = line.strip()
775
- if line.startswith("PROJECT_NAME"):
776
- _, _, val = line.partition("=")
777
- val = val.partition("#")[0].strip()
778
- if val:
779
- return val
780
- except Exception:
781
- pass
782
- return _short_name(project_dir.name)
783
-
784
- def _find_project_dirs(target: str = "") -> list[Path]:
785
- """Return project dirs matching target (PROJECT_NAME, short name, or full dir name), or all if target empty."""
786
- workspace = _workspace_dir()
787
- results = []
788
- try:
789
- for d in sorted(workspace.iterdir()):
790
- if not d.is_dir() or d.name in ("code", ".git"):
791
- continue
792
- if not (d / "config").exists():
793
- continue
794
- if target:
795
- t = target.lower()
796
- if (t not in d.name.lower()
797
- and t not in _short_name(d.name).lower()
798
- and t not in _read_project_name(d).lower()):
799
- continue
800
- results.append(d)
801
- except Exception:
802
- pass
803
- return results
804
-
805
- def _git_pull(path: Path) -> dict:
806
- try:
807
- r = subprocess.run(
808
- ["git", "pull", "--rebase", "origin"],
809
- cwd=str(path), capture_output=True, text=True, timeout=60,
810
- )
811
- last = r.stdout.strip().splitlines()[-1] if r.stdout.strip() else "already up to date"
812
- return {"status": "ok" if r.returncode == 0 else "error",
813
- "detail": last if r.returncode == 0 else r.stderr.strip()}
814
- except Exception as e:
815
- return {"status": "error", "detail": str(e)}
816
-
817
-
818
- # ── Log-source name resolver ──────────────────────────────────────────────────
819
-
820
- def _filter_log_sources(props_files: list, source_hint: str) -> list:
821
- """
822
- Return the subset of props_files whose log source matches source_hint.
823
-
824
- Matching is tried in order (first match wins per file):
825
- 1. Substring of the filename stem (e.g. "sts" → STS.properties)
826
- 2. Substring of REMOTE_SERVICE_USER (e.g. "ssolwa" → ...SSOLoginWebApp...)
827
- 3. Substring of HOSTS (e.g. hostname fragment)
828
-
829
- Case-insensitive throughout. An empty source_hint returns all files unchanged.
830
- """
831
- if not source_hint:
832
- return props_files
833
- hint = source_hint.lower()
834
-
835
- def _props_contains(path: Path, key: str, hint: str) -> bool:
836
- try:
837
- for line in path.read_text(encoding="utf-8", errors="replace").splitlines():
838
- stripped = line.strip()
839
- if stripped.startswith("#"):
840
- continue
841
- if stripped.upper().startswith(key + "="):
842
- val = stripped.split("=", 1)[1].partition("#")[0].strip().lower()
843
- if hint in val:
844
- return True
845
- except OSError:
846
- pass
847
- return False
848
-
849
- matched = []
850
- for p in props_files:
851
- if hint in p.stem.lower():
852
- matched.append(p)
853
- elif _props_contains(p, "REMOTE_SERVICE_USER", hint):
854
- matched.append(p)
855
- elif _props_contains(p, "HOSTS", hint):
856
- matched.append(p)
857
- return matched
858
-
859
-
860
- # ── Tool execution ────────────────────────────────────────────────────────────
861
-
862
- async def _run_tool(name: str, inputs: dict, cfg_loader, store, slack_client=None, user_id: str = "", channel: str = "", is_admin: bool = False) -> str:
863
- if name == "get_status":
864
- hours = int(inputs.get("hours", 24))
865
- errors = store.get_recent_errors(hours)
866
- fixes = store.get_recent_fixes(hours)
867
- prs = store.get_open_prs()
868
- top_errors = [
869
- {
870
- "message": e["message"][:120],
871
- "count": e["count"],
872
- "source": e["source"],
873
- "last_seen": e["last_seen"],
874
- }
875
- for e in errors[:8]
876
- ]
877
- return json.dumps({
878
- "window_hours": hours,
879
- "errors_detected": len(errors),
880
- "top_errors": top_errors,
881
- "fixes_applied": sum(1 for f in fixes if f["status"] == "applied"),
882
- "fixes_pending": sum(1 for f in fixes if f["status"] == "pending"),
883
- "fixes_failed": sum(1 for f in fixes if f["status"] == "failed"),
884
- "open_prs": [
885
- {
886
- "repo": p["repo_name"],
887
- "branch": p["branch"],
888
- "pr_url": p["pr_url"],
889
- "age": p.get("timestamp", ""),
890
- }
891
- for p in prs
892
- ],
893
- "sentinel_paused": Path("SENTINEL_PAUSE").exists(),
894
- })
895
-
896
- if name == "check_auth_status":
897
- import subprocess as _sp
898
- from .notify import get_circuit_status
899
- cfg = cfg_loader.sentinel
900
-
901
- # Auth configuration
902
- has_key = bool(cfg.anthropic_api_key)
903
- pro_for_tasks = cfg.claude_pro_for_tasks
904
- if pro_for_tasks and has_key:
905
- primary, fallback = "claude_pro_oauth", "api_key"
906
- elif pro_for_tasks:
907
- primary, fallback = "claude_pro_oauth", None
908
- else:
909
- primary, fallback = "api_key", "claude_pro_oauth" if not has_key else "claude_pro_oauth"
910
-
911
- # Claude CLI liveness check
912
- cli_ok, cli_version = False, ""
913
- try:
914
- r = _sp.run(
915
- [cfg.claude_code_bin, "--version"],
916
- capture_output=True, text=True, timeout=10,
917
- )
918
- if r.returncode == 0:
919
- cli_ok = True
920
- cli_version = r.stdout.strip() or r.stderr.strip()
921
- except Exception:
922
- pass
923
-
924
- # Circuit breaker snapshot — only open (unhealthy) circuits appear here
925
- circuits = get_circuit_status()
926
-
927
- # Fix engine stats (last 24 h)
928
- recent = store.get_recent_fixes(hours=24)
929
- counts = {"applied": 0, "failed": 0, "skipped": 0, "pending": 0}
930
- last_success = None
931
- for f in recent:
932
- s = f.get("status", "")
933
- if s in counts:
934
- counts[s] += 1
935
- if s == "applied" and not last_success:
936
- last_success = f.get("timestamp", "")
937
-
938
- overall = "healthy"
939
- if circuits:
940
- overall = "degraded rate/auth limit active on: " + ", ".join(circuits)
941
- elif not cli_ok:
942
- overall = "warning claude CLI not reachable"
943
-
944
- return json.dumps({
945
- "overall": overall,
946
- "auth": {
947
- "api_key_configured": has_key,
948
- "claude_pro_for_tasks": pro_for_tasks,
949
- "primary_method": primary,
950
- "fallback_method": fallback,
951
- },
952
- "claude_cli": {"available": cli_ok, "version": cli_version},
953
- "rate_limit_circuits": circuits,
954
- "fix_engine_24h": {**counts, "last_successful_fix": last_success},
955
- })
956
-
957
- if name == "create_issue":
958
- description = inputs["description"]
959
- target_repo = inputs.get("target_repo", "")
960
- project_arg = inputs.get("project", "")
961
-
962
- if project_arg:
963
- project_dirs = _find_project_dirs(project_arg)
964
- if not project_dirs:
965
- all_names = [_read_project_name(d) for d in _find_project_dirs()]
966
- return json.dumps({
967
- "error": f"No project found matching '{project_arg}'",
968
- "available_projects": all_names,
969
- "action_needed": "Ask the user which project they meant.",
970
- })
971
- if len(project_dirs) > 1:
972
- matches = [_read_project_name(d) for d in project_dirs]
973
- return json.dumps({
974
- "error": f"Ambiguous project name '{project_arg}' — matches: {matches}",
975
- "action_needed": "Ask the user to clarify which project they mean.",
976
- })
977
- project_dir = project_dirs[0]
978
- else:
979
- project_dir = Path(".")
980
-
981
- support_url = inputs.get("support_url", "").strip()
982
- attachments_summary = inputs.get("attachments_summary", "").strip()
983
- findings = inputs.get("findings", "").strip()
984
-
985
- issues_dir = project_dir / "issues"
986
- issues_dir.mkdir(exist_ok=True)
987
- fname = f"slack-{uuid.uuid4().hex[:8]}.txt"
988
-
989
- submitter_name = store.get_user_name(user_id) if user_id else ""
990
- submitter_line = f"SUBMITTED_BY: {submitter_name} ({user_id})" if user_id else ""
991
- lines = []
992
- if submitter_line:
993
- lines.append(submitter_line)
994
- if target_repo:
995
- lines.append(f"TARGET_REPO: {target_repo}")
996
- if support_url:
997
- lines.append(f"SUPPORT_URL: {support_url}")
998
- lines.append(f"SUBMITTED_AT: {datetime.now(timezone.utc).isoformat()}")
999
- lines.append("")
1000
- lines.append(description)
1001
- if findings:
1002
- lines.append(f"\nEVIDENCE (gathered by Sentinel Boss):\n{findings}")
1003
- if attachments_summary:
1004
- lines.append(f"\nATTACHMENTS:\n{attachments_summary}")
1005
- content = "\n".join(lines)
1006
- (issues_dir / fname).write_text(content, encoding="utf-8")
1007
-
1008
- # Touch SENTINEL_POLL_NOW so the target instance picks it up immediately
1009
- (project_dir / "SENTINEL_POLL_NOW").touch()
1010
-
1011
- project_label = _read_project_name(project_dir.resolve()) if project_arg else "this project"
1012
- logger.info("Boss created issue for %s: %s", project_label, fname)
1013
- if user_id:
1014
- try:
1015
- store.record_submitted_issue(
1016
- user_id=user_id,
1017
- user_name=submitter_name,
1018
- project=project_label,
1019
- fname=fname,
1020
- description=description,
1021
- )
1022
- except Exception as _rec_err:
1023
- logger.debug("Boss: could not record submitted issue: %s", _rec_err)
1024
- return json.dumps({
1025
- "status": "queued",
1026
- "project": project_label,
1027
- "file": fname,
1028
- "note": f"Delivered to '{project_label}'. Sentinel will process it on the next poll cycle.",
1029
- })
1030
-
1031
- if name == "get_fix_details":
1032
- fp = inputs["fingerprint"]
1033
- fix = store.get_confirmed_fix(fp) or store.get_marker_seen_fix(fp)
1034
- if not fix:
1035
- # Fallback: search recent fixes by prefix
1036
- recent = store.get_recent_fixes(hours=72)
1037
- fix = next((f for f in recent if f.get("fingerprint", "").startswith(fp)), None)
1038
- return json.dumps(fix or {"error": "not found"})
1039
-
1040
- if name == "list_pending_prs":
1041
- prs = store.get_open_prs()
1042
- return json.dumps({
1043
- "count": len(prs),
1044
- "open_prs": [
1045
- {
1046
- "repo": p["repo_name"],
1047
- "branch": p["branch"],
1048
- "pr_url": p["pr_url"],
1049
- "timestamp": p.get("timestamp", ""),
1050
- }
1051
- for p in prs
1052
- ],
1053
- })
1054
-
1055
- if name == "pause_sentinel":
1056
- Path("SENTINEL_PAUSE").touch()
1057
- logger.info("Boss: SENTINEL_PAUSE created")
1058
- return json.dumps({"status": "paused"})
1059
-
1060
- if name == "resume_sentinel":
1061
- p = Path("SENTINEL_PAUSE")
1062
- if p.exists():
1063
- p.unlink()
1064
- logger.info("Boss: SENTINEL_PAUSE removed")
1065
- return json.dumps({"status": "resumed"})
1066
-
1067
- if name == "list_projects":
1068
- projects = []
1069
- for d in _find_project_dirs():
1070
- repo_cfg_dir = d / "config" / "repo-configs"
1071
- repos_in_project = []
1072
- if repo_cfg_dir.exists():
1073
- for p in sorted(repo_cfg_dir.glob("*.properties")):
1074
- if p.name.startswith("_"):
1075
- continue
1076
- repo_url = ""
1077
- for line in p.read_text(encoding="utf-8", errors="ignore").splitlines():
1078
- if line.startswith("REPO_URL"):
1079
- repo_url = line.split("=", 1)[-1].strip()
1080
- break
1081
- repos_in_project.append({"repo": p.stem, "url": repo_url})
1082
- projects.append({
1083
- "project": _read_project_name(d),
1084
- "dir": d.name,
1085
- "running": (d / "sentinel.pid").exists(),
1086
- "this": d.resolve() == Path(".").resolve(),
1087
- "repos": repos_in_project,
1088
- })
1089
- return json.dumps({"projects": projects})
1090
-
1091
- if name == "search_logs":
1092
- query = inputs.get("query", "")
1093
- source = inputs.get("source", "").lower()
1094
- max_matches = int(inputs.get("max_matches", 30))
1095
- tail_override = inputs.get("tail")
1096
-
1097
- # ── Live fetch path: SSH to servers and grep in real time ──────────────
1098
- script = Path(__file__).resolve().parent.parent / "scripts" / "fetch_log.sh"
1099
- log_cfg_dir = Path("config") / "log-configs"
1100
- if script.exists() and log_cfg_dir.exists():
1101
- props_files = _filter_log_sources(sorted(log_cfg_dir.glob("*.properties")), source)
1102
- if props_files:
1103
- live_results = []
1104
- for props in props_files:
1105
- env = os.environ.copy()
1106
- env["GREP_FILTER"] = query
1107
- if tail_override:
1108
- env["TAIL"] = str(tail_override)
1109
- try:
1110
- r = subprocess.run(
1111
- ["bash", str(script), str(props)],
1112
- capture_output=True, text=True, timeout=60, env=env,
1113
- )
1114
- lines = (r.stdout or "").strip().splitlines()
1115
- matches = [ln[:300] for ln in lines if ln.strip()][:max_matches]
1116
- if matches:
1117
- live_results.append({"source": props.stem, "matches": matches})
1118
- logger.info("Boss search_logs live %s rc=%d found=%d", props.stem, r.returncode, len(matches))
1119
- except subprocess.TimeoutExpired:
1120
- live_results.append({"source": props.stem, "error": "timed out"})
1121
- except Exception as e:
1122
- live_results.append({"source": props.stem, "error": str(e)})
1123
- total = sum(len(r.get("matches", [])) for r in live_results)
1124
- return json.dumps({
1125
- "query": query,
1126
- "mode": "live",
1127
- "total_matches": total,
1128
- "results": live_results,
1129
- "note": (
1130
- "Results already include a per-source breakdown. "
1131
- "Do NOT call search_logs again with a source filter to 'refine' — "
1132
- "use these results directly."
1133
- ) if total > 0 else None,
1134
- })
1135
-
1136
- # ── Fallback: search locally-cached log files ──────────────────────────
1137
- # Reaching here means: live script unavailable OR source filter matched no config files.
1138
- # A result with files_searched=0 means the source name wasn't recognised — NOT that
1139
- # there are no log entries. Do not interpret this as "no results found".
1140
- fetched_dir = Path("workspace/fetched")
1141
- if not fetched_dir.exists():
1142
- return json.dumps({
1143
- "error": "No fetched logs found and fetch_log.sh unavailable",
1144
- "note": "This is a config/setup problem, not a 'no results' answer.",
1145
- })
1146
- try:
1147
- pattern = re.compile(query, re.IGNORECASE)
1148
- except re.error as e:
1149
- return json.dumps({"error": f"Invalid regex: {e}"})
1150
- results = []
1151
- for log_file in sorted(fetched_dir.glob("*.log")):
1152
- if source and source not in log_file.name.lower():
1153
- continue
1154
- try:
1155
- lines = log_file.read_text(encoding="utf-8", errors="ignore").splitlines()
1156
- matches = [
1157
- {"line": i + 1, "text": line[:300]}
1158
- for i, line in enumerate(lines)
1159
- if pattern.search(line)
1160
- ][:max_matches]
1161
- if matches:
1162
- results.append({"file": log_file.name, "matches": matches})
1163
- except Exception:
1164
- pass
1165
- total = sum(len(r["matches"]) for r in results)
1166
- files_searched = len(list(fetched_dir.glob("*.log")))
1167
- result = {
1168
- "query": query,
1169
- "mode": "cached",
1170
- "total_matches": total,
1171
- "files_searched": files_searched,
1172
- "results": results,
1173
- }
1174
- if files_searched == 0:
1175
- result["warning"] = (
1176
- "Source name not recognised in cached files — this is a lookup failure, not 'no results'. "
1177
- "If you already have results from a broader search_logs call, use those. Stop retrying."
1178
- )
1179
- return json.dumps(result)
1180
-
1181
- if name == "trigger_poll":
1182
- Path("SENTINEL_POLL_NOW").touch()
1183
- logger.info("Boss: immediate poll requested")
1184
- return json.dumps({"status": "triggered", "note": "Sentinel will run a poll cycle within seconds"})
1185
-
1186
- if name == "get_repo_status":
1187
- hours = int(inputs.get("hours", 24))
1188
- fixes = store.get_recent_fixes(hours)
1189
- errors = store.get_recent_errors(hours)
1190
- by_repo: dict = {}
1191
- for fix in fixes:
1192
- repo = fix.get("repo_name", "unknown")
1193
- s = by_repo.setdefault(repo, {"applied": 0, "pending": 0, "failed": 0, "skipped": 0})
1194
- key = fix.get("status", "failed")
1195
- s[key] = s.get(key, 0) + 1
1196
- return json.dumps({"window_hours": hours, "total_errors": len(errors), "by_repo": by_repo})
1197
-
1198
- if name == "list_recent_commits":
1199
- limit = int(inputs.get("limit", 5))
1200
- results = []
1201
- for repo_name, repo in cfg_loader.repos.items():
1202
- local = Path(repo.local_path)
1203
- if not local.exists():
1204
- continue
1205
- try:
1206
- r = subprocess.run(
1207
- ["git", "log", "--oneline", "--grep=sentinel", "-n", str(limit)],
1208
- cwd=str(local), capture_output=True, text=True, timeout=10,
1209
- )
1210
- commits = r.stdout.strip().splitlines()
1211
- if commits:
1212
- results.append({"repo": repo_name, "commits": commits})
1213
- except Exception:
1214
- pass
1215
- return json.dumps({"sentinel_commits": results})
1216
-
1217
- if name == "pull_repo":
1218
- target = inputs.get("repo", "").lower()
1219
- results = []
1220
- for repo_name, repo in cfg_loader.repos.items():
1221
- if target and target not in repo_name.lower():
1222
- continue
1223
- local = Path(repo.local_path)
1224
- if not local.exists():
1225
- results.append({"repo": repo_name, "status": "error", "detail": "local path not found"})
1226
- continue
1227
- try:
1228
- r = subprocess.run(
1229
- ["git", "pull", "--rebase", "origin", repo.branch],
1230
- cwd=str(local), capture_output=True, text=True, timeout=60,
1231
- )
1232
- last_line = r.stdout.strip().splitlines()[-1] if r.stdout.strip() else "already up to date"
1233
- if r.returncode == 0:
1234
- results.append({"repo": repo_name, "status": "ok", "detail": last_line})
1235
- else:
1236
- results.append({"repo": repo_name, "status": "error", "detail": r.stderr.strip()})
1237
- except Exception as e:
1238
- results.append({"repo": repo_name, "status": "error", "detail": str(e)})
1239
- return json.dumps({"results": results})
1240
-
1241
- if name == "pull_config":
1242
- target = inputs.get("project", "")
1243
- dirs = _find_project_dirs(target)
1244
- if not dirs:
1245
- return json.dumps({"error": f"No project found matching '{target}'"})
1246
- results = []
1247
- for d in dirs:
1248
- res = _git_pull(d)
1249
- results.append({"project": _read_project_name(d), "dir": d.name, **res})
1250
- logger.info("Boss: pull_config %s %s", d.name, res["status"])
1251
- return json.dumps({"results": results})
1252
-
1253
- if name == "fetch_logs":
1254
- source_filter = inputs.get("source", "").lower()
1255
- debug = bool(inputs.get("debug", False))
1256
- tail_override = inputs.get("tail")
1257
- grep_override = inputs.get("grep_filter", "")
1258
-
1259
- # Find fetch_log.sh relative to this file
1260
- script = Path(__file__).resolve().parent.parent / "scripts" / "fetch_log.sh"
1261
- if not script.exists():
1262
- return json.dumps({"error": f"fetch_log.sh not found at {script}"})
1263
-
1264
- log_cfg_dir = Path("config") / "log-configs"
1265
- if not log_cfg_dir.exists():
1266
- return json.dumps({"error": "config/log-configs/ not found"})
1267
-
1268
- props_files = _filter_log_sources(sorted(log_cfg_dir.glob("*.properties")), source_filter)
1269
- if not props_files:
1270
- return json.dumps({"error": f"No log-config found matching '{source_filter}'"})
1271
-
1272
- results = []
1273
- for props in props_files:
1274
- env = os.environ.copy()
1275
- if tail_override:
1276
- env["TAIL"] = str(tail_override)
1277
- if grep_override:
1278
- env["GREP_FILTER"] = grep_override
1279
-
1280
- cmd = ["bash", str(script)]
1281
- if debug:
1282
- cmd.append("--debug")
1283
- cmd.append(str(props))
1284
-
1285
- try:
1286
- r = subprocess.run(
1287
- cmd, capture_output=True, text=True, timeout=120, env=env,
1288
- )
1289
- output = (r.stdout or "").strip()
1290
- stderr = (r.stderr or "").strip()
1291
- results.append({
1292
- "source": props.stem,
1293
- "returncode": r.returncode,
1294
- "output": output[-2000:] if output else "",
1295
- "stderr": stderr[-1000:] if stderr else "",
1296
- })
1297
- logger.info("Boss fetch_logs %s rc=%d", props.stem, r.returncode)
1298
- except subprocess.TimeoutExpired:
1299
- results.append({"source": props.stem, "error": "timed out after 120s"})
1300
- except Exception as e:
1301
- results.append({"source": props.stem, "error": str(e)})
1302
-
1303
- return json.dumps({"fetched": len(results), "results": results})
1304
-
1305
- if name == "watch_bot":
1306
- if not is_admin:
1307
- return json.dumps({"error": "Admin access required to register bots for monitoring."})
1308
- user_ids = inputs.get("user_ids", [])
1309
- project_arg = inputs.get("project", "").strip()
1310
- if not user_ids:
1311
- return json.dumps({"error": "No user_ids provided"})
1312
-
1313
- # Resolve + validate project — required for bot issue routing
1314
- resolved_project = ""
1315
- if project_arg:
1316
- project_dirs = _find_project_dirs(project_arg)
1317
- if not project_dirs:
1318
- all_names = [_read_project_name(d) for d in _find_project_dirs()]
1319
- return json.dumps({
1320
- "error": f"No project found matching '{project_arg}'",
1321
- "available_projects": all_names,
1322
- "action_needed": "Ask the user which project these bot alerts belong to.",
1323
- })
1324
- if len(project_dirs) > 1:
1325
- matches = [_read_project_name(d) for d in project_dirs]
1326
- return json.dumps({
1327
- "error": f"Ambiguous project name '{project_arg}' — matches: {matches}",
1328
- "action_needed": "Ask the user to clarify which project.",
1329
- })
1330
- resolved_project = _read_project_name(project_dirs[0])
1331
- else:
1332
- all_projects = _find_project_dirs()
1333
- if len(all_projects) == 1:
1334
- # Single project in workspace — auto-assign
1335
- resolved_project = _read_project_name(all_projects[0])
1336
- elif all_projects:
1337
- all_names = [_read_project_name(d) for d in all_projects]
1338
- return json.dumps({
1339
- "error": "Cannot determine which project these bot alerts belong to.",
1340
- "available_projects": all_names,
1341
- "action_needed": "Ask the user to specify the project, then retry with project filled in.",
1342
- })
1343
-
1344
- results = []
1345
- for uid in user_ids:
1346
- if not slack_client:
1347
- results.append({"user_id": uid, "status": "error", "reason": "no Slack client available"})
1348
- continue
1349
- try:
1350
- info = await slack_client.users_info(user=uid)
1351
- user = info.get("user", {})
1352
- if not user.get("is_bot", False):
1353
- results.append({"user_id": uid, "status": "skipped", "reason": "not a bot — only bots can be watched passively"})
1354
- continue
1355
- bot_name = user.get("real_name") or user.get("name") or uid
1356
- store.add_watched_bot(uid, bot_name, added_by="boss", project_name=resolved_project)
1357
- logger.info("Boss: now watching bot %s (%s) → project '%s'", bot_name, uid, resolved_project or "unset")
1358
- results.append({"user_id": uid, "bot_name": bot_name, "project": resolved_project, "status": "watching"})
1359
- except Exception as e:
1360
- results.append({"user_id": uid, "status": "error", "reason": str(e)})
1361
- return json.dumps({"results": results})
1362
-
1363
- if name == "unwatch_bot":
1364
- if not is_admin:
1365
- return json.dumps({"error": "Admin access required to remove bots from monitoring."})
1366
- user_ids = inputs.get("user_ids", [])
1367
- if not user_ids:
1368
- return json.dumps({"error": "No user_ids provided"})
1369
- results = []
1370
- for uid in user_ids:
1371
- removed = store.remove_watched_bot(uid)
1372
- logger.info("Boss: unwatch bot %s → %s", uid, "removed" if removed else "not found")
1373
- results.append({"user_id": uid, "status": "removed" if removed else "not found"})
1374
- return json.dumps({"results": results})
1375
-
1376
- if name == "list_watched_bots":
1377
- bots = store.get_watched_bots()
1378
- return json.dumps({
1379
- "count": len(bots),
1380
- "bots": [
1381
- {
1382
- "bot_id": b["bot_id"],
1383
- "bot_name": b["bot_name"],
1384
- "project": b.get("project_name") or "",
1385
- "added_by": b["added_by"],
1386
- "added_at": b["added_at"],
1387
- }
1388
- for b in bots
1389
- ],
1390
- })
1391
-
1392
- if name == "upgrade_sentinel":
1393
- if not is_admin:
1394
- return json.dumps({"error": "Admin access required to upgrade Sentinel."})
1395
- import threading
1396
-
1397
- # Sentinel is installed via npm — use `sentinel upgrade` which handles
1398
- # npm install + Python bundle copy + restart via stopAll/startAll.
1399
- # Run it in the background after a short delay so the Slack reply is
1400
- # sent before the process is replaced.
1401
- try:
1402
- r = subprocess.run(
1403
- ["sentinel", "--version"],
1404
- capture_output=True, text=True, timeout=10,
1405
- )
1406
- sentinel_bin_ok = r.returncode == 0
1407
- except Exception:
1408
- sentinel_bin_ok = False
1409
-
1410
- if not sentinel_bin_ok:
1411
- return json.dumps({
1412
- "status": "error",
1413
- "note": "`sentinel` CLI not found. Run: npm install -g @misterhuydo/sentinel",
1414
- })
1415
-
1416
- def _do_upgrade():
1417
- import time
1418
- time.sleep(10) # give Slack time to post the reply
1419
- subprocess.Popen(["sentinel", "upgrade"], close_fds=True)
1420
-
1421
- threading.Thread(target=_do_upgrade, daemon=True).start()
1422
- logger.info("Boss: upgrade_sentinel scheduled via `sentinel upgrade`")
1423
- return json.dumps({
1424
- "status": "ok",
1425
- "note": "Upgrade started — pulling latest version via npm and restarting. Give me ~30 seconds then I'll be back.",
1426
- })
1427
-
1428
- if name == "ask_codebase":
1429
- target = inputs.get("repo", "").lower()
1430
- question = inputs.get("question", "")
1431
-
1432
- # 1. Find repos whose name contains the target (e.g. "STS", "elprint-sales")
1433
- matched = [(rn, r) for rn, r in cfg_loader.repos.items() if target in rn.lower()]
1434
-
1435
- # 2. No repo match — check if target is a project name → use ALL repos in cfg_loader
1436
- # (each Sentinel instance is scoped to one project, so all repos belong to it)
1437
- if not matched:
1438
- current_project = _read_project_name(Path("."))
1439
- if target in current_project.lower() or current_project.lower() in target:
1440
- matched = list(cfg_loader.repos.items())
1441
-
1442
- if not matched:
1443
- return json.dumps({
1444
- "error": f"No repo or project found matching '{target}'",
1445
- "available_repos": list(cfg_loader.repos.keys()),
1446
- })
1447
-
1448
- cfg = cfg_loader.sentinel
1449
- env = os.environ.copy()
1450
- # Only inject API key when Claude Pro is NOT preferred for heavy tasks
1451
- if cfg.anthropic_api_key and not cfg.claude_pro_for_tasks:
1452
- env["ANTHROPIC_API_KEY"] = cfg.anthropic_api_key
1453
-
1454
- def _ask_one(repo_name, repo_cfg) -> dict:
1455
- local_path = Path(repo_cfg.local_path)
1456
- if not local_path.exists():
1457
- return {"repo": repo_name, "error": f"not cloned yet at {local_path}"}
1458
- prompt = (
1459
- f"You are a code analyst. Answer the following question about the codebase at: {local_path}\n\n"
1460
- f"Question: {question}\n\n"
1461
- f"Use whatever tools you need to answer accurately. Be concise and direct. Plain text only."
1462
- )
1463
- try:
1464
- r = subprocess.run(
1465
- ([cfg.claude_code_bin, "--dangerously-skip-permissions", "--print", prompt]
1466
- if os.getuid() != 0 else
1467
- [cfg.claude_code_bin, "--print", prompt]),
1468
- capture_output=True, text=True, timeout=180, env=env,
1469
- cwd=str(local_path),
1470
- )
1471
- output = (r.stdout or "").strip()
1472
- logger.info("Boss ask_codebase %s rc=%d len=%d", repo_name, r.returncode, len(output))
1473
- if r.returncode != 0 and not output:
1474
- raw_err = (r.stderr or "")
1475
- alert_if_rate_limited(
1476
- cfg.slack_bot_token, cfg.slack_channel,
1477
- f"ask_codebase/{repo_name}", raw_err,
1478
- )
1479
- return {"repo": repo_name, "error": f"claude --print failed (rc={r.returncode}): {raw_err[:200]}"}
1480
- return {"repo": repo_name, "answer": output[:3000]}
1481
- except subprocess.TimeoutExpired:
1482
- return {"repo": repo_name, "error": "timed out after 180s"}
1483
- except Exception as e:
1484
- return {"repo": repo_name, "error": str(e)}
1485
-
1486
- if len(matched) == 1:
1487
- result = _ask_one(*matched[0])
1488
- # Unwrap single-repo result for cleaner response
1489
- return json.dumps(result)
1490
-
1491
- # Multiple repos query each and combine
1492
- results = [_ask_one(rn, r) for rn, r in matched]
1493
- return json.dumps({"project": target, "repos_queried": len(results), "results": results})
1494
-
1495
- if name == "restart_project":
1496
- if not is_admin:
1497
- return json.dumps({"error": "Admin access required to restart a project."})
1498
- project_arg = inputs.get("project", "").lower()
1499
- dirs = _find_project_dirs(project_arg)
1500
- if not dirs:
1501
- return json.dumps({"error": f"No project found matching '{project_arg}'"})
1502
- results = []
1503
- for d in dirs:
1504
- stop_sh = d / "stop.sh"
1505
- start_sh = d / "start.sh"
1506
- if not stop_sh.exists() or not start_sh.exists():
1507
- results.append({"project": d.name, "status": "error", "detail": "stop.sh or start.sh not found"})
1508
- continue
1509
- try:
1510
- subprocess.run(["bash", str(stop_sh)], cwd=str(d), timeout=30)
1511
- subprocess.run(["bash", str(start_sh)], cwd=str(d), timeout=30)
1512
- results.append({"project": d.name, "status": "restarted"})
1513
- logger.info("Boss: restarted project %s", d.name)
1514
- except Exception as e:
1515
- results.append({"project": d.name, "status": "error", "detail": str(e)})
1516
- return json.dumps({"results": results})
1517
-
1518
- if name == "tail_log":
1519
- source = inputs.get("source", "").lower()
1520
- lines = int(inputs.get("lines", 100))
1521
- script = Path(__file__).resolve().parent.parent / "scripts" / "fetch_log.sh"
1522
- log_cfg_dir = Path("config") / "log-configs"
1523
-
1524
- if not script.exists():
1525
- return json.dumps({"error": "fetch_log.sh not found"})
1526
- if not log_cfg_dir.exists():
1527
- return json.dumps({"error": "config/log-configs/ not found"})
1528
-
1529
- props_files = sorted(log_cfg_dir.glob("*.properties"))
1530
- if source:
1531
- props_files = [p for p in props_files if source in p.stem.lower()]
1532
- if not props_files:
1533
- return json.dumps({"error": f"No log-config found matching '{source}'"})
1534
-
1535
- results = []
1536
- for props in props_files:
1537
- env = os.environ.copy()
1538
- env["TAIL"] = str(lines)
1539
- env["GREP_FILTER"] = "" # no filter — show everything
1540
- try:
1541
- r = subprocess.run(
1542
- ["bash", str(script), str(props)],
1543
- capture_output=True, text=True, timeout=60, env=env,
1544
- )
1545
- tail_lines = (r.stdout or "").strip().splitlines()[-lines:]
1546
- results.append({
1547
- "source": props.stem,
1548
- "lines": len(tail_lines),
1549
- "content": "\n".join(tail_lines),
1550
- })
1551
- logger.info("Boss tail_log %s rc=%d lines=%d", props.stem, r.returncode, len(tail_lines))
1552
- except subprocess.TimeoutExpired:
1553
- results.append({"source": props.stem, "error": "timed out"})
1554
- except Exception as e:
1555
- results.append({"source": props.stem, "error": str(e)})
1556
- return json.dumps({"results": results})
1557
-
1558
- if name == "post_file":
1559
- if not slack_client or not channel:
1560
- return json.dumps({"error": "No Slack channel context — cannot upload file"})
1561
- content = inputs.get("content", "")
1562
- filename = inputs.get("filename", "sentinel-output.txt")
1563
- title = inputs.get("title", filename)
1564
- if not content:
1565
- return json.dumps({"error": "No content provided"})
1566
- try:
1567
- await slack_client.files_upload_v2(
1568
- channel=channel,
1569
- content=content,
1570
- filename=filename,
1571
- title=title,
1572
- )
1573
- logger.info("Boss post_file: uploaded %s (%d bytes) to %s", filename, len(content), channel)
1574
- return json.dumps({"ok": True, "filename": filename, "bytes": len(content)})
1575
- except Exception as e:
1576
- logger.warning("Boss post_file failed: %s", e)
1577
- return json.dumps({"error": str(e)})
1578
-
1579
- if name == "my_stats":
1580
- hours = int(inputs.get("hours", 168))
1581
- errors = store.get_recent_errors(hours)
1582
- fixes = store.get_recent_fixes(hours)
1583
- prs = store.get_open_prs()
1584
- pending_conf = store.get_fixes_pending_confirmation()
1585
- # Conversation stats
1586
- history = store.load_conversation(user_id) if user_id else []
1587
- hist_len = len(history)
1588
- # Load conversation updated_at from DB
1589
- conv_updated = ""
1590
- try:
1591
- import sqlite3 as _sqlite3
1592
- with _sqlite3.connect(store.db_path) as _db:
1593
- row = _db.execute(
1594
- "SELECT updated_at FROM conversations WHERE user_id=?", (user_id,)
1595
- ).fetchone()
1596
- if row:
1597
- conv_updated = row[0]
1598
- except Exception:
1599
- pass
1600
- # Tally fix statuses
1601
- by_status: dict = {}
1602
- for fix in fixes:
1603
- s = fix.get("status", "unknown")
1604
- by_status[s] = by_status.get(s, 0) + 1
1605
- # Fixes confirmed via sentinel marker in prod
1606
- confirmed = [f for f in fixes if f.get("fix_outcome") == "confirmed"]
1607
- regressed = [f for f in fixes if f.get("fix_outcome") == "regressed"]
1608
- submitted = store.get_submitted_issues(user_id, hours=hours) if user_id else []
1609
- submitted_recent = store.get_submitted_issues(user_id, hours=hours) if user_id else []
1610
- return json.dumps({
1611
- "conversation": {
1612
- "messages_in_history": hist_len,
1613
- "turns": hist_len // 2,
1614
- "last_active": conv_updated or "no history",
1615
- },
1616
- "issues_you_submitted": {
1617
- "total_in_window": len(submitted_recent),
1618
- "all_time": len(store.get_submitted_issues(user_id) if user_id else []),
1619
- "recent": [
1620
- {"project": i["project"], "description": i["description"][:80],
1621
- "submitted_at": i["submitted_at"]}
1622
- for i in submitted_recent[:5]
1623
- ],
1624
- },
1625
- "window_hours": hours,
1626
- "errors_detected": len(errors),
1627
- "fixes": {
1628
- "applied": by_status.get("applied", 0),
1629
- "pending_pr": len(prs),
1630
- "failed": by_status.get("failed", 0),
1631
- "skipped": by_status.get("skipped", 0),
1632
- "error": by_status.get("error", 0),
1633
- },
1634
- "confirmed_in_prod": len(confirmed),
1635
- "regressed_after_fix": len(regressed),
1636
- "awaiting_confirmation": len(pending_conf),
1637
- "open_prs": [
1638
- {"repo": p["repo_name"], "pr_url": p["pr_url"], "timestamp": p["timestamp"]}
1639
- for p in prs
1640
- ],
1641
- "top_errors": [
1642
- {"message": e["message"][:100], "count": e["count"], "source": e["source"]}
1643
- for e in errors[:5]
1644
- ],
1645
- })
1646
- if name == "clear_my_history":
1647
- if user_id:
1648
- store.save_conversation(user_id, [])
1649
- logger.info("Boss: cleared conversation history for user %s", user_id)
1650
- return json.dumps({
1651
- "status": "cleared",
1652
- "note": "Your conversation history has been wiped. Next session starts fresh. [DONE]",
1653
- })
1654
- return json.dumps({"error": "cannot determine user — not clearing"})
1655
-
1656
- # ── Admin-only tools ──────────────────────────────────────────────────────
1657
- _ADMIN_TOOLS = {"list_all_users", "clear_user_history", "reset_fingerprint", "list_all_errors", "export_db"}
1658
- if name in _ADMIN_TOOLS:
1659
- if not is_admin:
1660
- return json.dumps({"error": "Admin access required. You are not in SLACK_ADMIN_USERS."})
1661
-
1662
- if name == "list_all_users":
1663
- stats = store.get_all_user_stats()
1664
- return json.dumps({"users": stats, "total": len(stats)})
1665
-
1666
- if name == "clear_user_history":
1667
- target = inputs.get("target_user_id", "").strip()
1668
- if not target:
1669
- return json.dumps({"error": "target_user_id is required"})
1670
- store.save_conversation(target, [])
1671
- display = store.get_user_name(target)
1672
- logger.info("Boss admin: cleared history for user %s (%s) by admin %s", target, display, user_id)
1673
- return json.dumps({"status": "cleared", "target_user_id": target, "display_name": display})
1674
-
1675
- if name == "reset_fingerprint":
1676
- fp = inputs.get("fingerprint", "").strip()
1677
- if not fp:
1678
- return json.dumps({"error": "fingerprint is required"})
1679
- found = store.reset_fingerprint(fp)
1680
- logger.info("Boss admin: reset fingerprint %s by admin %s (found=%s)", fp, user_id, found)
1681
- return json.dumps({"status": "reset" if found else "not_found", "fingerprint": fp,
1682
- "note": "Sentinel will retry this error on the next poll." if found else "No fix record found for this fingerprint."})
1683
-
1684
- if name == "list_all_errors":
1685
- hours = int(inputs.get("hours", 0))
1686
- errors = store.get_all_errors(hours)
1687
- return json.dumps({"errors": errors[:100], "total": len(errors),
1688
- "window_hours": hours or "all time"})
1689
-
1690
- if name == "export_db":
1691
- if not slack_client or not channel:
1692
- return json.dumps({"error": "No Slack channel context — cannot upload file"})
1693
- try:
1694
- import sqlite3 as _sq
1695
- import io as _io
1696
- lines = []
1697
- with _sq.connect(store.db_path) as _db:
1698
- for tbl in ["errors", "fixes", "reports", "slack_users", "conversations", "submitted_issues"]:
1699
- try:
1700
- rows = _db.execute(f"SELECT * FROM {tbl}").fetchall() # noqa: S608
1701
- cols = [d[0] for d in _db.execute(f"SELECT * FROM {tbl} LIMIT 0").description] # noqa: S608
1702
- lines.append(f"=== {tbl} ({len(rows)} rows) ===")
1703
- lines.append("\t".join(cols))
1704
- for row in rows:
1705
- lines.append("\t".join(str(v) if v is not None else "" for v in row))
1706
- lines.append("")
1707
- except Exception:
1708
- lines.append(f"=== {tbl} (unavailable) ===\n")
1709
- content = "\n".join(lines)
1710
- await slack_client.files_upload_v2(
1711
- channel=channel,
1712
- content=content,
1713
- filename="sentinel-db-export.tsv",
1714
- title="Sentinel DB Export",
1715
- )
1716
- logger.info("Boss admin: exported DB (%d bytes) by admin %s", len(content), user_id)
1717
- return json.dumps({"ok": True, "bytes": len(content)})
1718
- except Exception as e:
1719
- return json.dumps({"error": str(e)})
1720
-
1721
- return json.dumps({"error": f"unknown tool: {name}"})
1722
-
1723
-
1724
- # ── CLI fallback (OAuth / no API key) ────────────────────────────────────────
1725
-
1726
- def _attachments_to_text(attachments: list[dict]) -> str:
1727
- """Produce a plain-text summary of attachments to append to CLI prompts."""
1728
- if not attachments:
1729
- return ""
1730
- parts = []
1731
- for att in attachments:
1732
- if att["type"] == "text":
1733
- parts.append(
1734
- f"[Attached file: {att['name']}]\n{att['content']}"
1735
- )
1736
- elif att["type"] == "image":
1737
- parts.append(
1738
- f"[Attached image: {att['name']}] (saved at {att['path']})"
1739
- )
1740
- else:
1741
- parts.append(
1742
- f"[Attached file: {att['name']}] (saved at {att['path']} — read it if relevant)"
1743
- )
1744
- return "\n\nATTACHMENTS:\n" + "\n---\n".join(parts)
1745
-
1746
-
1747
- def _attachments_to_api_blocks(attachments: list[dict]) -> list[dict]:
1748
- """Convert attachments into Anthropic API message content blocks."""
1749
- blocks: list[dict] = []
1750
- for att in attachments:
1751
- if att["type"] == "image":
1752
- blocks.append({
1753
- "type": "image",
1754
- "source": {
1755
- "type": "base64",
1756
- "media_type": att.get("mime", "image/png"),
1757
- "data": att["content"],
1758
- },
1759
- })
1760
- elif att["type"] == "text":
1761
- blocks.append({
1762
- "type": "text",
1763
- "text": f"[Attached file: {att['name']}]\n{att['content']}",
1764
- })
1765
- else:
1766
- blocks.append({
1767
- "type": "text",
1768
- "text": f"[Attached file: {att['name']}] saved at {att['path']}",
1769
- })
1770
- return blocks
1771
-
1772
-
1773
- _ACTION_RE = re.compile(r"^ACTION:\s*(\{.*\})", re.MULTILINE)
1774
-
1775
-
1776
- async def _handle_with_cli(
1777
- message: str,
1778
- history: list,
1779
- cfg_loader,
1780
- store,
1781
- slack_client=None,
1782
- user_name: str = "",
1783
- user_id: str = "",
1784
- attachments: list | None = None,
1785
- is_admin: bool = False,
1786
- ) -> tuple[str, bool]:
1787
- """Fallback: use `claude --print` for users without an Anthropic API key."""
1788
- status_json = await _run_tool("get_status", {"hours": 24}, cfg_loader, store)
1789
- prs_json = await _run_tool("list_pending_prs", {}, cfg_loader, store)
1790
-
1791
- # Pre-fetch log search if the message is a search request.
1792
- # Use quoted strings as the query, or fall back to the full message.
1793
- # Never hardcode field names — the query is whatever the user said.
1794
- search_json = ""
1795
- _search_kws = ("search", "find", "look for", "show me log", "grep", "entries for")
1796
- if any(kw in message.lower() for kw in _search_kws):
1797
- quoted = re.findall(r'"([^"]+)"', message)
1798
- query = quoted[0] if quoted else message
1799
- search_json = await _run_tool("search_logs", {"query": query}, cfg_loader, store)
1800
-
1801
- paused = Path("SENTINEL_PAUSE").exists()
1802
- repos = list(cfg_loader.repos.keys())
1803
- log_sources = list(cfg_loader.log_sources.keys())
1804
- ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
1805
-
1806
- history_text = ""
1807
- for msg in history[-8:]:
1808
- role = msg["role"].upper()
1809
- content = msg["content"]
1810
- if isinstance(content, list):
1811
- content = " ".join(
1812
- (b.get("text", "") if isinstance(b, dict) else getattr(b, "text", ""))
1813
- for b in content
1814
- if (isinstance(b, dict) and b.get("type") == "text")
1815
- or (hasattr(b, "type") and b.type == "text")
1816
- )
1817
- history_text += f"\n{role}: {content}"
1818
-
1819
- slack_mention = f"<@{user_id}>" if user_id else (user_name or "")
1820
- known_users = store.get_all_users()
1821
- users_hint = ", ".join(f"<@{uid}> = {name}" for uid, name in known_users.items())
1822
- prompt = (
1823
- _SYSTEM
1824
- + (f"\nYou are speaking with: {user_name} (Slack mention: {slack_mention})" if user_name else "")
1825
- + "\nAlways start your reply by addressing the user directly using their Slack mention, e.g. \"<@U123> here is what I found...\"."
1826
- + " Never use their plain name — always use the <@USER_ID> format so Slack highlights it."
1827
- + (f"\nKnown Slack users: {users_hint}" if users_hint else "")
1828
- + f"\n\nCurrent time: {ts}"
1829
- + f"\nSentinel status: {'⏸ PAUSED' if paused else '▶ RUNNING'}"
1830
- + f"\nManaged repos: {', '.join(repos) if repos else '(none configured)'}"
1831
- + (f"\nLog sources: {', '.join(log_sources)}" if log_sources else "")
1832
- + f"\nAdmin access for this user: {'YES — admin tools are available' if is_admin else 'NO — admin tools will be refused'}"
1833
- + "\nNOTE: Running in CLI fallback mode — admin tools and some features are unavailable. Ask user to configure ANTHROPIC_API_KEY for full features."
1834
- + f"\n\nCurrent status (last 24 h):\n{status_json}"
1835
- + f"\n\nOpen PRs:\n{prs_json}"
1836
- + (f"\n\nLog search results:\n{search_json}" if search_json else "")
1837
- + (f"\n\nConversation so far:{history_text}" if history_text else "")
1838
- + _attachments_to_text(attachments or [])
1839
- + f"\n\nUSER: {message}"
1840
- + "\n\nIf you need to take an action, include a line like:\n"
1841
- + " ACTION: {\"action\": \"pause_sentinel\"}\n"
1842
- + " ACTION: {\"action\": \"resume_sentinel\"}\n"
1843
- + " ACTION: {\"action\": \"trigger_poll\"}\n"
1844
- + " ACTION: {\"action\": \"create_issue\", \"description\": \"...\", \"target_repo\": \"\"}\n"
1845
- + " ACTION: {\"action\": \"search_logs\", \"query\": \"<whatever the user asked to find>\"}\n"
1846
- + "End with [DONE] if the request is fully handled."
1847
- )
1848
-
1849
- cfg = cfg_loader.sentinel
1850
- env = os.environ.copy()
1851
- if cfg.anthropic_api_key:
1852
- env["ANTHROPIC_API_KEY"] = cfg.anthropic_api_key
1853
-
1854
- try:
1855
- result = subprocess.run(
1856
- ([cfg.claude_code_bin, "--dangerously-skip-permissions", "--print", prompt]
1857
- if os.getuid() != 0 else
1858
- [cfg.claude_code_bin, "--print", prompt]),
1859
- capture_output=True, text=True, timeout=180, env=env,
1860
- )
1861
- output = (result.stdout or "").strip()
1862
- if result.returncode != 0 or not output:
1863
- stderr = (result.stderr or "").strip()
1864
- logger.error(
1865
- "Boss CLI call failed (rc=%d): stdout=%r stderr=%r",
1866
- result.returncode, output[:200], stderr[:200],
1867
- )
1868
- raw_err = (result.stderr or "").strip()
1869
- if result.returncode != 0 and not output:
1870
- full_err = f"exit {result.returncode}: {raw_err[:300]}"
1871
- cfg = cfg_loader.sentinel
1872
- alert_if_rate_limited(cfg.slack_bot_token, cfg.slack_channel,
1873
- "sentinel_boss/cli", raw_err or full_err)
1874
- return f":warning: `claude --print` failed ({full_err})", True
1875
- except Exception as e:
1876
- logger.error("Boss CLI call failed: %s", e)
1877
- return f":warning: Boss unavailable: {e}", True
1878
-
1879
- for m in _ACTION_RE.finditer(output):
1880
- try:
1881
- action = json.loads(m.group(1))
1882
- name = action.pop("action", "")
1883
- if name:
1884
- result_str = await _run_tool(name, action, cfg_loader, store, user_id=user_id)
1885
- logger.info("Boss CLI action: %s → %s", name, result_str[:80])
1886
- except Exception as e:
1887
- logger.warning("Boss action parse error: %s", e)
1888
-
1889
- reply = _ACTION_RE.sub("", output).strip()
1890
- is_done = "[DONE]" in reply
1891
- reply = reply.replace("[DONE]", "").strip()
1892
- if not reply:
1893
- greeting = f"Hi {user_name}! " if user_name else "Hi! "
1894
- reply = f"{greeting}I'm Sentinel, your autonomous DevOps agent. How can I help you?"
1895
-
1896
- history.append({"role": "user", "content": message})
1897
- history.append({"role": "assistant", "content": reply})
1898
- return reply, is_done
1899
-
1900
-
1901
- # ── History serialization helpers ────────────────────────────────────────────
1902
-
1903
- def _serialize_content(content) -> list:
1904
- """Convert Anthropic SDK response content (Pydantic objects) to plain dicts.
1905
-
1906
- The SDK returns TextBlock / ToolUseBlock instances. json.dumps(..., default=str)
1907
- turns them into useless strings like "TextBlock(type='text', text='...')".
1908
- This converts them to proper dicts so history round-trips through SQLite safely.
1909
- """
1910
- if not isinstance(content, list):
1911
- return content
1912
- result = []
1913
- for block in content:
1914
- if isinstance(block, dict):
1915
- result.append(block)
1916
- elif hasattr(block, "model_dump"):
1917
- result.append(block.model_dump())
1918
- elif hasattr(block, "dict"):
1919
- result.append(block.dict())
1920
- elif hasattr(block, "type"):
1921
- if block.type == "text":
1922
- result.append({"type": "text", "text": getattr(block, "text", "")})
1923
- elif block.type == "tool_use":
1924
- result.append({
1925
- "type": "tool_use",
1926
- "id": getattr(block, "id", ""),
1927
- "name": getattr(block, "name", ""),
1928
- "input": getattr(block, "input", {}),
1929
- })
1930
- else:
1931
- result.append({"type": "text", "text": str(block)})
1932
- return result
1933
-
1934
-
1935
- def _clean_history(history: list) -> list:
1936
- """Remove turns that would cause a 400 from the Anthropic API.
1937
-
1938
- Strips orphaned tool_use blocks (assistant turn with tool_use but no
1939
- following tool_result turn) and consecutive same-role turns that result
1940
- from a previous session that crashed mid-tool-loop.
1941
- """
1942
- cleaned = []
1943
- i = 0
1944
- while i < len(history):
1945
- turn = history[i]
1946
- role = turn.get("role", "")
1947
- content = turn.get("content", [])
1948
-
1949
- # Drop assistant turns that contain tool_use if the next turn isn't tool_result
1950
- if role == "assistant" and isinstance(content, list):
1951
- has_tool_use = any(
1952
- (isinstance(b, dict) and b.get("type") == "tool_use")
1953
- for b in content
1954
- )
1955
- if has_tool_use:
1956
- next_turn = history[i + 1] if i + 1 < len(history) else None
1957
- next_content = (next_turn or {}).get("content", [])
1958
- has_result = isinstance(next_content, list) and any(
1959
- (isinstance(b, dict) and b.get("type") == "tool_result")
1960
- for b in next_content
1961
- )
1962
- if not has_result:
1963
- i += 1 # skip orphaned tool_use turn
1964
- continue
1965
-
1966
- # Drop consecutive same-role turns (keep the last one)
1967
- if cleaned and cleaned[-1].get("role") == role:
1968
- cleaned[-1] = turn
1969
- else:
1970
- cleaned.append(turn)
1971
- i += 1
1972
- return cleaned
1973
-
1974
-
1975
- # ── API-key path (structured tools, full agentic loop) ────────────────────────
1976
-
1977
- async def _handle_with_api(
1978
- message: str,
1979
- history: list,
1980
- cfg_loader,
1981
- store,
1982
- slack_client=None,
1983
- user_name: str = "",
1984
- user_id: str = "",
1985
- attachments: list | None = None,
1986
- channel: str = "",
1987
- is_admin: bool = False,
1988
- ) -> tuple[str, bool]:
1989
- import anthropic
1990
-
1991
- api_key = cfg_loader.sentinel.anthropic_api_key or os.environ.get("ANTHROPIC_API_KEY", "")
1992
- client = anthropic.Anthropic(api_key=api_key)
1993
-
1994
- paused = Path("SENTINEL_PAUSE").exists()
1995
- repos = list(cfg_loader.repos.keys())
1996
- ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
1997
- known_projects = [_read_project_name(d) for d in _find_project_dirs()]
1998
- log_sources = list(cfg_loader.log_sources.keys())
1999
- slack_mention = f"<@{user_id}>" if user_id else (user_name or "")
2000
- known_users = store.get_all_users() # {user_id: display_name}
2001
- users_hint = ", ".join(f"<@{uid}> = {name}" for uid, name in known_users.items())
2002
- system = (
2003
- _SYSTEM
2004
- + (f"\nYou are speaking with: {user_name} (Slack mention: {slack_mention})" if user_name else "")
2005
- + "\nAlways start your reply by addressing the user directly using their Slack mention, e.g. \"<@U123> here is what I found...\"."
2006
- + " Never use their plain name — always use the <@USER_ID> format so Slack highlights it."
2007
- + (f"\nKnown Slack users: {users_hint}" if users_hint else "")
2008
- + f"\n\nCurrent time: {ts}"
2009
- + f"\nSentinel status: {'⏸ PAUSED' if paused else '▶ RUNNING'}"
2010
- + f"\nManaged repos: {', '.join(repos) if repos else '(none configured)'}"
2011
- + (f"\nLog sources: {', '.join(log_sources)}" if log_sources else "")
2012
- + (f"\nKnown projects in workspace: {', '.join(known_projects)}" if known_projects else "")
2013
- + f"\nAdmin access for this user: {'YES — admin tools are available' if is_admin else 'NO — admin tools will be refused'}"
2014
- )
2015
-
2016
- # Build user content — include attachment blocks if any
2017
- attach_blocks = _attachments_to_api_blocks(attachments or [])
2018
- if attach_blocks:
2019
- user_content = attach_blocks + [{"type": "text", "text": message}]
2020
- else:
2021
- user_content = message
2022
-
2023
- # Work on a local copy — only commit to history on success to prevent
2024
- # cascading 400s if the API rejects a malformed/corrupted history.
2025
- messages = list(history) + [{"role": "user", "content": user_content}]
2026
-
2027
- while True:
2028
- response = client.messages.create(
2029
- model="claude-opus-4-6",
2030
- max_tokens=2048,
2031
- system=system,
2032
- tools=_TOOLS,
2033
- messages=messages,
2034
- )
2035
-
2036
- text_parts = []
2037
- tool_blocks = []
2038
- for block in response.content:
2039
- if block.type == "text":
2040
- text_parts.append(block.text)
2041
- elif block.type == "tool_use":
2042
- tool_blocks.append(block)
2043
-
2044
- if not tool_blocks:
2045
- reply = " ".join(text_parts).strip()
2046
- is_done = "[DONE]" in reply
2047
- reply = reply.replace("[DONE]", "").strip()
2048
- if not reply:
2049
- greeting = f"Hi {user_name}! " if user_name else "Hi! "
2050
- reply = f"{greeting}I'm Sentinel, your autonomous DevOps agent. How can I help you?"
2051
- # Heuristic override: if reply ends with a question, Claude is waiting for input
2052
- if is_done and re.search(r'\?\s*$', reply):
2053
- is_done = False
2054
- # Commit to history only on success — serialize SDK objects to plain dicts
2055
- history.append({"role": "user", "content": user_content})
2056
- history.append({"role": "assistant", "content": _serialize_content(response.content)})
2057
- return reply, is_done
2058
-
2059
- messages.append({"role": "assistant", "content": _serialize_content(response.content)})
2060
- tool_results = []
2061
- for tc in tool_blocks:
2062
- result = await _run_tool(tc.name, tc.input, cfg_loader, store, slack_client=slack_client, user_id=user_id, channel=channel, is_admin=is_admin)
2063
- logger.info("Boss tool: %s(%s) → %s", tc.name, tc.input, result[:120])
2064
- tool_results.append({
2065
- "type": "tool_result",
2066
- "tool_use_id": tc.id,
2067
- "content": result,
2068
- })
2069
- messages.append({"role": "user", "content": tool_results})
2070
-
2071
-
2072
- # ── Main entry point ──────────────────────────────────────────────────────────
2073
-
2074
- async def handle_message(
2075
- message: str,
2076
- history: list,
2077
- cfg_loader,
2078
- store,
2079
- slack_client=None,
2080
- user_name: str = "",
2081
- user_id: str = "",
2082
- attachments: list | None = None,
2083
- channel: str = "",
2084
- is_admin: bool = False,
2085
- ) -> tuple[str, bool]:
2086
- """
2087
- Process one user message through the Sentinel Boss (Claude with tool use).
2088
-
2089
- Priority:
2090
- 1. Claude Pro / OAuth via `claude --print` (CLI path — no API key needed)
2091
- 2. ANTHROPIC_API_KEY fallback (structured tools, full agentic loop)
2092
-
2093
- Returns:
2094
- (reply_text, is_done)
2095
- is_done=True → session complete, release the Slack queue slot.
2096
- is_done=False → waiting for user follow-up, keep the slot.
2097
- """
2098
- api_key = cfg_loader.sentinel.anthropic_api_key or os.environ.get("ANTHROPIC_API_KEY", "")
2099
-
2100
- # 1st priority: ANTHROPIC_API_KEY → full structured tools, cheap per-token for Boss queries
2101
- if api_key:
2102
- try:
2103
- import anthropic # noqa: F401
2104
- return await _handle_with_api(
2105
- message, history, cfg_loader, store, slack_client=slack_client,
2106
- user_name=user_name, user_id=user_id, attachments=attachments, channel=channel,
2107
- is_admin=is_admin,
2108
- )
2109
- except Exception as api_err:
2110
- err_str = str(api_err)
2111
- # Detect rate-limit / auth failure and alert Slack before falling through
2112
- cfg = cfg_loader.sentinel
2113
- if is_rate_limited(err_str):
2114
- from .notify import rate_limit_message
2115
- alert_if_rate_limited(cfg.slack_bot_token, cfg.slack_channel,
2116
- "sentinel_boss/api", err_str)
2117
- logger.warning("Boss: API key path failed (%s), trying CLI fallback", err_str)
2118
-
2119
- # 2nd priority: Claude Pro / OAuth via CLI (limited tools but no API key needed)
2120
- cli_reply, cli_done = await _handle_with_cli(
2121
- message, history, cfg_loader, store, slack_client=slack_client, user_name=user_name,
2122
- user_id=user_id, attachments=attachments, is_admin=is_admin,
2123
- )
2124
- if not cli_reply.startswith(":warning:"):
2125
- return cli_reply, cli_done
2126
-
2127
- # Both paths failed — alert Slack and return error
2128
- cfg = cfg_loader.sentinel
2129
- err_output = cli_reply
2130
- alert_if_rate_limited(cfg.slack_bot_token, cfg.slack_channel,
2131
- "sentinel_boss/cli", err_output)
2132
- if not api_key:
2133
- # No auth at all configured
2134
- no_auth_msg = (
2135
- ":warning: *Sentinel Boss — no Claude auth configured*\n"
2136
- "Configure at least one of:\n"
2137
- "• `ANTHROPIC_API_KEY` in `sentinel.properties` → full features\n"
2138
- "• Claude Pro OAuth: run `claude login` on the server — required for fix_engine\n"
2139
- "See: https://github.com/misterhuydo/Sentinel#authentication"
2140
- )
2141
- slack_alert(cfg.slack_bot_token, cfg.slack_channel, no_auth_msg)
2142
- return ":warning: No Claude authentication configured. See Slack for details.", True
2143
- return cli_reply, cli_done
1
+ """
2
+ sentinel_boss.py — Claude-backed Sentinel Boss.
3
+
4
+ Claude acts as the boss: reads project state, decides on actions,
5
+ executes them via tool use, and responds naturally. One agentic loop
6
+ per turn — Claude may call multiple tools before replying.
7
+ """
8
+
9
+ import json
10
+ import logging
11
+ import os
12
+ import re
13
+ import subprocess
14
+ import uuid
15
+ from datetime import datetime, timezone
16
+ from pathlib import Path
17
+ from typing import Optional
18
+
19
+ from .notify import alert_if_rate_limited, slack_alert, is_rate_limited
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+ # ── System prompt ────────────────────────────────────────────────────────────
24
+
25
+ _SYSTEM = """\
26
+ You are Sentinel Boss — the AI interface for Sentinel, a 24/7 autonomous DevOps agent.
27
+
28
+ Sentinel watches production logs, detects errors, generates code fixes via Claude Code,
29
+ and opens GitHub PRs for admin review (or pushes directly if AUTO_PUBLISH=true).
30
+
31
+ Your job:
32
+ - Understand what the DevOps engineer needs in natural language
33
+ - Query Sentinel's live state (errors, fixes, open PRs) on their behalf
34
+ - Deliver tasks/issues to the right project — you know all projects in this workspace
35
+ - Control Sentinel (pause/resume) when asked
36
+ - Give honest, concise answers — you know this system inside out
37
+ - If a project name is unclear or ambiguous, ask the engineer to clarify — never guess
38
+
39
+ What you can do (tools available):
40
+
41
+ 1. get_status — Show recent errors detected, fixes applied/pending, open PRs.
42
+ e.g. "what happened today?", "any issues?", "show open PRs"
43
+
44
+ 2. create_issue — Deliver a fix/task to any project in this workspace by short name.
45
+ You know all project names — use list_projects if you're unsure.
46
+ If the project name is ambiguous or not found, ask to clarify.
47
+ e.g. "tell 1881 to fix X", "look into Y in elprint", "investigate Z"
48
+
49
+ 3. pause_sentinel — Create SENTINEL_PAUSE file to halt all auto-fix activity.
50
+ e.g. "pause sentinel", "stop auto-fixing"
51
+
52
+ 4. resume_sentinel — Remove SENTINEL_PAUSE file to resume normal operation.
53
+ e.g. "resume sentinel", "unpause"
54
+
55
+ 5. list_projects — List all configured repos and log sources in this Sentinel instance.
56
+ e.g. "what projects are you watching?", "list all repos"
57
+
58
+ 6. search_logs — SSH live to servers and grep logs in real time (uses fetch_log.sh with
59
+ the query as GREP_FILTER). Falls back to cached files if unavailable.
60
+ e.g. "search logs for illegal PIN in 1881", "find X in SSOLWA", "grep logs for Z"
61
+
62
+ 6b. filter_logs — Instant keyword/regex search on locally-synced logs. No SSH, sub-second.
63
+ Supports time range (since_hours) and case options.
64
+ e.g. "filter logs for TryDig", "find appid=X in STS logs", "errors last 6h"
65
+
66
+ 7. trigger_poll — Trigger an immediate poll cycle without waiting for the schedule.
67
+ e.g. "check now", "poll immediately", "don't wait, run now"
68
+
69
+ 8. get_repo_status — Show the current git branch, last commit, and recent fix branches
70
+ for a specific repository.
71
+ e.g. "status of repo X", "what branch is cairn on?"
72
+
73
+ 9. list_recent_commits — List the most recent commits in a repo (including Sentinel's auto-fixes).
74
+ e.g. "show me recent commits in elprint-sales", "what did sentinel commit?"
75
+
76
+ 10. get_fix_detail — Get full details of a specific fix: error, patch path, PR URL, status.
77
+ e.g. "show fix abc123", "details on that fix"
78
+
79
+ 11. list_errors — List recent errors from the state store, optionally filtered by repo or source.
80
+ e.g. "show all errors today", "what errors hit elprint this week?"
81
+
82
+ 12. pull_repo — Run git pull on one or all managed application repos.
83
+ e.g. "pull changes", "git pull all repos", "update the code"
84
+
85
+ 13. pull_config — Run git pull on one or all Sentinel project config dirs.
86
+ e.g. "pull config for 1881", "update sentinel config", "pull all configs"
87
+
88
+ 14. fetch_logs — Run fetch_log.sh on demand to pull fresh logs from remote servers right now.
89
+ Supports --debug mode and parameter overrides (tail count, grep filter).
90
+ e.g. "fetch logs", "try fetch_log.sh for SSOLWA", "fetch logs with debug",
91
+ "grab latest logs from STS", "fetch logs without filter"
92
+
93
+ 15. watch_bot — Register a Slack bot for passive monitoring. Every message it posts is
94
+ auto-queued as an issue in the bot's registered project.
95
+ ALWAYS requires a project — infer from context or ask the user first.
96
+ e.g. "listen to @alertbot", "watch @bot1 @bot2 for project 1881", "monitor @errorbot"
97
+
98
+ 16. unwatch_bot — Remove a Slack bot from the passive watch list.
99
+ e.g. "stop watching @alertbot", "unwatch @errorbot"
100
+
101
+ 17. list_watched_bots — Show all Slack bots currently being passively monitored and which projects
102
+ they are delivering to.
103
+ e.g. "which bots are you watching?", "list monitored bots"
104
+
105
+ 18. upgrade_sentinel — Pull the latest Sentinel agent code, update Python deps, and restart the
106
+ process. Safe to run at any time — no restart if already up to date.
107
+ e.g. "upgrade sentinel", "update sentinel", "upgrade yourself"
108
+
109
+ 19. ask_codebase — Ask any natural-language question about a managed repo's codebase.
110
+ Claude Code answers using its full knowledge of the code.
111
+ e.g. "what does the 1881 backend do?", "find PIN validation in elprint",
112
+ "any TODOs in cairn?", "are there security issues in elprint-sales?"
113
+
114
+ 20. restart_project — Stop and restart a specific Sentinel monitoring instance (stop.sh + start.sh).
115
+ This restarts the Sentinel agent for that project, NOT the application itself.
116
+ e.g. "restart sentinel for 1881", "restart the 1881 monitor", "reload elprint sentinel"
117
+
118
+ 21. tail_log — Fetch the last N lines of a log source live, without a grep filter.
119
+ e.g. "show recent SSOLWA logs", "tail STS", "last 200 lines from 1881 logs"
120
+
121
+ 22. post_file — Upload a text file to the Slack conversation (diff, log excerpt, report, CSV).
122
+ Use when output is too large for chat, or the user asks to download/export something.
123
+ e.g. "give me that as a file", "export the log", "send me the diff"
124
+
125
+ When someone asks what you can do, what you support, what your capabilities are, or how you can help,
126
+ reply with a short summary grouped by category:
127
+
128
+ *Monitoring & status*
129
+ • `get_status` — errors detected, fixes applied/pending/failed, open PRs — "what happened today?"
130
+ • `get_repo_status` — per-repo breakdown of errors and fixes — "how is elprint doing?"
131
+ • `list_recent_commits` — recent Sentinel auto-fix commits — "what did Sentinel commit?"
132
+
133
+ *Log management*
134
+ • `fetch_logs` — pull fresh logs from servers right now — "fetch logs for SSOLWA"
135
+ • `search_logs` — live SSH grep on production servers — "search logs for illegal PIN in 1881"
136
+ • `filter_logs` — instant grep on locally-synced logs (no SSH) — "filter logs for TryDig", "show errors from last 24h"
137
+ • `tail_log` — last N lines of a log source, no filter — "show recent SSOLWA logs"
138
+
139
+ *Codebase questions*
140
+ • `ask_codebase` — any question about a repo's code — "what does 1881 do?", "find PIN validation", "any TODOs?", "security issues?"
141
+
142
+ *Fix management*
143
+ • `get_fix_details` — full details of a specific fix — "show fix abc123"
144
+ • `list_pending_prs` — all open Sentinel PRs awaiting review — "list open PRs"
145
+ • `check_auth_status` — Claude auth health, rate-limit circuit state, fix engine 24 h stats — "is Claude working?", "any rate limits?", "auth issues?"
146
+
147
+ *Project & task delivery*
148
+ • `list_projects` — all projects and repos Sentinel manages — "what projects do you manage?"
149
+ • `create_issue` — deliver a task to any project by name — "tell 1881 to fix X"
150
+ • `trigger_poll` — run a log-fetch + fix cycle right now — "check now"
151
+ • `pause_sentinel` / `resume_sentinel` — halt or resume all auto-fix activity — "pause Sentinel"
152
+
153
+ *Repo & config sync*
154
+ • `pull_repo` — git pull on managed application repos — "pull latest code"
155
+ • `pull_config` — git pull on Sentinel config dirs — "pull config for elprint"
156
+
157
+ *File sharing*
158
+ • `post_file` — upload a file to Slack — "give me that as a file", "export the log", "send me the diff"
159
+
160
+ *Personal*
161
+ • `my_stats` — your activity: issues submitted, fixes, conversation history — "my stats"
162
+ • `clear_my_history` — wipe your conversation history and start fresh — "clear my history"
163
+
164
+ *Slack bot watching*
165
+ • `list_watched_bots` — show all bots currently being monitored — "which bots are you watching?"
166
+
167
+ *Admin* (SLACK_ADMIN_USERS if configured, otherwise all allowed users)
168
+ • `watch_bot` — register a Slack bot for passive monitoring; its messages become issues — "listen to @alertbot"
169
+ • `unwatch_bot` — stop monitoring a bot — "stop watching @errorbot"
170
+ • `restart_project` — stop + restart a Sentinel monitoring instance (not the app) — "restart sentinel for 1881"
171
+ • `upgrade_sentinel` — pull latest Sentinel release and restart — "upgrade sentinel"
172
+ • `list_all_users` — all Slack users who have talked to Sentinel + activity summary
173
+ • `clear_user_history` — wipe a specific user's conversation history
174
+ • `reset_fingerprint` — clear the 24h fix lock so Sentinel retries an error
175
+ • `list_all_errors` — full unfiltered error database
176
+ • `export_db` — dump full Sentinel state as a downloadable file
177
+
178
+ About Sentinel — answer any question someone asks:
179
+
180
+ Sentinel is a 24/7 autonomous DevOps agent deployed per-project. Here is everything you know:
181
+
182
+ Architecture:
183
+ - Poll loop every POLL_INTERVAL_SECONDS (default 120s)
184
+ - Log sources: SSH servers (rsync + live grep) or Cloudflare worker endpoints
185
+ - Local sync: rsync --append-verify copies remote logs to workspace/synced/ every SYNC_INTERVAL_SECONDS (default 300s); full history accumulated locally
186
+ - Error detection: regex-based parsing, multi-line stack trace grouping, fingerprinting (hash of normalised message + top 3 stack frames)
187
+ - Dedup: SQLite state_store.db — 24h cooldown per fingerprint, plus git log check before each fix
188
+ - Routing: TARGET_REPO=auto uses PACKAGE_PREFIXES to map stack trace frames to the correct repo; explicit TARGET_REPO overrides
189
+ - Fix engine: Claude Code headless (claude --print) with structured prompt (error + stack trace + Cairn MCP context); unified diff output; max 5 files / 200 lines
190
+ - Commit: git pull --rebase, apply patch, run tests, commit with sentinel/fix-<fp> marker
191
+ - Publish: AUTO_PUBLISH=true push to main + CI/CD trigger; AUTO_PUBLISH=false branch + GitHub PR
192
+ - Fix confirmation: SENTINEL marker injected into every modified method; when marker appears in production logs quiet period starts; after MARKER_CONFIRM_HOURS with no recurrence → fix confirmed
193
+
194
+ Health monitoring (HEALTH_URL per repo):
195
+ - Polls the URL on each cycle; expects JSON with "Status": "true"
196
+ - 502/503/504 or connection refused → status=stopped
197
+ - 200 + Status != true → status=failing
198
+ - stopped + startup failure in synced logs → auto-fix attempt (Spring Boot BeanCreationException, NoSuchMethodError, APPLICATION FAILED TO START, etc.)
199
+ - stopped + no startup errors → asks human ONCE "is this deliberate?", then stays silent (state=pending)
200
+ - Human says "maintenance <repo>" → state=confirmed, fully silent until recovery
201
+ - Recovery (health=healthy again) → clears state, posts "App X is back online"
202
+
203
+ Duplicate / cross-source dedup:
204
+ - Fingerprint-based: same error from monitor bot + log scan → same fingerprint → state_store dedup
205
+ - git log check: before each fix attempt, checks recent commits for the fingerprint — skips if already fixed
206
+ - 24h cooldown per fingerprint prevents retry spam
207
+
208
+ Slack Boss:
209
+ - Socket Mode (xapp-... app-level token + xoxb-... bot token)
210
+ - Per-user sessions with SQLite-persisted history (last 40 messages)
211
+ - Tool-use loop with Anthropic API (cheap per-token, structured tools)
212
+ - Falls back to claude CLI if no API key configured
213
+ - Admin users (SLACK_ADMIN_USERS) can access destructive/sensitive tools
214
+
215
+ Common config questions:
216
+ - ANTHROPIC_API_KEY: used by Boss conversation (structured tool-use, cheap); optional for Fix Engine when CLAUDE_PRO_FOR_TASKS=true
217
+ - CLAUDE_PRO_FOR_TASKS=true (default): Fix Engine calls claude CLI using Claude Pro OAuth billing; falls back to API key on auth error
218
+ - AUTO_PUBLISH=false (default): Sentinel opens a PR for admin review; =true: pushes directly to main
219
+ - SYNC_RETENTION_DAYS (default 30): delete synced log files older than N days
220
+ - SYNC_MAX_FILE_MB (default 200): truncate synced log files exceeding this size (drops oldest half of lines)
221
+ - HEALTH_URL: HTTP endpoint per repo; JSON with "Status": "true" = healthy
222
+ - TARGET_REPO=auto: route errors to repo by longest-matching PACKAGE_PREFIXES; =<name>: always route to that repo
223
+ - SLACK_ALLOWED_USERS: if set, only these Slack user IDs can interact with Boss
224
+ - SLACK_ADMIN_USERS: subset of allowed users with access to admin-only tools (reset_fingerprint, export_db, watch_bot, etc.)
225
+
226
+ Required Slack Bot Token scopes: app_mentions:read, channels:history, groups:history, im:history, chat:write, files:read, reactions:write, users:read, conversations.connect:read
227
+ Required App-Level Token scope (Socket Mode): connections:write
228
+ Events to subscribe: app_mention, message.im, message.channels
229
+
230
+ Tone: direct, professional, like a senior engineer who owns the system.
231
+ Don't pad responses. Don't say "Great question!" or "Certainly!".
232
+ If you don't know something, use a tool to find out before saying you don't know.
233
+
234
+ When to act vs. when to ask:
235
+ - Clear command ("check status", "fetch logs", "pause sentinel") → call the tool immediately, reply with results.
236
+ - Ambiguous or exploratory ("what does get_repo_status do?", "tell me about search_logs") → explain the tool naturally, then ask: "Want me to run it?"
237
+ - Unclear intent (could be either) → use judgment: brief explanation + "Want me to run this now?"
238
+ - Prefer filter_logs over search_logs when synced logs are available — it's instant and never causes session timeout.
239
+ Use search_logs only when the user explicitly wants live/real-time data or synced logs are not yet available.
240
+ - If a tool call will take a moment (search, fetch, pull), prefix your reply with a brief "working" line ending in "..." before the results, e.g. "Searching SSOLWA for TryDig activity..." then the actual output.
241
+ Never just say a working line and stop — always follow it with the results in the same message.
242
+
243
+ Session context — critical rules:
244
+ - Loaded conversation history is prior-session background only. It may be hours or days old.
245
+ - NEVER say "the previous search", "I already fetched", "as I found earlier", or any phrase implying you already did part of the current task — unless a tool result appears in THIS response's tool calls.
246
+ - When handling a new request, call the tools fresh. Do not assume any prior tool result is still current or that any prior step "counts" toward the current task.
247
+ - The only exception: if the user explicitly asks about something from the history ("what did you find earlier?"), you may reference it — but note it is from a prior session.
248
+
249
+ Trust your tool results — never contradict them:
250
+ - If any search_logs call in this response returned total_matches > 0, you HAVE results. Report them.
251
+ - Never say "no results found" or "nothing was found" when a tool result shows total_matches > 0.
252
+ - If one source-specific call returns 0 but a broader call returned matches, use the broader results.
253
+ - A cached result with files_searched=0 is a source-name lookup failure, NOT an absence of log data.
254
+ Treat it as "source not recognised" and fall back to the broad search results you already have.
255
+
256
+ Avoid redundant tool calls (within a single response only — always run tools fresh for new requests):
257
+ - If a broad search (e.g. search_logs with no source filter) already returned results in THIS response, do NOT repeat the same search with a source filter to "refine" — use what you already fetched.
258
+ - If a tool call fails in THIS response, do NOT retry the entire search from scratch. Continue with what succeeded and note the failure.
259
+ - One pass per task: gather all needed data in a single round of tool calls, then produce the final answer.
260
+
261
+ Issue identification — before calling create_issue:
262
+ 1. Determine if the message is a REAL issue/task (bug report, feature request, investigation ask)
263
+ vs. a status question, tool query, or casual chat. If not an issue, just answer normally.
264
+ 2. If it IS an issue, gather what's needed before creating:
265
+ - Project: which project? If unclear, ask. Use list_projects if you need to check names.
266
+ - Context: what's the problem? Include everything: description, error text, steps to reproduce.
267
+ - Attachments: summarise any files/screenshots the user shared.
268
+ - Support URL: note any ticket/doc/link the user mentioned.
269
+ - Identity: always captured automatically from the Slack session.
270
+ 3. Populate `findings` with curated evidence — only when relevant and concise:
271
+ - If you ran search_logs, tail_log, ask_codebase, or get_status before creating the issue,
272
+ summarise only the findings directly related to this specific issue.
273
+ - Do NOT paste raw tool output. Summarise: which services, how often, key pattern, 1-3 example lines.
274
+ - If the search returned nothing relevant, or the issue is purely user-described with no log evidence, leave `findings` empty.
275
+ - The fix engine reads only the issue file. Give it signal, not noise — 500 words max.
276
+ 4. Before calling the tool, confirm with the user in natural language:
277
+ e.g. "I'll create an issue for project *1881* — here's what I have: [summary]. Look right?"
278
+ Wait for their confirmation before proceeding.
279
+ EXCEPTION: if the user's message already contains a clear project + unambiguous description,
280
+ skip the confirmation and create immediately — don't ask when nothing is unclear.
281
+ 5. After creating, tell them the issue was queued and Sentinel will pick it up on the next poll.
282
+
283
+ When the engineer's request is fully handled, end your LAST message with the token: [DONE]
284
+ IMPORTANT: Always write your actual reply text FIRST, then append [DONE] at the end. Example: "Hello! I'm Sentinel. [DONE]". Never output [DONE] as your only content.
285
+ For greetings like "hello" or empty messages, introduce yourself briefly and offer help, then end with [DONE].
286
+ If you need a follow-up from them, do NOT include [DONE] — wait for their next message.
287
+ """
288
+
289
+ # ── Tool definitions ─────────────────────────────────────────────────────────
290
+
291
+ _TOOLS = [
292
+ {
293
+ "name": "get_status",
294
+ "description": (
295
+ "Get recent errors, fixes applied, fixes pending review, and open PRs. "
296
+ "Use for: 'what happened today?', 'any issues?', 'how are things?', "
297
+ "'what are the open PRs?', 'did sentinel fix anything?'"
298
+ ),
299
+ "input_schema": {
300
+ "type": "object",
301
+ "properties": {
302
+ "hours": {
303
+ "type": "integer",
304
+ "description": "Look-back window in hours (default 24)",
305
+ "default": 24,
306
+ },
307
+ },
308
+ },
309
+ },
310
+ {
311
+ "name": "create_issue",
312
+ "description": (
313
+ "Deliver a confirmed issue/task to a Sentinel project instance. "
314
+ "Only call this after you have: (1) confirmed the message is a real issue or task, "
315
+ "(2) identified the target project, (3) gathered enough context, and "
316
+ "(4) confirmed with the user ('I'll create this issue for project X — does that look right?'). "
317
+ "Do NOT call this for status questions, tool queries, or casual chat."
318
+ ),
319
+ "input_schema": {
320
+ "type": "object",
321
+ "properties": {
322
+ "description": {
323
+ "type": "string",
324
+ "description": "Full problem/task description — include all context the user gave you",
325
+ },
326
+ "project": {
327
+ "type": "string",
328
+ "description": "Project short name (e.g. '1881', 'elprint'). Ask if unclear.",
329
+ },
330
+ "target_repo": {
331
+ "type": "string",
332
+ "description": "Specific repo within the project (omit to let Sentinel auto-route)",
333
+ },
334
+ "support_url": {
335
+ "type": "string",
336
+ "description": "Any URL the user shared (ticket, doc, screenshot link, etc.)",
337
+ },
338
+ "attachments_summary": {
339
+ "type": "string",
340
+ "description": "Summary of any files/screenshots the user attached",
341
+ },
342
+ "findings": {
343
+ "type": "string",
344
+ "description": (
345
+ "A concise, curated summary of evidence directly relevant to this issue — "
346
+ "NOT raw tool output. Include only what the fix engine needs: "
347
+ "key error patterns, affected services, approximate frequency/timestamps, "
348
+ "and 1-3 representative log lines. Omit unrelated results. "
349
+ "Keep under 500 words. Leave empty if no tool results are relevant."
350
+ ),
351
+ },
352
+ },
353
+ "required": ["description"],
354
+ },
355
+ },
356
+ {
357
+ "name": "get_fix_details",
358
+ "description": "Get full details of a specific fix by fingerprint (8+ hex chars).",
359
+ "input_schema": {
360
+ "type": "object",
361
+ "properties": {
362
+ "fingerprint": {"type": "string"},
363
+ },
364
+ "required": ["fingerprint"],
365
+ },
366
+ },
367
+ {
368
+ "name": "list_pending_prs",
369
+ "description": "List all open Sentinel PRs awaiting admin review.",
370
+ "input_schema": {"type": "object", "properties": {}},
371
+ },
372
+ {
373
+ "name": "check_auth_status",
374
+ "description": (
375
+ "Check Claude authentication health, current rate-limit / usage-limit circuit state, "
376
+ "and fix engine stats for the last 24 h. "
377
+ "Use when someone asks: 'is Claude working?', 'any rate limits?', 'why aren't fixes running?', "
378
+ "'is the API key OK?', 'auth issues?', 'fix engine status'."
379
+ ),
380
+ "input_schema": {"type": "object", "properties": {}},
381
+ },
382
+ {
383
+ "name": "pause_sentinel",
384
+ "description": (
385
+ "Pause ALL Sentinel fix activity immediately. "
386
+ "Use when the engineer says 'pause', 'stop', 'freeze', or 'hold off'."
387
+ ),
388
+ "input_schema": {"type": "object", "properties": {}},
389
+ },
390
+ {
391
+ "name": "resume_sentinel",
392
+ "description": "Resume Sentinel fix activity after a pause.",
393
+ "input_schema": {"type": "object", "properties": {}},
394
+ },
395
+ {
396
+ "name": "list_projects",
397
+ "description": (
398
+ "List all projects (Sentinel instances) in this workspace and the repos "
399
+ "each one manages. Use for: 'what projects do you manage?', 'list projects', "
400
+ "'what repos are configured?', 'show me all projects'."
401
+ ),
402
+ "input_schema": {"type": "object", "properties": {}},
403
+ },
404
+ {
405
+ "name": "search_logs",
406
+ "description": (
407
+ "Search production logs for a keyword or pattern. "
408
+ "When a project or source is specified (or can be inferred), performs a LIVE fetch "
409
+ "via fetch_log.sh with the query as the grep filter — SSHes directly to the server. "
410
+ "Falls back to searching locally-cached log files when no source can be determined. "
411
+ "Use for: 'search logs for illegal PIN in 1881', 'find X in SSOLWA logs', "
412
+ "'what did user Y do?', 'show entries for appid=Z', 'grep logs for X'."
413
+ ),
414
+ "input_schema": {
415
+ "type": "object",
416
+ "properties": {
417
+ "query": {
418
+ "type": "string",
419
+ "description": "Keyword or regex to grep for",
420
+ },
421
+ "source": {
422
+ "type": "string",
423
+ "description": "Log source name to search (partial match against log-config filenames, e.g. 'SSOLWA', '1881'). Leave empty to search all sources.",
424
+ },
425
+ "max_matches": {
426
+ "type": "integer",
427
+ "description": "Max matching lines to return per source (default 30)",
428
+ "default": 30,
429
+ },
430
+ "tail": {
431
+ "type": "integer",
432
+ "description": (
433
+ "Number of log lines to fetch from the server before grepping (default: config value, typically 500). "
434
+ "Increase when the user asks for a longer time window — e.g. 'yesterday up to now' → use 5000-10000. "
435
+ "Higher values take longer but cover more history."
436
+ ),
437
+ },
438
+ },
439
+ "required": ["query"],
440
+ },
441
+ },
442
+ {
443
+ "name": "filter_logs",
444
+ "description": (
445
+ "Search locally-synced log files by keyword or regex — instant, no SSH required. "
446
+ "Use this for fast queries once logs are synced (check with list_projects if unsure). "
447
+ "Supports time-range filtering and case options. "
448
+ "Use for: 'find TryDig in synced logs', 'show errors from last 24h', "
449
+ "'filter logs for appid=X', 'search local logs for Y'."
450
+ ),
451
+ "input_schema": {
452
+ "type": "object",
453
+ "properties": {
454
+ "query": {
455
+ "type": "string",
456
+ "description": "Keyword or regex to search for",
457
+ },
458
+ "source": {
459
+ "type": "string",
460
+ "description": "Log source name (partial match, e.g. 'STS', 'SSOLWA'). Leave empty to search all synced sources.",
461
+ },
462
+ "since_hours": {
463
+ "type": "integer",
464
+ "description": "Only return lines from the last N hours (uses log line timestamps). Omit for all available history.",
465
+ },
466
+ "max_matches": {
467
+ "type": "integer",
468
+ "description": "Max matching lines to return per source file (default 50)",
469
+ "default": 50,
470
+ },
471
+ "case_sensitive": {
472
+ "type": "boolean",
473
+ "description": "Case-sensitive match (default false)",
474
+ "default": False,
475
+ },
476
+ },
477
+ "required": ["query"],
478
+ },
479
+ },
480
+ {
481
+ "name": "trigger_poll",
482
+ "description": (
483
+ "Trigger an immediate log-fetch and error-detection cycle without waiting "
484
+ "for the next scheduled interval. Use when: 'check now', 'run now', "
485
+ "'poll immediately', 'don't wait'."
486
+ ),
487
+ "input_schema": {"type": "object", "properties": {}},
488
+ },
489
+ {
490
+ "name": "get_repo_status",
491
+ "description": (
492
+ "Per-repository breakdown of errors detected and fixes applied. "
493
+ "Use for: 'how is repo X doing?', 'which repo has the most issues?', "
494
+ "'break down by repo'."
495
+ ),
496
+ "input_schema": {
497
+ "type": "object",
498
+ "properties": {
499
+ "hours": {
500
+ "type": "integer",
501
+ "description": "Look-back window in hours (default 24)",
502
+ "default": 24,
503
+ },
504
+ },
505
+ },
506
+ },
507
+ {
508
+ "name": "list_recent_commits",
509
+ "description": (
510
+ "List recent commits made by Sentinel across all managed repos. "
511
+ "Use for: 'what did Sentinel commit?', 'show recent auto-fixes', 'what was changed?'."
512
+ ),
513
+ "input_schema": {
514
+ "type": "object",
515
+ "properties": {
516
+ "limit": {
517
+ "type": "integer",
518
+ "description": "Max commits per repo (default 5)",
519
+ "default": 5,
520
+ },
521
+ },
522
+ },
523
+ },
524
+ {
525
+ "name": "pull_repo",
526
+ "description": (
527
+ "Run git pull on one or all managed repos to fetch latest changes from GitHub. "
528
+ "Use for: 'pull changes', 'git pull', 'update repo X', 'fetch latest code'."
529
+ ),
530
+ "input_schema": {
531
+ "type": "object",
532
+ "properties": {
533
+ "repo": {
534
+ "type": "string",
535
+ "description": "Repo name to pull (omit to pull all configured repos)",
536
+ },
537
+ },
538
+ },
539
+ },
540
+ {
541
+ "name": "pull_config",
542
+ "description": (
543
+ "Run git pull on one or all Sentinel project config directories. "
544
+ "Projects are matched by short name ('1881', 'elprint') or full dir name ('sentinel-1881'). "
545
+ "Use for: 'pull config for 1881', 'update sentinel config', 'pull all configs'."
546
+ ),
547
+ "input_schema": {
548
+ "type": "object",
549
+ "properties": {
550
+ "project": {
551
+ "type": "string",
552
+ "description": "Project short name or dir name to pull (omit for all projects)",
553
+ },
554
+ },
555
+ },
556
+ },
557
+ {
558
+ "name": "fetch_logs",
559
+ "description": (
560
+ "Run fetch_log.sh for one or all configured log sources to pull the latest logs "
561
+ "from remote servers right now. Use for: 'fetch logs', 'run fetch_log.sh', "
562
+ "'grab latest logs from SSOLWA', 'try fetch_log.sh for STS', "
563
+ "'pull logs from server', 'get fresh logs'."
564
+ ),
565
+ "input_schema": {
566
+ "type": "object",
567
+ "properties": {
568
+ "source": {
569
+ "type": "string",
570
+ "description": "Log source name to fetch (partial match, e.g. 'SSOLWA'). Omit to fetch all.",
571
+ },
572
+ "debug": {
573
+ "type": "boolean",
574
+ "description": "Run fetch_log.sh with --debug flag to show SSH/grep details",
575
+ "default": False,
576
+ },
577
+ "tail": {
578
+ "type": "integer",
579
+ "description": "Override TAIL lines (how many log lines to fetch)",
580
+ },
581
+ "grep_filter": {
582
+ "type": "string",
583
+ "description": "Override GREP_FILTER (regex). Pass 'none' to disable filtering.",
584
+ },
585
+ },
586
+ },
587
+ },
588
+ {
589
+ "name": "watch_bot",
590
+ "description": (
591
+ "Tell Sentinel to passively monitor a Slack bot — queuing its messages as issues. "
592
+ "Extract all <@UXXXXXX> user IDs from the message and pass them here. "
593
+ "Sentinel verifies each is actually a bot (not a human) before adding to the watch list. "
594
+ "IMPORTANT: a bot watcher is only useful if its issues can be delivered to a project. "
595
+ "Try to infer the project from context (bot name, prior messages, available projects). "
596
+ "If it cannot be determined, do NOT call this tool — instead ask the user which project "
597
+ "the bot's alerts belong to, then call this tool with the project filled in. "
598
+ "Use for: 'listen to @alertbot', 'watch @bot1 @bot2', 'monitor @errorbot'."
599
+ ),
600
+ "input_schema": {
601
+ "type": "object",
602
+ "properties": {
603
+ "user_ids": {
604
+ "type": "array",
605
+ "items": {"type": "string"},
606
+ "description": "Slack user IDs to watch extract from <@UXXXXXX> patterns in the message",
607
+ },
608
+ "project": {
609
+ "type": "string",
610
+ "description": "Project short name this bot's issues should be routed to (e.g. '1881', 'elprint'). Infer from context or ask user before calling.",
611
+ },
612
+ },
613
+ "required": ["user_ids"],
614
+ },
615
+ },
616
+ {
617
+ "name": "unwatch_bot",
618
+ "description": (
619
+ "Stop Sentinel from monitoring a Slack bot. "
620
+ "Use for: 'stop watching @alertbot', 'unwatch @bot', 'remove @errorbot from watchers'."
621
+ ),
622
+ "input_schema": {
623
+ "type": "object",
624
+ "properties": {
625
+ "user_ids": {
626
+ "type": "array",
627
+ "items": {"type": "string"},
628
+ "description": "Slack user IDs to remove from the watch list",
629
+ },
630
+ },
631
+ "required": ["user_ids"],
632
+ },
633
+ },
634
+ {
635
+ "name": "list_watched_bots",
636
+ "description": (
637
+ "List all Slack bots Sentinel is currently monitoring passively. "
638
+ "Use for: 'who are you watching?', 'which bots are you monitoring?', 'list watched bots'."
639
+ ),
640
+ "input_schema": {"type": "object", "properties": {}},
641
+ },
642
+ {
643
+ "name": "upgrade_sentinel",
644
+ "description": (
645
+ "Upgrade the Sentinel agent itself: git pull the latest code, update Python deps, "
646
+ "then restart the process. Safe to call at any time — if already up to date, "
647
+ "no restart is triggered. "
648
+ "Use for: 'upgrade sentinel', 'update sentinel', 'upgrade yourself', "
649
+ "'pull latest sentinel code', 'restart sentinel after upgrade'."
650
+ ),
651
+ "input_schema": {"type": "object", "properties": {}},
652
+ },
653
+ {
654
+ "name": "ask_codebase",
655
+ "description": (
656
+ "Ask any natural-language question about a managed codebase. "
657
+ "Accepts a repo name (e.g. 'STS', 'elprint-sales') OR a project name (e.g. '1881', 'elprint') "
658
+ " if a project name is given and it has multiple repos, all are queried. "
659
+ "Claude Code answers using its full codebase knowledge no need to specify how. "
660
+ "Use for: 'what does 1881 do?', 'TODOs in 1881', 'find PIN validation in STS', "
661
+ "'security issues in elprint-sales?', 'summarize the cairn repo'."
662
+ ),
663
+ "input_schema": {
664
+ "type": "object",
665
+ "properties": {
666
+ "repo": {
667
+ "type": "string",
668
+ "description": "Repo name (e.g. 'STS', 'elprint-sales') OR project name (e.g. '1881', 'elprint') — project name queries all its repos",
669
+ },
670
+ "question": {
671
+ "type": "string",
672
+ "description": "Natural language question about the codebase",
673
+ },
674
+ },
675
+ "required": ["repo", "question"],
676
+ },
677
+ },
678
+ {
679
+ "name": "restart_project",
680
+ "description": (
681
+ "Stop and restart a specific Sentinel monitoring instance (runs stop.sh then start.sh). "
682
+ "This restarts the Sentinel agent process for that project — it does NOT restart the application itself. "
683
+ "Use when: 'restart sentinel for 1881', 'reload the 1881 monitor', 'restart elprint sentinel'. "
684
+ "Safer than restarting all projects at once."
685
+ ),
686
+ "input_schema": {
687
+ "type": "object",
688
+ "properties": {
689
+ "project": {
690
+ "type": "string",
691
+ "description": "Project short name or dir name (e.g. '1881', 'elprint')",
692
+ },
693
+ },
694
+ "required": ["project"],
695
+ },
696
+ },
697
+ {
698
+ "name": "my_stats",
699
+ "description": (
700
+ "Show the current user's personal Sentinel dashboard: "
701
+ "conversation history length, issues they submitted, and "
702
+ "a summary of Sentinel fix activity (errors caught, fixes applied, "
703
+ "fixes pending PR review, fixes confirmed live, fixes failed). "
704
+ "Use for: 'what have you done for me?', 'show my stats', "
705
+ "'how many issues have been fixed?', 'my history', 'summary', "
706
+ "'what did sentinel fix this week?', 'pending fixes', 'open PRs'."
707
+ ),
708
+ "input_schema": {
709
+ "type": "object",
710
+ "properties": {
711
+ "hours": {
712
+ "type": "integer",
713
+ "description": "Look-back window in hours (default 168 = 7 days)",
714
+ "default": 168,
715
+ },
716
+ },
717
+ },
718
+ },
719
+ {
720
+ "name": "clear_my_history",
721
+ "description": (
722
+ "Clear the current user's conversation history with Sentinel. "
723
+ "After clearing, future sessions start with no memory of past conversations. "
724
+ "Use for: 'clear my history', 'forget our conversation', "
725
+ "'start fresh', 'reset my context', 'wipe my history'."
726
+ ),
727
+ "input_schema": {"type": "object", "properties": {}},
728
+ },
729
+ {
730
+ "name": "tail_log",
731
+ "description": (
732
+ "Fetch the last N lines of a log source's live production logs without any grep filter. "
733
+ "Use when: 'show me recent SSOLWA logs', 'tail STS', 'what's happening in 1881 logs right now', "
734
+ "'show last 100 lines from SSOLWA'. Different from search_logs — no pattern required."
735
+ ),
736
+ "input_schema": {
737
+ "type": "object",
738
+ "properties": {
739
+ "source": {
740
+ "type": "string",
741
+ "description": "Log source name (partial match against log-config filenames, e.g. 'SSOLWA', 'STS')",
742
+ },
743
+ "lines": {
744
+ "type": "integer",
745
+ "description": "Number of recent lines to fetch (default 100)",
746
+ "default": 100,
747
+ },
748
+ },
749
+ "required": ["source"],
750
+ },
751
+ },
752
+ {
753
+ "name": "post_file",
754
+ "description": (
755
+ "Upload a text file directly to the Slack conversation so the user can read or download it. "
756
+ "Use when: output is too large for a chat message, the user asks to 'download', 'export', or "
757
+ "'send as a file', or when formatted content (diffs, logs, CSVs, reports) is clearer as a file. "
758
+ "e.g. 'give me that as a file', 'export the log', 'send me the diff for PR #41', "
759
+ "'download the health report', 'export recent errors as CSV'"
760
+ ),
761
+ "input_schema": {
762
+ "type": "object",
763
+ "properties": {
764
+ "content": {
765
+ "type": "string",
766
+ "description": "The full text content of the file to upload",
767
+ },
768
+ "filename": {
769
+ "type": "string",
770
+ "description": "Filename with extension, e.g. 'fix-ab12.diff', 'sentinel-report.txt', 'errors.csv', 'ssolwa.log'",
771
+ },
772
+ "title": {
773
+ "type": "string",
774
+ "description": "Optional display title shown above the file in Slack (defaults to filename)",
775
+ },
776
+ },
777
+ "required": ["content", "filename"],
778
+ },
779
+ },
780
+ {
781
+ "name": "list_all_users",
782
+ "description": (
783
+ "ADMIN ONLY. List all Slack users who have ever talked to Sentinel, "
784
+ "with their issue count and conversation message count. "
785
+ "e.g. 'list all users', 'who has talked to you?', 'show user activity'"
786
+ ),
787
+ "input_schema": {"type": "object", "properties": {}},
788
+ },
789
+ {
790
+ "name": "clear_user_history",
791
+ "description": (
792
+ "ADMIN ONLY. Clear the conversation history for a specific Slack user. "
793
+ "e.g. 'clear history for huy', 'reset bob's conversation'"
794
+ ),
795
+ "input_schema": {
796
+ "type": "object",
797
+ "properties": {
798
+ "user_id": {
799
+ "type": "string",
800
+ "description": "Slack user ID to clear (e.g. U01AB2CD3EF)",
801
+ },
802
+ },
803
+ "required": ["user_id"],
804
+ },
805
+ },
806
+ {
807
+ "name": "reset_fingerprint",
808
+ "description": (
809
+ "ADMIN ONLY. Remove the 24h fix lock for an error fingerprint so Sentinel will retry it "
810
+ "on the next poll cycle. Use when a fix attempt failed and you want to force a retry. "
811
+ "e.g. 'retry fix abc123', 'reset fingerprint abc123de', 'let Sentinel try that error again'"
812
+ ),
813
+ "input_schema": {
814
+ "type": "object",
815
+ "properties": {
816
+ "fingerprint": {
817
+ "type": "string",
818
+ "description": "Error fingerprint hash (8+ hex chars, from get_fix_details or list_all_errors)",
819
+ },
820
+ },
821
+ "required": ["fingerprint"],
822
+ },
823
+ },
824
+ {
825
+ "name": "list_all_errors",
826
+ "description": (
827
+ "ADMIN ONLY. Return the full unfiltered error database — all fingerprints, counts, "
828
+ "sources, and last-seen times. "
829
+ "e.g. 'show all errors', 'full error list', 'dump the error DB'"
830
+ ),
831
+ "input_schema": {
832
+ "type": "object",
833
+ "properties": {
834
+ "hours": {
835
+ "type": "integer",
836
+ "description": "Limit to errors seen in the last N hours (0 = all time)",
837
+ "default": 0,
838
+ },
839
+ },
840
+ },
841
+ },
842
+ {
843
+ "name": "export_db",
844
+ "description": (
845
+ "ADMIN ONLY. Export the full Sentinel state (errors, fixes, PRs, users) as a "
846
+ "downloadable text file posted to Slack. "
847
+ "e.g. 'export the DB', 'download state', 'give me a full report file'"
848
+ ),
849
+ "input_schema": {"type": "object", "properties": {}},
850
+ },
851
+ {
852
+ "name": "set_maintenance",
853
+ "description": (
854
+ "Confirm that a repo/app is deliberately stopped for maintenance. "
855
+ "Sentinel will silently monitor the health URL and notify when it comes back online. "
856
+ "Use when Sentinel asked if a 502/503 is deliberate. "
857
+ "e.g. 'yes it's maintenance', 'maintenance ssolwa', 'confirm ssolwa is down for maintenance'"
858
+ ),
859
+ "input_schema": {
860
+ "type": "object",
861
+ "properties": {
862
+ "repo_name": {
863
+ "type": "string",
864
+ "description": "Repo name as configured (from repo-configs/*.properties)",
865
+ },
866
+ "note": {
867
+ "type": "string",
868
+ "description": "Optional reason e.g. 'scheduled maintenance', 'dependency update'",
869
+ },
870
+ },
871
+ "required": ["repo_name"],
872
+ },
873
+ },
874
+ ]
875
+
876
+
877
+ # ── Workspace helpers ─────────────────────────────────────────────────────────
878
+
879
+ def _workspace_dir() -> Path:
880
+ return Path(".").resolve().parent
881
+
882
+ def _short_name(dir_name: str) -> str:
883
+ """'sentinel-1881' '1881', 'sentinel-elprint' 'elprint', others unchanged."""
884
+ if dir_name.startswith("sentinel-"):
885
+ return dir_name[len("sentinel-"):]
886
+ return dir_name
887
+
888
+ def _read_project_name(project_dir: Path) -> str:
889
+ """Return PROJECT_NAME from sentinel.properties if set, else fall back to _short_name(dir)."""
890
+ props = project_dir / "config" / "sentinel.properties"
891
+ if props.exists():
892
+ try:
893
+ for line in props.read_text(encoding="utf-8", errors="ignore").splitlines():
894
+ line = line.strip()
895
+ if line.startswith("PROJECT_NAME"):
896
+ _, _, val = line.partition("=")
897
+ val = val.partition("#")[0].strip()
898
+ if val:
899
+ return val
900
+ except Exception:
901
+ pass
902
+ return _short_name(project_dir.name)
903
+
904
def _find_project_dirs(target: str = "") -> list[Path]:
    """Return project dirs matching target (PROJECT_NAME, short name, or full dir name), or all if target empty."""
    needle = target.lower()
    found: list[Path] = []
    try:
        for candidate in sorted(_workspace_dir().iterdir()):
            # A project is any sibling directory with a config/ subdir;
            # 'code' and '.git' are never projects.
            if not candidate.is_dir() or candidate.name in ("code", ".git"):
                continue
            if not (candidate / "config").exists():
                continue
            if needle:
                aliases = (
                    candidate.name,
                    _short_name(candidate.name),
                    _read_project_name(candidate),
                )
                if not any(needle in alias.lower() for alias in aliases):
                    continue
            found.append(candidate)
    except Exception:
        # Workspace listing is best-effort; return whatever was collected.
        pass
    return found
924
+
925
+ def _git_pull(path: Path) -> dict:
926
+ try:
927
+ r = subprocess.run(
928
+ ["git", "pull", "--rebase", "origin"],
929
+ cwd=str(path), capture_output=True, text=True, timeout=60,
930
+ )
931
+ last = r.stdout.strip().splitlines()[-1] if r.stdout.strip() else "already up to date"
932
+ return {"status": "ok" if r.returncode == 0 else "error",
933
+ "detail": last if r.returncode == 0 else r.stderr.strip()}
934
+ except Exception as e:
935
+ return {"status": "error", "detail": str(e)}
936
+
937
+
938
+ # ── Log-source name resolver ──────────────────────────────────────────────────
939
+
940
+ def _filter_log_sources(props_files: list, source_hint: str) -> list:
941
+ """
942
+ Return the subset of props_files whose log source matches source_hint.
943
+
944
+ Matching is tried in order (first match wins per file):
945
+ 1. Substring of the filename stem (e.g. "sts" STS.properties)
946
+ 2. Substring of REMOTE_SERVICE_USER (e.g. "ssolwa" → ...SSOLoginWebApp...)
947
+ 3. Substring of HOSTS (e.g. hostname fragment)
948
+
949
+ Case-insensitive throughout. An empty source_hint returns all files unchanged.
950
+ """
951
+ if not source_hint:
952
+ return props_files
953
+ hint = source_hint.lower()
954
+
955
+ def _props_contains(path: Path, key: str, hint: str) -> bool:
956
+ try:
957
+ for line in path.read_text(encoding="utf-8", errors="replace").splitlines():
958
+ stripped = line.strip()
959
+ if stripped.startswith("#"):
960
+ continue
961
+ if stripped.upper().startswith(key + "="):
962
+ val = stripped.split("=", 1)[1].partition("#")[0].strip().lower()
963
+ if hint in val:
964
+ return True
965
+ except OSError:
966
+ pass
967
+ return False
968
+
969
+ matched = []
970
+ for p in props_files:
971
+ if hint in p.stem.lower():
972
+ matched.append(p)
973
+ elif _props_contains(p, "REMOTE_SERVICE_USER", hint):
974
+ matched.append(p)
975
+ elif _props_contains(p, "HOSTS", hint):
976
+ matched.append(p)
977
+ return matched
978
+
979
+
980
+ # ── Tool execution ────────────────────────────────────────────────────────────
981
+
982
+ async def _run_tool(name: str, inputs: dict, cfg_loader, store, slack_client=None, user_id: str = "", channel: str = "", is_admin: bool = False) -> str:
983
+ if name == "get_status":
984
+ hours = int(inputs.get("hours", 24))
985
+ errors = store.get_recent_errors(hours)
986
+ fixes = store.get_recent_fixes(hours)
987
+ prs = store.get_open_prs()
988
+ top_errors = [
989
+ {
990
+ "message": e["message"][:120],
991
+ "count": e["count"],
992
+ "source": e["source"],
993
+ "last_seen": e["last_seen"],
994
+ }
995
+ for e in errors[:8]
996
+ ]
997
+ return json.dumps({
998
+ "window_hours": hours,
999
+ "errors_detected": len(errors),
1000
+ "top_errors": top_errors,
1001
+ "fixes_applied": sum(1 for f in fixes if f["status"] == "applied"),
1002
+ "fixes_pending": sum(1 for f in fixes if f["status"] == "pending"),
1003
+ "fixes_failed": sum(1 for f in fixes if f["status"] == "failed"),
1004
+ "open_prs": [
1005
+ {
1006
+ "repo": p["repo_name"],
1007
+ "branch": p["branch"],
1008
+ "pr_url": p["pr_url"],
1009
+ "age": p.get("timestamp", ""),
1010
+ }
1011
+ for p in prs
1012
+ ],
1013
+ "sentinel_paused": Path("SENTINEL_PAUSE").exists(),
1014
+ })
1015
+
1016
+ if name == "check_auth_status":
1017
+ import subprocess as _sp
1018
+ from .notify import get_circuit_status
1019
+ cfg = cfg_loader.sentinel
1020
+
1021
+ # Auth configuration
1022
+ has_key = bool(cfg.anthropic_api_key)
1023
+ pro_for_tasks = cfg.claude_pro_for_tasks
1024
+ if pro_for_tasks and has_key:
1025
+ primary, fallback = "claude_pro_oauth", "api_key"
1026
+ elif pro_for_tasks:
1027
+ primary, fallback = "claude_pro_oauth", None
1028
+ else:
1029
+ primary, fallback = "api_key", "claude_pro_oauth" if not has_key else "claude_pro_oauth"
1030
+
1031
+ # Claude CLI liveness check
1032
+ cli_ok, cli_version = False, ""
1033
+ try:
1034
+ r = _sp.run(
1035
+ [cfg.claude_code_bin, "--version"],
1036
+ capture_output=True, text=True, timeout=10,
1037
+ )
1038
+ if r.returncode == 0:
1039
+ cli_ok = True
1040
+ cli_version = r.stdout.strip() or r.stderr.strip()
1041
+ except Exception:
1042
+ pass
1043
+
1044
+ # Circuit breaker snapshot — only open (unhealthy) circuits appear here
1045
+ circuits = get_circuit_status()
1046
+
1047
+ # Fix engine stats (last 24 h)
1048
+ recent = store.get_recent_fixes(hours=24)
1049
+ counts = {"applied": 0, "failed": 0, "skipped": 0, "pending": 0}
1050
+ last_success = None
1051
+ for f in recent:
1052
+ s = f.get("status", "")
1053
+ if s in counts:
1054
+ counts[s] += 1
1055
+ if s == "applied" and not last_success:
1056
+ last_success = f.get("timestamp", "")
1057
+
1058
+ overall = "healthy"
1059
+ if circuits:
1060
+ overall = "degraded — rate/auth limit active on: " + ", ".join(circuits)
1061
+ elif not cli_ok:
1062
+ overall = "warning — claude CLI not reachable"
1063
+
1064
+ return json.dumps({
1065
+ "overall": overall,
1066
+ "auth": {
1067
+ "api_key_configured": has_key,
1068
+ "claude_pro_for_tasks": pro_for_tasks,
1069
+ "primary_method": primary,
1070
+ "fallback_method": fallback,
1071
+ },
1072
+ "claude_cli": {"available": cli_ok, "version": cli_version},
1073
+ "rate_limit_circuits": circuits,
1074
+ "fix_engine_24h": {**counts, "last_successful_fix": last_success},
1075
+ })
1076
+
1077
+ if name == "create_issue":
1078
+ description = inputs["description"]
1079
+ target_repo = inputs.get("target_repo", "")
1080
+ project_arg = inputs.get("project", "")
1081
+
1082
+ if project_arg:
1083
+ project_dirs = _find_project_dirs(project_arg)
1084
+ if not project_dirs:
1085
+ all_names = [_read_project_name(d) for d in _find_project_dirs()]
1086
+ return json.dumps({
1087
+ "error": f"No project found matching '{project_arg}'",
1088
+ "available_projects": all_names,
1089
+ "action_needed": "Ask the user which project they meant.",
1090
+ })
1091
+ if len(project_dirs) > 1:
1092
+ matches = [_read_project_name(d) for d in project_dirs]
1093
+ return json.dumps({
1094
+ "error": f"Ambiguous project name '{project_arg}' — matches: {matches}",
1095
+ "action_needed": "Ask the user to clarify which project they mean.",
1096
+ })
1097
+ project_dir = project_dirs[0]
1098
+ else:
1099
+ project_dir = Path(".")
1100
+
1101
+ support_url = inputs.get("support_url", "").strip()
1102
+ attachments_summary = inputs.get("attachments_summary", "").strip()
1103
+ findings = inputs.get("findings", "").strip()
1104
+
1105
+ issues_dir = project_dir / "issues"
1106
+ issues_dir.mkdir(exist_ok=True)
1107
+ fname = f"slack-{uuid.uuid4().hex[:8]}.txt"
1108
+
1109
+ submitter_name = store.get_user_name(user_id) if user_id else ""
1110
+ submitter_line = f"SUBMITTED_BY: {submitter_name} ({user_id})" if user_id else ""
1111
+ lines = []
1112
+ if submitter_line:
1113
+ lines.append(submitter_line)
1114
+ if target_repo:
1115
+ lines.append(f"TARGET_REPO: {target_repo}")
1116
+ if support_url:
1117
+ lines.append(f"SUPPORT_URL: {support_url}")
1118
+ lines.append(f"SUBMITTED_AT: {datetime.now(timezone.utc).isoformat()}")
1119
+ lines.append("")
1120
+ lines.append(description)
1121
+ if findings:
1122
+ lines.append(f"\nEVIDENCE (gathered by Sentinel Boss):\n{findings}")
1123
+ if attachments_summary:
1124
+ lines.append(f"\nATTACHMENTS:\n{attachments_summary}")
1125
+ content = "\n".join(lines)
1126
+ (issues_dir / fname).write_text(content, encoding="utf-8")
1127
+
1128
+ # Touch SENTINEL_POLL_NOW so the target instance picks it up immediately
1129
+ (project_dir / "SENTINEL_POLL_NOW").touch()
1130
+
1131
+ project_label = _read_project_name(project_dir.resolve()) if project_arg else "this project"
1132
+ logger.info("Boss created issue for %s: %s", project_label, fname)
1133
+ if user_id:
1134
+ try:
1135
+ store.record_submitted_issue(
1136
+ user_id=user_id,
1137
+ user_name=submitter_name,
1138
+ project=project_label,
1139
+ fname=fname,
1140
+ description=description,
1141
+ )
1142
+ except Exception as _rec_err:
1143
+ logger.debug("Boss: could not record submitted issue: %s", _rec_err)
1144
+ return json.dumps({
1145
+ "status": "queued",
1146
+ "project": project_label,
1147
+ "file": fname,
1148
+ "note": f"Delivered to '{project_label}'. Sentinel will process it on the next poll cycle.",
1149
+ })
1150
+
1151
+ if name == "get_fix_details":
1152
+ fp = inputs["fingerprint"]
1153
+ fix = store.get_confirmed_fix(fp) or store.get_marker_seen_fix(fp)
1154
+ if not fix:
1155
+ # Fallback: search recent fixes by prefix
1156
+ recent = store.get_recent_fixes(hours=72)
1157
+ fix = next((f for f in recent if f.get("fingerprint", "").startswith(fp)), None)
1158
+ return json.dumps(fix or {"error": "not found"})
1159
+
1160
+ if name == "list_pending_prs":
1161
+ prs = store.get_open_prs()
1162
+ return json.dumps({
1163
+ "count": len(prs),
1164
+ "open_prs": [
1165
+ {
1166
+ "repo": p["repo_name"],
1167
+ "branch": p["branch"],
1168
+ "pr_url": p["pr_url"],
1169
+ "timestamp": p.get("timestamp", ""),
1170
+ }
1171
+ for p in prs
1172
+ ],
1173
+ })
1174
+
1175
+ if name == "pause_sentinel":
1176
+ Path("SENTINEL_PAUSE").touch()
1177
+ logger.info("Boss: SENTINEL_PAUSE created")
1178
+ return json.dumps({"status": "paused"})
1179
+
1180
+ if name == "resume_sentinel":
1181
+ p = Path("SENTINEL_PAUSE")
1182
+ if p.exists():
1183
+ p.unlink()
1184
+ logger.info("Boss: SENTINEL_PAUSE removed")
1185
+ return json.dumps({"status": "resumed"})
1186
+
1187
+ if name == "list_projects":
1188
+ projects = []
1189
+ for d in _find_project_dirs():
1190
+ repo_cfg_dir = d / "config" / "repo-configs"
1191
+ repos_in_project = []
1192
+ if repo_cfg_dir.exists():
1193
+ for p in sorted(repo_cfg_dir.glob("*.properties")):
1194
+ if p.name.startswith("_"):
1195
+ continue
1196
+ repo_url = ""
1197
+ for line in p.read_text(encoding="utf-8", errors="ignore").splitlines():
1198
+ if line.startswith("REPO_URL"):
1199
+ repo_url = line.split("=", 1)[-1].strip()
1200
+ break
1201
+ repos_in_project.append({"repo": p.stem, "url": repo_url})
1202
+ projects.append({
1203
+ "project": _read_project_name(d),
1204
+ "dir": d.name,
1205
+ "running": (d / "sentinel.pid").exists(),
1206
+ "this": d.resolve() == Path(".").resolve(),
1207
+ "repos": repos_in_project,
1208
+ })
1209
+ return json.dumps({"projects": projects})
1210
+
1211
+ if name == "search_logs":
1212
+ query = inputs.get("query", "")
1213
+ source = inputs.get("source", "").lower()
1214
+ max_matches = int(inputs.get("max_matches", 30))
1215
+ tail_override = inputs.get("tail")
1216
+
1217
+ # ── Preferred path: search locally-synced files (instant, no SSH) ──────
1218
+ synced_base = Path("workspace/synced")
1219
+ if synced_base.exists():
1220
+ log_cfg_dir_s = Path("config") / "log-configs"
1221
+ candidate_sources = (
1222
+ [p.stem for p in _filter_log_sources(sorted(log_cfg_dir_s.glob("*.properties")), source)]
1223
+ if log_cfg_dir_s.exists() else
1224
+ [d.name for d in sorted(synced_base.iterdir()) if d.is_dir()]
1225
+ )
1226
+ synced_results = []
1227
+ try:
1228
+ qpat_s = re.compile(query, re.IGNORECASE)
1229
+ except re.error:
1230
+ qpat_s = re.compile(re.escape(query), re.IGNORECASE)
1231
+ for src_name in candidate_sources:
1232
+ src_dir = synced_base / src_name
1233
+ if not src_dir.is_dir():
1234
+ continue
1235
+ for log_file in sorted(src_dir.glob("*")):
1236
+ try:
1237
+ lines = log_file.read_text(encoding="utf-8", errors="replace").splitlines()
1238
+ matches = [ln[:300] for ln in lines if qpat_s.search(ln)][:max_matches]
1239
+ if matches:
1240
+ synced_results.append({"source": src_name, "file": log_file.name, "matches": matches})
1241
+ except Exception:
1242
+ pass
1243
+ if synced_results:
1244
+ total = sum(len(r["matches"]) for r in synced_results)
1245
+ return json.dumps({
1246
+ "query": query,
1247
+ "mode": "synced",
1248
+ "total_matches": total,
1249
+ "results": synced_results,
1250
+ "note": "Results from locally-synced files. No SSH needed.",
1251
+ })
1252
+
1253
+ # ── Live fetch path: SSH to servers and grep in real time ──────────────
1254
+ script = Path(__file__).resolve().parent.parent / "scripts" / "fetch_log.sh"
1255
+ log_cfg_dir = Path("config") / "log-configs"
1256
+ if script.exists() and log_cfg_dir.exists():
1257
+ props_files = _filter_log_sources(sorted(log_cfg_dir.glob("*.properties")), source)
1258
+ if props_files:
1259
+ live_results = []
1260
+ for props in props_files:
1261
+ env = os.environ.copy()
1262
+ env["GREP_FILTER"] = query
1263
+ if tail_override:
1264
+ env["TAIL"] = str(tail_override)
1265
+ try:
1266
+ r = subprocess.run(
1267
+ ["bash", str(script), str(props)],
1268
+ capture_output=True, text=True, timeout=60, env=env,
1269
+ )
1270
+ try:
1271
+ _qpat = re.compile(query, re.IGNORECASE)
1272
+ except re.error:
1273
+ _qpat = re.compile(re.escape(query), re.IGNORECASE)
1274
+ lines = (r.stdout or "").strip().splitlines()
1275
+ matches = [ln[:300] for ln in lines if _qpat.search(ln)][:max_matches]
1276
+ if matches:
1277
+ live_results.append({"source": props.stem, "matches": matches})
1278
+ logger.info("Boss search_logs live %s rc=%d found=%d", props.stem, r.returncode, len(matches))
1279
+ except subprocess.TimeoutExpired:
1280
+ live_results.append({"source": props.stem, "error": "timed out"})
1281
+ except Exception as e:
1282
+ live_results.append({"source": props.stem, "error": str(e)})
1283
+ total = sum(len(r.get("matches", [])) for r in live_results)
1284
+ return json.dumps({
1285
+ "query": query,
1286
+ "mode": "live",
1287
+ "total_matches": total,
1288
+ "results": live_results,
1289
+ "note": (
1290
+ "Results already include a per-source breakdown. "
1291
+ "Do NOT call search_logs again with a source filter to 'refine' — "
1292
+ "use these results directly."
1293
+ ) if total > 0 else None,
1294
+ })
1295
+
1296
+ # ── Fallback: search locally-cached log files ──────────────────────────
1297
+ # Reaching here means: live script unavailable OR source filter matched no config files.
1298
+ # A result with files_searched=0 means the source name wasn't recognised — NOT that
1299
+ # there are no log entries. Do not interpret this as "no results found".
1300
+ fetched_dir = Path("workspace/fetched")
1301
+ if not fetched_dir.exists():
1302
+ return json.dumps({
1303
+ "error": "No fetched logs found and fetch_log.sh unavailable",
1304
+ "note": "This is a config/setup problem, not a 'no results' answer.",
1305
+ })
1306
+ try:
1307
+ pattern = re.compile(query, re.IGNORECASE)
1308
+ except re.error as e:
1309
+ return json.dumps({"error": f"Invalid regex: {e}"})
1310
+ results = []
1311
+ for log_file in sorted(fetched_dir.glob("*.log")):
1312
+ if source and source not in log_file.name.lower():
1313
+ continue
1314
+ try:
1315
+ lines = log_file.read_text(encoding="utf-8", errors="ignore").splitlines()
1316
+ matches = [
1317
+ {"line": i + 1, "text": line[:300]}
1318
+ for i, line in enumerate(lines)
1319
+ if pattern.search(line)
1320
+ ][:max_matches]
1321
+ if matches:
1322
+ results.append({"file": log_file.name, "matches": matches})
1323
+ except Exception:
1324
+ pass
1325
+ total = sum(len(r["matches"]) for r in results)
1326
+ files_searched = len(list(fetched_dir.glob("*.log")))
1327
+ result = {
1328
+ "query": query,
1329
+ "mode": "cached",
1330
+ "total_matches": total,
1331
+ "files_searched": files_searched,
1332
+ "results": results,
1333
+ }
1334
+ if files_searched == 0:
1335
+ result["warning"] = (
1336
+ "Source name not recognised in cached files — this is a lookup failure, not 'no results'. "
1337
+ "If you already have results from a broader search_logs call, use those. Stop retrying."
1338
+ )
1339
+ return json.dumps(result)
1340
+
1341
+
1342
+ if name == "filter_logs":
1343
+ import re as _re
1344
+ from datetime import datetime, timedelta, timezone as _tz
1345
+ query_f = inputs.get("query", "")
1346
+ source_f = inputs.get("source", "").lower()
1347
+ since_hours = inputs.get("since_hours")
1348
+ max_matches = int(inputs.get("max_matches", 50))
1349
+ case_flag = 0 if inputs.get("case_sensitive") else _re.IGNORECASE
1350
+ try:
1351
+ pat = _re.compile(query_f, case_flag)
1352
+ except _re.error as e:
1353
+ return json.dumps({"error": f"Invalid regex: {e}"})
1354
+
1355
+ synced_base = Path("workspace/synced")
1356
+ if not synced_base.exists():
1357
+ return json.dumps({
1358
+ "error": "No synced logs found.",
1359
+ "hint": "Log sync runs every SYNC_INTERVAL_SECONDS (default 300s). "
1360
+ "If just started, wait a minute then try again.",
1361
+ })
1362
+
1363
+ # Build cutoff timestamp for since_hours filter
1364
+ cutoff = None
1365
+ if since_hours:
1366
+ cutoff = datetime.now(_tz.utc) - timedelta(hours=int(since_hours))
1367
+
1368
+ # Determine which source directories to search
1369
+ if source_f:
1370
+ src_dirs = [d for d in sorted(synced_base.iterdir())
1371
+ if d.is_dir() and source_f in d.name.lower()]
1372
+ else:
1373
+ src_dirs = [d for d in sorted(synced_base.iterdir()) if d.is_dir()]
1374
+
1375
+ if not src_dirs:
1376
+ available = [d.name for d in synced_base.iterdir() if d.is_dir()]
1377
+ return json.dumps({
1378
+ "error": f"No synced source matching '{source_f}'",
1379
+ "available_sources": available,
1380
+ })
1381
+
1382
+ results = []
1383
+ total_matches = 0
1384
+ for src_dir in src_dirs:
1385
+ for log_file in sorted(src_dir.glob("*")):
1386
+ try:
1387
+ lines = log_file.read_text(encoding="utf-8", errors="replace").splitlines()
1388
+ matches = []
1389
+ for line in lines:
1390
+ if not pat.search(line):
1391
+ continue
1392
+ if cutoff:
1393
+ # Try to parse timestamp from line
1394
+ from .log_fetcher import _parse_line_ts
1395
+ ts = _parse_line_ts(line)
1396
+ if ts and ts < cutoff:
1397
+ continue
1398
+ matches.append(line[:300])
1399
+ if len(matches) >= max_matches:
1400
+ break
1401
+ if matches:
1402
+ results.append({
1403
+ "source": src_dir.name,
1404
+ "file": log_file.name,
1405
+ "matches": matches,
1406
+ })
1407
+ total_matches += len(matches)
1408
+ except Exception:
1409
+ pass
1410
+
1411
+ if not results:
1412
+ return json.dumps({
1413
+ "query": query_f,
1414
+ "total_matches": 0,
1415
+ "sources_searched": [d.name for d in src_dirs],
1416
+ "note": "No matches found in synced logs.",
1417
+ })
1418
+
1419
+ return json.dumps({
1420
+ "query": query_f,
1421
+ "mode": "local",
1422
+ "total_matches": total_matches,
1423
+ "sources_searched": [d.name for d in src_dirs],
1424
+ "results": results,
1425
+ })
1426
+
1427
+ if name == "trigger_poll":
1428
+ Path("SENTINEL_POLL_NOW").touch()
1429
+ logger.info("Boss: immediate poll requested")
1430
+ return json.dumps({"status": "triggered", "note": "Sentinel will run a poll cycle within seconds"})
1431
+
1432
+ if name == "get_repo_status":
1433
+ hours = int(inputs.get("hours", 24))
1434
+ fixes = store.get_recent_fixes(hours)
1435
+ errors = store.get_recent_errors(hours)
1436
+ by_repo: dict = {}
1437
+ for fix in fixes:
1438
+ repo = fix.get("repo_name", "unknown")
1439
+ s = by_repo.setdefault(repo, {"applied": 0, "pending": 0, "failed": 0, "skipped": 0})
1440
+ key = fix.get("status", "failed")
1441
+ s[key] = s.get(key, 0) + 1
1442
+ return json.dumps({"window_hours": hours, "total_errors": len(errors), "by_repo": by_repo})
1443
+
1444
+ if name == "list_recent_commits":
1445
+ limit = int(inputs.get("limit", 5))
1446
+ results = []
1447
+ for repo_name, repo in cfg_loader.repos.items():
1448
+ local = Path(repo.local_path)
1449
+ if not local.exists():
1450
+ continue
1451
+ try:
1452
+ r = subprocess.run(
1453
+ ["git", "log", "--oneline", "--grep=sentinel", "-n", str(limit)],
1454
+ cwd=str(local), capture_output=True, text=True, timeout=10,
1455
+ )
1456
+ commits = r.stdout.strip().splitlines()
1457
+ if commits:
1458
+ results.append({"repo": repo_name, "commits": commits})
1459
+ except Exception:
1460
+ pass
1461
+ return json.dumps({"sentinel_commits": results})
1462
+
1463
+ if name == "pull_repo":
1464
+ target = inputs.get("repo", "").lower()
1465
+ results = []
1466
+ for repo_name, repo in cfg_loader.repos.items():
1467
+ if target and target not in repo_name.lower():
1468
+ continue
1469
+ local = Path(repo.local_path)
1470
+ if not local.exists():
1471
+ results.append({"repo": repo_name, "status": "error", "detail": "local path not found"})
1472
+ continue
1473
+ try:
1474
+ r = subprocess.run(
1475
+ ["git", "pull", "--rebase", "origin", repo.branch],
1476
+ cwd=str(local), capture_output=True, text=True, timeout=60,
1477
+ )
1478
+ last_line = r.stdout.strip().splitlines()[-1] if r.stdout.strip() else "already up to date"
1479
+ if r.returncode == 0:
1480
+ results.append({"repo": repo_name, "status": "ok", "detail": last_line})
1481
+ else:
1482
+ results.append({"repo": repo_name, "status": "error", "detail": r.stderr.strip()})
1483
+ except Exception as e:
1484
+ results.append({"repo": repo_name, "status": "error", "detail": str(e)})
1485
+ return json.dumps({"results": results})
1486
+
1487
+ if name == "pull_config":
1488
+ target = inputs.get("project", "")
1489
+ dirs = _find_project_dirs(target)
1490
+ if not dirs:
1491
+ return json.dumps({"error": f"No project found matching '{target}'"})
1492
+ results = []
1493
+ for d in dirs:
1494
+ res = _git_pull(d)
1495
+ results.append({"project": _read_project_name(d), "dir": d.name, **res})
1496
+ logger.info("Boss: pull_config %s → %s", d.name, res["status"])
1497
+ return json.dumps({"results": results})
1498
+
1499
+ if name == "fetch_logs":
1500
+ source_filter = inputs.get("source", "").lower()
1501
+ debug = bool(inputs.get("debug", False))
1502
+ tail_override = inputs.get("tail")
1503
+ grep_override = inputs.get("grep_filter", "")
1504
+
1505
+ # Find fetch_log.sh relative to this file
1506
+ script = Path(__file__).resolve().parent.parent / "scripts" / "fetch_log.sh"
1507
+ if not script.exists():
1508
+ return json.dumps({"error": f"fetch_log.sh not found at {script}"})
1509
+
1510
+ log_cfg_dir = Path("config") / "log-configs"
1511
+ if not log_cfg_dir.exists():
1512
+ return json.dumps({"error": "config/log-configs/ not found"})
1513
+
1514
+ props_files = _filter_log_sources(sorted(log_cfg_dir.glob("*.properties")), source_filter)
1515
+ if not props_files:
1516
+ return json.dumps({"error": f"No log-config found matching '{source_filter}'"})
1517
+
1518
+ results = []
1519
+ for props in props_files:
1520
+ env = os.environ.copy()
1521
+ if tail_override:
1522
+ env["TAIL"] = str(tail_override)
1523
+ if grep_override:
1524
+ env["GREP_FILTER"] = grep_override
1525
+
1526
+ cmd = ["bash", str(script)]
1527
+ if debug:
1528
+ cmd.append("--debug")
1529
+ cmd.append(str(props))
1530
+
1531
+ try:
1532
+ r = subprocess.run(
1533
+ cmd, capture_output=True, text=True, timeout=120, env=env,
1534
+ )
1535
+ output = (r.stdout or "").strip()
1536
+ stderr = (r.stderr or "").strip()
1537
+ results.append({
1538
+ "source": props.stem,
1539
+ "returncode": r.returncode,
1540
+ "output": output[-2000:] if output else "",
1541
+ "stderr": stderr[-1000:] if stderr else "",
1542
+ })
1543
+ logger.info("Boss fetch_logs %s rc=%d", props.stem, r.returncode)
1544
+ except subprocess.TimeoutExpired:
1545
+ results.append({"source": props.stem, "error": "timed out after 120s"})
1546
+ except Exception as e:
1547
+ results.append({"source": props.stem, "error": str(e)})
1548
+
1549
+ return json.dumps({"fetched": len(results), "results": results})
1550
+
1551
+ if name == "watch_bot":
1552
+ if not is_admin:
1553
+ return json.dumps({"error": "Admin access required to register bots for monitoring."})
1554
+ user_ids = inputs.get("user_ids", [])
1555
+ project_arg = inputs.get("project", "").strip()
1556
+ if not user_ids:
1557
+ return json.dumps({"error": "No user_ids provided"})
1558
+
1559
+ # Resolve + validate project — required for bot issue routing
1560
+ resolved_project = ""
1561
+ if project_arg:
1562
+ project_dirs = _find_project_dirs(project_arg)
1563
+ if not project_dirs:
1564
+ all_names = [_read_project_name(d) for d in _find_project_dirs()]
1565
+ return json.dumps({
1566
+ "error": f"No project found matching '{project_arg}'",
1567
+ "available_projects": all_names,
1568
+ "action_needed": "Ask the user which project these bot alerts belong to.",
1569
+ })
1570
+ if len(project_dirs) > 1:
1571
+ matches = [_read_project_name(d) for d in project_dirs]
1572
+ return json.dumps({
1573
+ "error": f"Ambiguous project name '{project_arg}' matches: {matches}",
1574
+ "action_needed": "Ask the user to clarify which project.",
1575
+ })
1576
+ resolved_project = _read_project_name(project_dirs[0])
1577
+ else:
1578
+ all_projects = _find_project_dirs()
1579
+ if len(all_projects) == 1:
1580
+ # Single project in workspace — auto-assign
1581
+ resolved_project = _read_project_name(all_projects[0])
1582
+ elif all_projects:
1583
+ all_names = [_read_project_name(d) for d in all_projects]
1584
+ return json.dumps({
1585
+ "error": "Cannot determine which project these bot alerts belong to.",
1586
+ "available_projects": all_names,
1587
+ "action_needed": "Ask the user to specify the project, then retry with project filled in.",
1588
+ })
1589
+
1590
+ results = []
1591
+ for uid in user_ids:
1592
+ if not slack_client:
1593
+ results.append({"user_id": uid, "status": "error", "reason": "no Slack client available"})
1594
+ continue
1595
+ try:
1596
+ info = await slack_client.users_info(user=uid)
1597
+ user = info.get("user", {})
1598
+ if not user.get("is_bot", False):
1599
+ results.append({"user_id": uid, "status": "skipped", "reason": "not a bot — only bots can be watched passively"})
1600
+ continue
1601
+ bot_name = user.get("real_name") or user.get("name") or uid
1602
+ store.add_watched_bot(uid, bot_name, added_by="boss", project_name=resolved_project)
1603
+ logger.info("Boss: now watching bot %s (%s) → project '%s'", bot_name, uid, resolved_project or "unset")
1604
+ results.append({"user_id": uid, "bot_name": bot_name, "project": resolved_project, "status": "watching"})
1605
+ except Exception as e:
1606
+ results.append({"user_id": uid, "status": "error", "reason": str(e)})
1607
+ return json.dumps({"results": results})
1608
+
1609
+ if name == "unwatch_bot":
1610
+ if not is_admin:
1611
+ return json.dumps({"error": "Admin access required to remove bots from monitoring."})
1612
+ user_ids = inputs.get("user_ids", [])
1613
+ if not user_ids:
1614
+ return json.dumps({"error": "No user_ids provided"})
1615
+ results = []
1616
+ for uid in user_ids:
1617
+ removed = store.remove_watched_bot(uid)
1618
+ logger.info("Boss: unwatch bot %s → %s", uid, "removed" if removed else "not found")
1619
+ results.append({"user_id": uid, "status": "removed" if removed else "not found"})
1620
+ return json.dumps({"results": results})
1621
+
1622
+ if name == "list_watched_bots":
1623
+ bots = store.get_watched_bots()
1624
+ return json.dumps({
1625
+ "count": len(bots),
1626
+ "bots": [
1627
+ {
1628
+ "bot_id": b["bot_id"],
1629
+ "bot_name": b["bot_name"],
1630
+ "project": b.get("project_name") or "",
1631
+ "added_by": b["added_by"],
1632
+ "added_at": b["added_at"],
1633
+ }
1634
+ for b in bots
1635
+ ],
1636
+ })
1637
+
1638
+ if name == "upgrade_sentinel":
1639
+ if not is_admin:
1640
+ return json.dumps({"error": "Admin access required to upgrade Sentinel."})
1641
+ import threading
1642
+
1643
+ # Sentinel is installed via npm — use `sentinel upgrade` which handles
1644
+ # npm install + Python bundle copy + restart via stopAll/startAll.
1645
+ # Run it in the background after a short delay so the Slack reply is
1646
+ # sent before the process is replaced.
1647
+ try:
1648
+ r = subprocess.run(
1649
+ ["sentinel", "--version"],
1650
+ capture_output=True, text=True, timeout=10,
1651
+ )
1652
+ sentinel_bin_ok = r.returncode == 0
1653
+ except Exception:
1654
+ sentinel_bin_ok = False
1655
+
1656
+ if not sentinel_bin_ok:
1657
+ return json.dumps({
1658
+ "status": "error",
1659
+ "note": "`sentinel` CLI not found. Run: npm install -g @misterhuydo/sentinel",
1660
+ })
1661
+
1662
+ def _do_upgrade():
1663
+ import time
1664
+ time.sleep(10) # give Slack time to post the reply
1665
+ subprocess.Popen(["sentinel", "upgrade"], close_fds=True)
1666
+
1667
+ threading.Thread(target=_do_upgrade, daemon=True).start()
1668
+ logger.info("Boss: upgrade_sentinel scheduled via `sentinel upgrade`")
1669
+ return json.dumps({
1670
+ "status": "ok",
1671
+ "note": "Upgrade started — pulling latest version via npm and restarting. Give me ~30 seconds then I'll be back.",
1672
+ })
1673
+
1674
+ if name == "ask_codebase":
1675
+ target = inputs.get("repo", "").lower()
1676
+ question = inputs.get("question", "")
1677
+
1678
+ # 1. Find repos whose name contains the target (e.g. "STS", "elprint-sales")
1679
+ matched = [(rn, r) for rn, r in cfg_loader.repos.items() if target in rn.lower()]
1680
+
1681
+ # 2. No repo match — check if target is a project name → use ALL repos in cfg_loader
1682
+ # (each Sentinel instance is scoped to one project, so all repos belong to it)
1683
+ if not matched:
1684
+ current_project = _read_project_name(Path("."))
1685
+ if target in current_project.lower() or current_project.lower() in target:
1686
+ matched = list(cfg_loader.repos.items())
1687
+
1688
+ if not matched:
1689
+ return json.dumps({
1690
+ "error": f"No repo or project found matching '{target}'",
1691
+ "available_repos": list(cfg_loader.repos.keys()),
1692
+ })
1693
+
1694
+ cfg = cfg_loader.sentinel
1695
+ env = os.environ.copy()
1696
+ # Only inject API key when Claude Pro is NOT preferred for heavy tasks
1697
+ if cfg.anthropic_api_key and not cfg.claude_pro_for_tasks:
1698
+ env["ANTHROPIC_API_KEY"] = cfg.anthropic_api_key
1699
+
1700
+ def _ask_one(repo_name, repo_cfg) -> dict:
1701
+ local_path = Path(repo_cfg.local_path)
1702
+ if not local_path.exists():
1703
+ return {"repo": repo_name, "error": f"not cloned yet at {local_path}"}
1704
+ prompt = (
1705
+ f"You are a code analyst. Answer the following question about the codebase at: {local_path}\n\n"
1706
+ f"Question: {question}\n\n"
1707
+ f"Use whatever tools you need to answer accurately. Be concise and direct. Plain text only."
1708
+ )
1709
+ try:
1710
+ r = subprocess.run(
1711
+ ([cfg.claude_code_bin, "--dangerously-skip-permissions", "--print", prompt]
1712
+ if os.getuid() != 0 else
1713
+ [cfg.claude_code_bin, "--print", prompt]),
1714
+ capture_output=True, text=True, timeout=180, env=env,
1715
+ cwd=str(local_path),
1716
+ )
1717
+ output = (r.stdout or "").strip()
1718
+ logger.info("Boss ask_codebase %s rc=%d len=%d", repo_name, r.returncode, len(output))
1719
+ if r.returncode != 0 and not output:
1720
+ raw_err = (r.stderr or "")
1721
+ alert_if_rate_limited(
1722
+ cfg.slack_bot_token, cfg.slack_channel,
1723
+ f"ask_codebase/{repo_name}", raw_err,
1724
+ )
1725
+ return {"repo": repo_name, "error": f"claude --print failed (rc={r.returncode}): {raw_err[:200]}"}
1726
+ return {"repo": repo_name, "answer": output[:3000]}
1727
+ except subprocess.TimeoutExpired:
1728
+ return {"repo": repo_name, "error": "timed out after 180s"}
1729
+ except Exception as e:
1730
+ return {"repo": repo_name, "error": str(e)}
1731
+
1732
+ if len(matched) == 1:
1733
+ result = _ask_one(*matched[0])
1734
+ # Unwrap single-repo result for cleaner response
1735
+ return json.dumps(result)
1736
+
1737
+ # Multiple repos — query each and combine
1738
+ results = [_ask_one(rn, r) for rn, r in matched]
1739
+ return json.dumps({"project": target, "repos_queried": len(results), "results": results})
1740
+
1741
+ if name == "restart_project":
1742
+ if not is_admin:
1743
+ return json.dumps({"error": "Admin access required to restart a project."})
1744
+ project_arg = inputs.get("project", "").lower()
1745
+ dirs = _find_project_dirs(project_arg)
1746
+ if not dirs:
1747
+ return json.dumps({"error": f"No project found matching '{project_arg}'"})
1748
+ results = []
1749
+ for d in dirs:
1750
+ stop_sh = d / "stop.sh"
1751
+ start_sh = d / "start.sh"
1752
+ if not stop_sh.exists() or not start_sh.exists():
1753
+ results.append({"project": d.name, "status": "error", "detail": "stop.sh or start.sh not found"})
1754
+ continue
1755
+ try:
1756
+ subprocess.run(["bash", str(stop_sh)], cwd=str(d), timeout=30)
1757
+ subprocess.run(["bash", str(start_sh)], cwd=str(d), timeout=30)
1758
+ results.append({"project": d.name, "status": "restarted"})
1759
+ logger.info("Boss: restarted project %s", d.name)
1760
+ except Exception as e:
1761
+ results.append({"project": d.name, "status": "error", "detail": str(e)})
1762
+ return json.dumps({"results": results})
1763
+
1764
+ if name == "tail_log":
1765
+ source = inputs.get("source", "").lower()
1766
+ lines = int(inputs.get("lines", 100))
1767
+ script = Path(__file__).resolve().parent.parent / "scripts" / "fetch_log.sh"
1768
+ log_cfg_dir = Path("config") / "log-configs"
1769
+
1770
+ if not script.exists():
1771
+ return json.dumps({"error": "fetch_log.sh not found"})
1772
+ if not log_cfg_dir.exists():
1773
+ return json.dumps({"error": "config/log-configs/ not found"})
1774
+
1775
+ props_files = sorted(log_cfg_dir.glob("*.properties"))
1776
+ if source:
1777
+ props_files = [p for p in props_files if source in p.stem.lower()]
1778
+ if not props_files:
1779
+ return json.dumps({"error": f"No log-config found matching '{source}'"})
1780
+
1781
+ results = []
1782
+ for props in props_files:
1783
+ env = os.environ.copy()
1784
+ env["TAIL"] = str(lines)
1785
+ env["GREP_FILTER"] = "" # no filter — show everything
1786
+ try:
1787
+ r = subprocess.run(
1788
+ ["bash", str(script), str(props)],
1789
+ capture_output=True, text=True, timeout=60, env=env,
1790
+ )
1791
+ tail_lines = (r.stdout or "").strip().splitlines()[-lines:]
1792
+ results.append({
1793
+ "source": props.stem,
1794
+ "lines": len(tail_lines),
1795
+ "content": "\n".join(tail_lines),
1796
+ })
1797
+ logger.info("Boss tail_log %s rc=%d lines=%d", props.stem, r.returncode, len(tail_lines))
1798
+ except subprocess.TimeoutExpired:
1799
+ results.append({"source": props.stem, "error": "timed out"})
1800
+ except Exception as e:
1801
+ results.append({"source": props.stem, "error": str(e)})
1802
+ return json.dumps({"results": results})
1803
+
1804
+ if name == "post_file":
1805
+ if not slack_client or not channel:
1806
+ return json.dumps({"error": "No Slack channel context — cannot upload file"})
1807
+ content = inputs.get("content", "")
1808
+ filename = inputs.get("filename", "sentinel-output.txt")
1809
+ title = inputs.get("title", filename)
1810
+ if not content:
1811
+ return json.dumps({"error": "No content provided"})
1812
+ try:
1813
+ await slack_client.files_upload_v2(
1814
+ channel=channel,
1815
+ content=content,
1816
+ filename=filename,
1817
+ title=title,
1818
+ )
1819
+ logger.info("Boss post_file: uploaded %s (%d bytes) to %s", filename, len(content), channel)
1820
+ return json.dumps({"ok": True, "filename": filename, "bytes": len(content)})
1821
+ except Exception as e:
1822
+ logger.warning("Boss post_file failed: %s", e)
1823
+ return json.dumps({"error": str(e)})
1824
+
1825
+ if name == "my_stats":
1826
+ hours = int(inputs.get("hours", 168))
1827
+ errors = store.get_recent_errors(hours)
1828
+ fixes = store.get_recent_fixes(hours)
1829
+ prs = store.get_open_prs()
1830
+ pending_conf = store.get_fixes_pending_confirmation()
1831
+ # Conversation stats
1832
+ history = store.load_conversation(user_id) if user_id else []
1833
+ hist_len = len(history)
1834
+ # Load conversation updated_at from DB
1835
+ conv_updated = ""
1836
+ try:
1837
+ import sqlite3 as _sqlite3
1838
+ with _sqlite3.connect(store.db_path) as _db:
1839
+ row = _db.execute(
1840
+ "SELECT updated_at FROM conversations WHERE user_id=?", (user_id,)
1841
+ ).fetchone()
1842
+ if row:
1843
+ conv_updated = row[0]
1844
+ except Exception:
1845
+ pass
1846
+ # Tally fix statuses
1847
+ by_status: dict = {}
1848
+ for fix in fixes:
1849
+ s = fix.get("status", "unknown")
1850
+ by_status[s] = by_status.get(s, 0) + 1
1851
+ # Fixes confirmed via sentinel marker in prod
1852
+ confirmed = [f for f in fixes if f.get("fix_outcome") == "confirmed"]
1853
+ regressed = [f for f in fixes if f.get("fix_outcome") == "regressed"]
1854
+ submitted = store.get_submitted_issues(user_id, hours=hours) if user_id else []
1855
+ submitted_recent = store.get_submitted_issues(user_id, hours=hours) if user_id else []
1856
+ return json.dumps({
1857
+ "conversation": {
1858
+ "messages_in_history": hist_len,
1859
+ "turns": hist_len // 2,
1860
+ "last_active": conv_updated or "no history",
1861
+ },
1862
+ "issues_you_submitted": {
1863
+ "total_in_window": len(submitted_recent),
1864
+ "all_time": len(store.get_submitted_issues(user_id) if user_id else []),
1865
+ "recent": [
1866
+ {"project": i["project"], "description": i["description"][:80],
1867
+ "submitted_at": i["submitted_at"]}
1868
+ for i in submitted_recent[:5]
1869
+ ],
1870
+ },
1871
+ "window_hours": hours,
1872
+ "errors_detected": len(errors),
1873
+ "fixes": {
1874
+ "applied": by_status.get("applied", 0),
1875
+ "pending_pr": len(prs),
1876
+ "failed": by_status.get("failed", 0),
1877
+ "skipped": by_status.get("skipped", 0),
1878
+ "error": by_status.get("error", 0),
1879
+ },
1880
+ "confirmed_in_prod": len(confirmed),
1881
+ "regressed_after_fix": len(regressed),
1882
+ "awaiting_confirmation": len(pending_conf),
1883
+ "open_prs": [
1884
+ {"repo": p["repo_name"], "pr_url": p["pr_url"], "timestamp": p["timestamp"]}
1885
+ for p in prs
1886
+ ],
1887
+ "top_errors": [
1888
+ {"message": e["message"][:100], "count": e["count"], "source": e["source"]}
1889
+ for e in errors[:5]
1890
+ ],
1891
+ })
1892
+ if name == "clear_my_history":
1893
+ if user_id:
1894
+ store.save_conversation(user_id, [])
1895
+ logger.info("Boss: cleared conversation history for user %s", user_id)
1896
+ return json.dumps({
1897
+ "status": "cleared",
1898
+ "note": "Your conversation history has been wiped. Next session starts fresh. [DONE]",
1899
+ })
1900
+ return json.dumps({"error": "cannot determine user — not clearing"})
1901
+
1902
+ # ── Admin-only tools ──────────────────────────────────────────────────────
1903
+ _ADMIN_TOOLS = {"list_all_users", "clear_user_history", "reset_fingerprint", "list_all_errors", "export_db"}
1904
+ if name in _ADMIN_TOOLS:
1905
+ if not is_admin:
1906
+ return json.dumps({"error": "Admin access required. You are not in SLACK_ADMIN_USERS."})
1907
+
1908
+ if name == "list_all_users":
1909
+ stats = store.get_all_user_stats()
1910
+ return json.dumps({"users": stats, "total": len(stats)})
1911
+
1912
+ if name == "clear_user_history":
1913
+ target = inputs.get("target_user_id", "").strip()
1914
+ if not target:
1915
+ return json.dumps({"error": "target_user_id is required"})
1916
+ store.save_conversation(target, [])
1917
+ display = store.get_user_name(target)
1918
+ logger.info("Boss admin: cleared history for user %s (%s) by admin %s", target, display, user_id)
1919
+ return json.dumps({"status": "cleared", "target_user_id": target, "display_name": display})
1920
+
1921
+ if name == "set_maintenance":
1922
+ repo_name = inputs.get("repo_name", "").strip()
1923
+ note = inputs.get("note", "").strip()
1924
+ if not repo_name:
1925
+ return json.dumps({"error": "repo_name is required"})
1926
+ store.set_health_state(repo_name, "confirmed", note=note)
1927
+ logger.info("Boss: maintenance confirmed for %s by %s (note: %s)", repo_name, user_id, note or "none")
1928
+ return json.dumps({
1929
+ "status": "confirmed",
1930
+ "repo": repo_name,
1931
+ "note": note or "none",
1932
+ "message": (
1933
+ f"Got it. I'll silently monitor {repo_name}'s health URL and "
1934
+ f"notify you as soon as it comes back online."
1935
+ ),
1936
+ })
1937
+
1938
+ if name == "reset_fingerprint":
1939
+ fp = inputs.get("fingerprint", "").strip()
1940
+ if not fp:
1941
+ return json.dumps({"error": "fingerprint is required"})
1942
+ found = store.reset_fingerprint(fp)
1943
+ logger.info("Boss admin: reset fingerprint %s by admin %s (found=%s)", fp, user_id, found)
1944
+ return json.dumps({"status": "reset" if found else "not_found", "fingerprint": fp,
1945
+ "note": "Sentinel will retry this error on the next poll." if found else "No fix record found for this fingerprint."})
1946
+
1947
+ if name == "list_all_errors":
1948
+ hours = int(inputs.get("hours", 0))
1949
+ errors = store.get_all_errors(hours)
1950
+ return json.dumps({"errors": errors[:100], "total": len(errors),
1951
+ "window_hours": hours or "all time"})
1952
+
1953
+ if name == "export_db":
1954
+ if not slack_client or not channel:
1955
+ return json.dumps({"error": "No Slack channel context — cannot upload file"})
1956
+ try:
1957
+ import sqlite3 as _sq
1958
+ import io as _io
1959
+ lines = []
1960
+ with _sq.connect(store.db_path) as _db:
1961
+ for tbl in ["errors", "fixes", "reports", "slack_users", "conversations", "submitted_issues"]:
1962
+ try:
1963
+ rows = _db.execute(f"SELECT * FROM {tbl}").fetchall() # noqa: S608
1964
+ cols = [d[0] for d in _db.execute(f"SELECT * FROM {tbl} LIMIT 0").description] # noqa: S608
1965
+ lines.append(f"=== {tbl} ({len(rows)} rows) ===")
1966
+ lines.append("\t".join(cols))
1967
+ for row in rows:
1968
+ lines.append("\t".join(str(v) if v is not None else "" for v in row))
1969
+ lines.append("")
1970
+ except Exception:
1971
+ lines.append(f"=== {tbl} (unavailable) ===\n")
1972
+ content = "\n".join(lines)
1973
+ await slack_client.files_upload_v2(
1974
+ channel=channel,
1975
+ content=content,
1976
+ filename="sentinel-db-export.tsv",
1977
+ title="Sentinel DB Export",
1978
+ )
1979
+ logger.info("Boss admin: exported DB (%d bytes) by admin %s", len(content), user_id)
1980
+ return json.dumps({"ok": True, "bytes": len(content)})
1981
+ except Exception as e:
1982
+ return json.dumps({"error": str(e)})
1983
+
1984
+ return json.dumps({"error": f"unknown tool: {name}"})
1985
+
1986
+
1987
+ # ── CLI fallback (OAuth / no API key) ────────────────────────────────────────
1988
+
1989
+ def _attachments_to_text(attachments: list[dict]) -> str:
1990
+ """Produce a plain-text summary of attachments to append to CLI prompts."""
1991
+ if not attachments:
1992
+ return ""
1993
+ parts = []
1994
+ for att in attachments:
1995
+ if att["type"] == "text":
1996
+ parts.append(
1997
+ f"[Attached file: {att['name']}]\n{att['content']}"
1998
+ )
1999
+ elif att["type"] == "image":
2000
+ parts.append(
2001
+ f"[Attached image: {att['name']}] (saved at {att['path']})"
2002
+ )
2003
+ else:
2004
+ parts.append(
2005
+ f"[Attached file: {att['name']}] (saved at {att['path']} read it if relevant)"
2006
+ )
2007
+ return "\n\nATTACHMENTS:\n" + "\n---\n".join(parts)
2008
+
2009
+
2010
+ def _attachments_to_api_blocks(attachments: list[dict]) -> list[dict]:
2011
+ """Convert attachments into Anthropic API message content blocks."""
2012
+ blocks: list[dict] = []
2013
+ for att in attachments:
2014
+ if att["type"] == "image":
2015
+ blocks.append({
2016
+ "type": "image",
2017
+ "source": {
2018
+ "type": "base64",
2019
+ "media_type": att.get("mime", "image/png"),
2020
+ "data": att["content"],
2021
+ },
2022
+ })
2023
+ elif att["type"] == "text":
2024
+ blocks.append({
2025
+ "type": "text",
2026
+ "text": f"[Attached file: {att['name']}]\n{att['content']}",
2027
+ })
2028
+ else:
2029
+ blocks.append({
2030
+ "type": "text",
2031
+ "text": f"[Attached file: {att['name']}] saved at {att['path']}",
2032
+ })
2033
+ return blocks
2034
+
2035
+
2036
# Matches CLI-fallback action directives, one per line, e.g.
#   ACTION: {"action": "pause_sentinel"}
# group(1) is the raw JSON payload (greedy `.*` runs to end of line; MULTILINE
# anchors ^ at each line start so several ACTION lines can be emitted).
_ACTION_RE = re.compile(r"^ACTION:\s*(\{.*\})", re.MULTILINE)
2037
+
2038
+
2039
async def _handle_with_cli(
    message: str,
    history: list,
    cfg_loader,
    store,
    slack_client=None,
    user_name: str = "",
    user_id: str = "",
    attachments: list | None = None,
    is_admin: bool = False,
) -> tuple[str, bool]:
    """Fallback: use `claude --print` for users without an Anthropic API key.

    Builds one big prompt (system text + live status + history + attachments),
    runs the `claude` CLI once, executes any `ACTION: {...}` directives the
    model emitted, and returns (reply_text, is_done). Mutates `history` in
    place by appending the user/assistant turn on success.
    """
    # Pre-fetch baseline state so the model can answer status questions
    # without a tool loop (the CLI path has no structured tool use).
    status_json = await _run_tool("get_status", {"hours": 24}, cfg_loader, store)
    prs_json = await _run_tool("list_pending_prs", {}, cfg_loader, store)

    # Pre-fetch log search if the message is a search request.
    # Use quoted strings as the query, or fall back to the full message.
    # Never hardcode field names — the query is whatever the user said.
    search_json = ""
    _search_kws = ("search", "find", "look for", "show me log", "grep", "entries for")
    if any(kw in message.lower() for kw in _search_kws):
        quoted = re.findall(r'"([^"]+)"', message)
        query = quoted[0] if quoted else message
        search_json = await _run_tool("search_logs", {"query": query}, cfg_loader, store)

    # SENTINEL_PAUSE is a marker file in the CWD — its presence means paused.
    paused = Path("SENTINEL_PAUSE").exists()
    repos = list(cfg_loader.repos.keys())
    log_sources = list(cfg_loader.log_sources.keys())
    ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")

    # Flatten the last few turns to plain text; API-path turns may carry
    # content-block lists, so extract only their text blocks.
    history_text = ""
    for msg in history[-8:]:
        role = msg["role"].upper()
        content = msg["content"]
        if isinstance(content, list):
            content = " ".join(
                (b.get("text", "") if isinstance(b, dict) else getattr(b, "text", ""))
                for b in content
                if (isinstance(b, dict) and b.get("type") == "text")
                or (hasattr(b, "type") and b.type == "text")
            )
        history_text += f"\n{role}: {content}"

    slack_mention = f"<@{user_id}>" if user_id else (user_name or "")
    known_users = store.get_all_users()
    users_hint = ", ".join(f"<@{uid}> = {name}" for uid, name in known_users.items())
    # Single flat prompt: system text, live context, conversation, then the
    # ACTION protocol the model must use since it has no real tool access here.
    prompt = (
        _SYSTEM
        + (f"\nYou are speaking with: {user_name} (Slack mention: {slack_mention})" if user_name else "")
        + "\nAlways start your reply by addressing the user directly using their Slack mention, e.g. \"<@U123> here is what I found...\"."
        + " Never use their plain name — always use the <@USER_ID> format so Slack highlights it."
        + (f"\nKnown Slack users: {users_hint}" if users_hint else "")
        + f"\n\nCurrent time: {ts}"
        + f"\nSentinel status: {'⏸ PAUSED' if paused else '▶ RUNNING'}"
        + f"\nManaged repos: {', '.join(repos) if repos else '(none configured)'}"
        + (f"\nLog sources: {', '.join(log_sources)}" if log_sources else "")
        + f"\nAdmin access for this user: {'YES — admin tools are available' if is_admin else 'NO — admin tools will be refused'}"
        + "\nNOTE: Running in CLI fallback mode — admin tools and some features are unavailable. Ask user to configure ANTHROPIC_API_KEY for full features."
        + f"\n\nCurrent status (last 24 h):\n{status_json}"
        + f"\n\nOpen PRs:\n{prs_json}"
        + (f"\n\nLog search results:\n{search_json}" if search_json else "")
        + (f"\n\nConversation so far:{history_text}" if history_text else "")
        + _attachments_to_text(attachments or [])
        + f"\n\nUSER: {message}"
        + "\n\nIf you need to take an action, include a line like:\n"
        + " ACTION: {\"action\": \"pause_sentinel\"}\n"
        + " ACTION: {\"action\": \"resume_sentinel\"}\n"
        + " ACTION: {\"action\": \"trigger_poll\"}\n"
        + " ACTION: {\"action\": \"create_issue\", \"description\": \"...\", \"target_repo\": \"\"}\n"
        + " ACTION: {\"action\": \"search_logs\", \"query\": \"<whatever the user asked to find>\"}\n"
        + "End with [DONE] if the request is fully handled."
    )

    cfg = cfg_loader.sentinel
    env = os.environ.copy()
    if cfg.anthropic_api_key:
        env["ANTHROPIC_API_KEY"] = cfg.anthropic_api_key

    try:
        # `--dangerously-skip-permissions` is refused by the CLI when running
        # as root, hence the os.getuid() branch.
        result = subprocess.run(
            ([cfg.claude_code_bin, "--dangerously-skip-permissions", "--print", prompt]
             if os.getuid() != 0 else
             [cfg.claude_code_bin, "--print", prompt]),
            capture_output=True, text=True, timeout=180, env=env,
        )
        output = (result.stdout or "").strip()
        if result.returncode != 0 or not output:
            stderr = (result.stderr or "").strip()
            logger.error(
                "Boss CLI call failed (rc=%d): stdout=%r stderr=%r",
                result.returncode, output[:200], stderr[:200],
            )
            raw_err = (result.stderr or "").strip()
            # Hard failure only when there is no usable stdout at all; a
            # nonzero rc with output still falls through and is used.
            if result.returncode != 0 and not output:
                full_err = f"exit {result.returncode}: {raw_err[:300]}"
                cfg = cfg_loader.sentinel
                alert_if_rate_limited(cfg.slack_bot_token, cfg.slack_channel,
                                      "sentinel_boss/cli", raw_err or full_err)
                return f":warning: `claude --print` failed ({full_err})", True
    except Exception as e:
        logger.error("Boss CLI call failed: %s", e)
        return f":warning: Boss unavailable: {e}", True

    # Execute every ACTION directive the model emitted; parse failures are
    # logged and skipped so one bad line doesn't kill the reply.
    for m in _ACTION_RE.finditer(output):
        try:
            action = json.loads(m.group(1))
            name = action.pop("action", "")
            if name:
                result_str = await _run_tool(name, action, cfg_loader, store, user_id=user_id)
                logger.info("Boss CLI action: %s → %s", name, result_str[:80])
        except Exception as e:
            logger.warning("Boss action parse error: %s", e)

    # Strip ACTION lines and the [DONE] marker from the user-visible reply.
    reply = _ACTION_RE.sub("", output).strip()
    is_done = "[DONE]" in reply
    reply = reply.replace("[DONE]", "").strip()
    if not reply:
        greeting = f"Hi {user_name}! " if user_name else "Hi! "
        reply = f"{greeting}I'm Sentinel, your autonomous DevOps agent. How can I help you?"

    # Persist the turn (caller saves `history` back to the store).
    history.append({"role": "user", "content": message})
    history.append({"role": "assistant", "content": reply})
    return reply, is_done
2162
+
2163
+
2164
+ # ── History serialization helpers ────────────────────────────────────────────
2165
+
2166
+ def _serialize_content(content) -> list:
2167
+ """Convert Anthropic SDK response content (Pydantic objects) to plain dicts.
2168
+
2169
+ The SDK returns TextBlock / ToolUseBlock instances. json.dumps(..., default=str)
2170
+ turns them into useless strings like "TextBlock(type='text', text='...')".
2171
+ This converts them to proper dicts so history round-trips through SQLite safely.
2172
+ """
2173
+ if not isinstance(content, list):
2174
+ return content
2175
+ result = []
2176
+ for block in content:
2177
+ if isinstance(block, dict):
2178
+ result.append(block)
2179
+ elif hasattr(block, "model_dump"):
2180
+ result.append(block.model_dump())
2181
+ elif hasattr(block, "dict"):
2182
+ result.append(block.dict())
2183
+ elif hasattr(block, "type"):
2184
+ if block.type == "text":
2185
+ result.append({"type": "text", "text": getattr(block, "text", "")})
2186
+ elif block.type == "tool_use":
2187
+ result.append({
2188
+ "type": "tool_use",
2189
+ "id": getattr(block, "id", ""),
2190
+ "name": getattr(block, "name", ""),
2191
+ "input": getattr(block, "input", {}),
2192
+ })
2193
+ else:
2194
+ result.append({"type": "text", "text": str(block)})
2195
+ return result
2196
+
2197
+
2198
+ def _clean_history(history: list) -> list:
2199
+ """Remove turns that would cause a 400 from the Anthropic API.
2200
+
2201
+ Strips orphaned tool_use blocks (assistant turn with tool_use but no
2202
+ following tool_result turn) and consecutive same-role turns that result
2203
+ from a previous session that crashed mid-tool-loop.
2204
+ """
2205
+ cleaned = []
2206
+ i = 0
2207
+ while i < len(history):
2208
+ turn = history[i]
2209
+ role = turn.get("role", "")
2210
+ content = turn.get("content", [])
2211
+
2212
+ # Drop assistant turns that contain tool_use if the next turn isn't tool_result
2213
+ if role == "assistant" and isinstance(content, list):
2214
+ has_tool_use = any(
2215
+ (isinstance(b, dict) and b.get("type") == "tool_use")
2216
+ for b in content
2217
+ )
2218
+ if has_tool_use:
2219
+ next_turn = history[i + 1] if i + 1 < len(history) else None
2220
+ next_content = (next_turn or {}).get("content", [])
2221
+ has_result = isinstance(next_content, list) and any(
2222
+ (isinstance(b, dict) and b.get("type") == "tool_result")
2223
+ for b in next_content
2224
+ )
2225
+ if not has_result:
2226
+ i += 1 # skip orphaned tool_use turn
2227
+ continue
2228
+
2229
+ # Drop consecutive same-role turns (keep the last one)
2230
+ if cleaned and cleaned[-1].get("role") == role:
2231
+ cleaned[-1] = turn
2232
+ else:
2233
+ cleaned.append(turn)
2234
+ i += 1
2235
+ return cleaned
2236
+
2237
+
2238
+ # ── API-key path (structured tools, full agentic loop) ────────────────────────
2239
+
2240
async def _handle_with_api(
    message: str,
    history: list,
    cfg_loader,
    store,
    slack_client=None,
    user_name: str = "",
    user_id: str = "",
    attachments: list | None = None,
    channel: str = "",
    is_admin: bool = False,
) -> tuple[str, bool]:
    """API-key path: full agentic loop with structured tool use.

    Runs Claude via the Anthropic SDK; keeps calling tools until the model
    replies with text only, then commits the turn to `history` and returns
    (reply_text, is_done). NOTE(review): the tool loop has no iteration cap —
    a model that keeps requesting tools would loop indefinitely; confirm this
    is acceptable upstream.
    """
    import anthropic

    api_key = cfg_loader.sentinel.anthropic_api_key or os.environ.get("ANTHROPIC_API_KEY", "")
    client = anthropic.Anthropic(api_key=api_key)

    # Gather live context to embed in the system prompt.
    paused = Path("SENTINEL_PAUSE").exists()  # pause marker file in CWD
    repos = list(cfg_loader.repos.keys())
    ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    known_projects = [_read_project_name(d) for d in _find_project_dirs()]
    log_sources = list(cfg_loader.log_sources.keys())
    slack_mention = f"<@{user_id}>" if user_id else (user_name or "")
    known_users = store.get_all_users()  # {user_id: display_name}
    users_hint = ", ".join(f"<@{uid}> = {name}" for uid, name in known_users.items())
    system = (
        _SYSTEM
        + (f"\nYou are speaking with: {user_name} (Slack mention: {slack_mention})" if user_name else "")
        + "\nAlways start your reply by addressing the user directly using their Slack mention, e.g. \"<@U123> here is what I found...\"."
        + " Never use their plain name — always use the <@USER_ID> format so Slack highlights it."
        + (f"\nKnown Slack users: {users_hint}" if users_hint else "")
        + f"\n\nCurrent time: {ts}"
        + f"\nSentinel status: {'⏸ PAUSED' if paused else '▶ RUNNING'}"
        + f"\nManaged repos: {', '.join(repos) if repos else '(none configured)'}"
        + (f"\nLog sources: {', '.join(log_sources)}" if log_sources else "")
        + (f"\nKnown projects in workspace: {', '.join(known_projects)}" if known_projects else "")
        + f"\nAdmin access for this user: {'YES — admin tools are available' if is_admin else 'NO — admin tools will be refused'}"
    )

    # Build user content — include attachment blocks if any
    attach_blocks = _attachments_to_api_blocks(attachments or [])
    if attach_blocks:
        user_content = attach_blocks + [{"type": "text", "text": message}]
    else:
        user_content = message

    # Work on a local copy — only commit to history on success to prevent
    # cascading 400s if the API rejects a malformed/corrupted history.
    messages = list(history) + [{"role": "user", "content": user_content}]

    while True:
        response = client.messages.create(
            model="claude-opus-4-6",
            max_tokens=2048,
            system=system,
            tools=_TOOLS,
            messages=messages,
        )

        # Split the response into visible text and tool-use requests.
        text_parts = []
        tool_blocks = []
        for block in response.content:
            if block.type == "text":
                text_parts.append(block.text)
            elif block.type == "tool_use":
                tool_blocks.append(block)

        # No tool requests → final answer; finish the turn.
        if not tool_blocks:
            reply = " ".join(text_parts).strip()
            is_done = "[DONE]" in reply
            reply = reply.replace("[DONE]", "").strip()
            if not reply:
                greeting = f"Hi {user_name}! " if user_name else "Hi! "
                reply = f"{greeting}I'm Sentinel, your autonomous DevOps agent. How can I help you?"
            # Heuristic override: if reply ends with a question, Claude is waiting for input
            if is_done and re.search(r'\?\s*$', reply):
                is_done = False
            # Commit to history only on success — serialize SDK objects to plain dicts
            history.append({"role": "user", "content": user_content})
            history.append({"role": "assistant", "content": _serialize_content(response.content)})
            return reply, is_done

        # Tool round: run every requested tool, feed results back, loop again.
        messages.append({"role": "assistant", "content": _serialize_content(response.content)})
        tool_results = []
        for tc in tool_blocks:
            result = await _run_tool(tc.name, tc.input, cfg_loader, store, slack_client=slack_client, user_id=user_id, channel=channel, is_admin=is_admin)
            logger.info("Boss tool: %s(%s) → %s", tc.name, tc.input, result[:120])
            tool_results.append({
                "type": "tool_result",
                "tool_use_id": tc.id,
                "content": result,
            })
        messages.append({"role": "user", "content": tool_results})
2333
+
2334
+
2335
+ # ── Main entry point ──────────────────────────────────────────────────────────
2336
+
2337
async def handle_message(
    message: str,
    history: list,
    cfg_loader,
    store,
    slack_client=None,
    user_name: str = "",
    user_id: str = "",
    attachments: list | None = None,
    channel: str = "",
    is_admin: bool = False,
) -> tuple[str, bool]:
    """
    Process one user message through the Sentinel Boss (Claude with tool use).

    Priority (matches the code below — the API key is tried first):
      1. ANTHROPIC_API_KEY (structured tools, full agentic loop)
      2. Claude Pro / OAuth via `claude --print` (CLI fallback, limited tools)

    Args:
        message: The user's message text.
        history: Mutable conversation history (appended to on success).
        cfg_loader: Config loader exposing `.sentinel`, `.repos`, `.log_sources`.
        store: Persistence layer (conversations, errors, fixes, users).
        slack_client / channel: Optional Slack context for file uploads.
        user_name / user_id: Slack identity of the requester.
        attachments: Optional pre-parsed attachment dicts.
        is_admin: Whether admin-only tools may run for this user.

    Returns:
        (reply_text, is_done)
          is_done=True  → session complete, release the Slack queue slot.
          is_done=False → waiting for user follow-up, keep the slot.
    """
    cfg = cfg_loader.sentinel
    api_key = cfg.anthropic_api_key or os.environ.get("ANTHROPIC_API_KEY", "")

    # 1st priority: ANTHROPIC_API_KEY — full structured tools, cheap per-token for Boss queries
    if api_key:
        try:
            import anthropic  # noqa: F401
            return await _handle_with_api(
                message, history, cfg_loader, store, slack_client=slack_client,
                user_name=user_name, user_id=user_id, attachments=attachments, channel=channel,
                is_admin=is_admin,
            )
        except Exception as api_err:
            err_str = str(api_err)
            # Detect rate-limit / auth failure and alert Slack before falling through
            if is_rate_limited(err_str):
                alert_if_rate_limited(cfg.slack_bot_token, cfg.slack_channel,
                                      "sentinel_boss/api", err_str)
            logger.warning("Boss: API key path failed (%s), trying CLI fallback", err_str)

    # 2nd priority: Claude Pro / OAuth via CLI (limited tools but no API key needed)
    cli_reply, cli_done = await _handle_with_cli(
        message, history, cfg_loader, store, slack_client=slack_client, user_name=user_name,
        user_id=user_id, attachments=attachments, is_admin=is_admin,
    )
    if not cli_reply.startswith(":warning:"):
        return cli_reply, cli_done

    # Both paths failed — alert Slack and return error
    alert_if_rate_limited(cfg.slack_bot_token, cfg.slack_channel,
                          "sentinel_boss/cli", cli_reply)
    if not api_key:
        # No auth at all configured — tell the admin exactly what to set up.
        no_auth_msg = (
            ":warning: *Sentinel Boss — no Claude auth configured*\n"
            "Configure at least one of:\n"
            "• `ANTHROPIC_API_KEY` in `sentinel.properties` — full features\n"
            "• Claude Pro OAuth: run `claude login` on the server — required for fix_engine\n"
            "See: https://github.com/misterhuydo/Sentinel#authentication"
        )
        slack_alert(cfg.slack_bot_token, cfg.slack_channel, no_auth_msg)
        return ":warning: No Claude authentication configured. See Slack for details.", True
    return cli_reply, cli_done