@misterhuydo/sentinel 1.2.5 → 1.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,2147 +1,2552 @@
1
- """
2
- sentinel_boss.py — Claude-backed Sentinel Boss.
3
-
4
- Claude acts as the boss: reads project state, decides on actions,
5
- executes them via tool use, and responds naturally. One agentic loop
6
- per turn — Claude may call multiple tools before replying.
7
- """
8
-
9
- import json
10
- import logging
11
- import os
12
- import re
13
- import subprocess
14
- import uuid
15
- from datetime import datetime, timezone
16
- from pathlib import Path
17
- from typing import Optional
18
-
19
- from .notify import alert_if_rate_limited, slack_alert, is_rate_limited
20
-
21
- logger = logging.getLogger(__name__)
22
-
23
- # ── System prompt ────────────────────────────────────────────────────────────
24
-
25
- _SYSTEM = """\
26
- You are Sentinel Boss — the AI interface for Sentinel, a 24/7 autonomous DevOps agent.
27
-
28
- Sentinel watches production logs, detects errors, generates code fixes via Claude Code,
29
- and opens GitHub PRs for admin review (or pushes directly if AUTO_PUBLISH=true).
30
-
31
- Your job:
32
- - Understand what the DevOps engineer needs in natural language
33
- - Query Sentinel's live state (errors, fixes, open PRs) on their behalf
34
- - Deliver tasks/issues to the right project — you know all projects in this workspace
35
- - Control Sentinel (pause/resume) when asked
36
- - Give honest, concise answers — you know this system inside out
37
- - If a project name is unclear or ambiguous, ask the engineer to clarify — never guess
38
-
39
- What you can do (tools available):
40
-
41
- 1. get_status — Show recent errors detected, fixes applied/pending, open PRs.
42
- e.g. "what happened today?", "any issues?", "show open PRs"
43
-
44
- 2. create_issue — Deliver a fix/task to any project in this workspace by short name.
45
- You know all project names — use list_projects if you're unsure.
46
- If the project name is ambiguous or not found, ask to clarify.
47
- e.g. "tell 1881 to fix X", "look into Y in elprint", "investigate Z"
48
-
49
- 3. pause_sentinel — Create SENTINEL_PAUSE file to halt all auto-fix activity.
50
- e.g. "pause sentinel", "stop auto-fixing"
51
-
52
- 4. resume_sentinel — Remove SENTINEL_PAUSE file to resume normal operation.
53
- e.g. "resume sentinel", "unpause"
54
-
55
- 5. list_projects — List all configured repos and log sources in this Sentinel instance.
56
- e.g. "what projects are you watching?", "list all repos"
57
-
58
- 6. search_logs — SSH live to servers and grep logs in real time (uses fetch_log.sh with
59
- the query as GREP_FILTER). Falls back to cached files if unavailable.
60
- e.g. "search logs for illegal PIN in 1881", "find X in SSOLWA", "grep logs for Z"
61
-
62
- 7. trigger_poll Trigger an immediate poll cycle without waiting for the schedule.
63
- e.g. "check now", "poll immediately", "don't wait, run now"
64
-
65
- 8. get_repo_status — Show the current git branch, last commit, and recent fix branches
66
- for a specific repository.
67
- e.g. "status of repo X", "what branch is cairn on?"
68
-
69
- 9. list_recent_commits List the most recent commits in a repo (including Sentinel's auto-fixes).
70
- e.g. "show me recent commits in elprint-sales", "what did sentinel commit?"
71
-
72
- 10. get_fix_detail — Get full details of a specific fix: error, patch path, PR URL, status.
73
- e.g. "show fix abc123", "details on that fix"
74
-
75
- 11. list_errors — List recent errors from the state store, optionally filtered by repo or source.
76
- e.g. "show all errors today", "what errors hit elprint this week?"
77
-
78
- 12. pull_repo — Run git pull on one or all managed application repos.
79
- e.g. "pull changes", "git pull all repos", "update the code"
80
-
81
- 13. pull_config — Run git pull on one or all Sentinel project config dirs.
82
- e.g. "pull config for 1881", "update sentinel config", "pull all configs"
83
-
84
- 14. fetch_logs — Run fetch_log.sh on demand to pull fresh logs from remote servers right now.
85
- Supports --debug mode and parameter overrides (tail count, grep filter).
86
- e.g. "fetch logs", "try fetch_log.sh for SSOLWA", "fetch logs with debug",
87
- "grab latest logs from STS", "fetch logs without filter"
88
-
89
- 15. watch_bot — Register a Slack bot for passive monitoring. Every message it posts is
90
- auto-queued as an issue in the bot's registered project.
91
- ALWAYS requires a project — infer from context or ask the user first.
92
- e.g. "listen to @alertbot", "watch @bot1 @bot2 for project 1881", "monitor @errorbot"
93
-
94
- 16. unwatch_bot — Remove a Slack bot from the passive watch list.
95
- e.g. "stop watching @alertbot", "unwatch @errorbot"
96
-
97
- 17. list_watched_bots — Show all Slack bots currently being passively monitored and which projects
98
- they are delivering to.
99
- e.g. "which bots are you watching?", "list monitored bots"
100
-
101
- 18. upgrade_sentinelPull the latest Sentinel agent code, update Python deps, and restart the
102
- process. Safe to run at any time — no restart if already up to date.
103
- e.g. "upgrade sentinel", "update sentinel", "upgrade yourself"
104
-
105
- 19. ask_codebase Ask any natural-language question about a managed repo's codebase.
106
- Claude Code answers using its full knowledge of the code.
107
- e.g. "what does the 1881 backend do?", "find PIN validation in elprint",
108
- "any TODOs in cairn?", "are there security issues in elprint-sales?"
109
-
110
- 20. restart_project — Stop and restart a specific Sentinel monitoring instance (stop.sh + start.sh).
111
- This restarts the Sentinel agent for that project, NOT the application itself.
112
- e.g. "restart sentinel for 1881", "restart the 1881 monitor", "reload elprint sentinel"
113
-
114
- 21. tail_log Fetch the last N lines of a log source live, without a grep filter.
115
- e.g. "show recent SSOLWA logs", "tail STS", "last 200 lines from 1881 logs"
116
-
117
- 22. post_file — Upload a text file to the Slack conversation (diff, log excerpt, report, CSV).
118
- Use when output is too large for chat, or the user asks to download/export something.
119
- e.g. "give me that as a file", "export the log", "send me the diff"
120
-
121
- When someone asks what you can do, what you support, what your capabilities are, or how you can help,
122
- reply with a short summary grouped by category:
123
-
124
- *Monitoring & status*
125
- `get_status` errors detected, fixes applied/pending/failed, open PRs "what happened today?"
126
- `get_repo_status` per-repo breakdown of errors and fixes — "how is elprint doing?"
127
- • `list_recent_commits` — recent Sentinel auto-fix commits — "what did Sentinel commit?"
128
-
129
- *Log management*
130
- • `fetch_logs` — pull fresh logs from servers right now — "fetch logs for SSOLWA"
131
- • `search_logs` — live SSH grep on production servers — "search logs for illegal PIN in 1881"
132
- • `tail_log` — last N lines of a log source, no filter — "show recent SSOLWA logs"
133
-
134
- *Codebase questions*
135
- • `ask_codebase` — any question about a repo's code — "what does 1881 do?", "find PIN validation", "any TODOs?", "security issues?"
136
-
137
- *Fix management*
138
- • `get_fix_details` — full details of a specific fix — "show fix abc123"
139
- `list_pending_prs` — all open Sentinel PRs awaiting review — "list open PRs"
140
- • `check_auth_status` — Claude auth health, rate-limit circuit state, fix engine 24 h stats "is Claude working?", "any rate limits?", "auth issues?"
141
-
142
- *Project & task delivery*
143
- • `list_projects` — all projects and repos Sentinel manages — "what projects do you manage?"
144
- • `create_issue` — deliver a task to any project by name — "tell 1881 to fix X"
145
- • `trigger_poll` — run a log-fetch + fix cycle right now — "check now"
146
- • `pause_sentinel` / `resume_sentinel` — halt or resume all auto-fix activity — "pause Sentinel"
147
-
148
- *Repo & config sync*
149
- • `pull_repo` — git pull on managed application repos — "pull latest code"
150
- • `pull_config` — git pull on Sentinel config dirs — "pull config for elprint"
151
-
152
- *File sharing*
153
- `post_file` upload a file to Slack — "give me that as a file", "export the log", "send me the diff"
154
-
155
- *Personal*
156
- • `my_stats` — your activity: issues submitted, fixes, conversation history — "my stats"
157
- `clear_my_history` — wipe your conversation history and start fresh — "clear my history"
158
-
159
- *Slack bot watching*
160
- • `list_watched_bots` — show all bots currently being monitored — "which bots are you watching?"
161
-
162
- *Admin* (SLACK_ADMIN_USERS if configured, otherwise all allowed users)
163
- • `watch_bot` — register a Slack bot for passive monitoring; its messages become issues — "listen to @alertbot"
164
- `unwatch_bot` — stop monitoring a bot — "stop watching @errorbot"
165
- • `restart_project` — stop + restart a Sentinel monitoring instance (not the app) — "restart sentinel for 1881"
166
- • `upgrade_sentinel` — pull latest Sentinel release and restart — "upgrade sentinel"
167
- `list_all_users` all Slack users who have talked to Sentinel + activity summary
168
- • `clear_user_history` — wipe a specific user's conversation history
169
- • `reset_fingerprint` — clear the 24h fix lock so Sentinel retries an error
170
- • `list_all_errors` — full unfiltered error database
171
- • `export_db` — dump full Sentinel state as a downloadable file
172
-
173
- Tone: direct, professional, like a senior engineer who owns the system.
174
- Don't pad responses. Don't say "Great question!" or "Certainly!".
175
- If you don't know something, use a tool to find out before saying you don't know.
176
-
177
- When to act vs. when to ask:
178
- - Clear command ("check status", "fetch logs", "pause sentinel") → call the tool immediately, reply with results.
179
- - Ambiguous or exploratory ("what does get_repo_status do?", "tell me about search_logs") → explain the tool naturally, then ask: "Want me to run it?"
180
- - Unclear intent (could be either) use judgment: brief explanation + "Want me to run this now?"
181
- - If a tool call will take a moment (search, fetch, pull), prefix your reply with a brief "working" line ending in "..." before the results, e.g. "Searching SSOLWA for TryDig activity..." then the actual output.
182
- Never just say a working line and stop — always follow it with the results in the same message.
183
-
184
- Session context critical rules:
185
- - Loaded conversation history is prior-session background only. It may be hours or days old.
186
- - NEVER say "the previous search", "I already fetched", "as I found earlier", or any phrase implying you already did part of the current task unless a tool result appears in THIS response's tool calls.
187
- - When handling a new request, call the tools fresh. Do not assume any prior tool result is still current or that any prior step "counts" toward the current task.
188
- - The only exception: if the user explicitly asks about something from the history ("what did you find earlier?"), you may reference it — but note it is from a prior session.
189
-
190
- Trust your tool results never contradict them:
191
- - If any search_logs call in this response returned total_matches > 0, you HAVE results. Report them.
192
- - Never say "no results found" or "nothing was found" when a tool result shows total_matches > 0.
193
- - If one source-specific call returns 0 but a broader call returned matches, use the broader results.
194
- - A cached result with files_searched=0 is a source-name lookup failure, NOT an absence of log data.
195
- Treat it as "source not recognised" and fall back to the broad search results you already have.
196
-
197
- Avoid redundant tool calls (within a single response only — always run tools fresh for new requests):
198
- - If a broad search (e.g. search_logs with no source filter) already returned results in THIS response, do NOT repeat the same search with a source filter to "refine" — use what you already fetched.
199
- - If a tool call fails in THIS response, do NOT retry the entire search from scratch. Continue with what succeeded and note the failure.
200
- - One pass per task: gather all needed data in a single round of tool calls, then produce the final answer.
201
-
202
- Issue identification — before calling create_issue:
203
- 1. Determine if the message is a REAL issue/task (bug report, feature request, investigation ask)
204
- vs. a status question, tool query, or casual chat. If not an issue, just answer normally.
205
- 2. If it IS an issue, gather what's needed before creating:
206
- - Project: which project? If unclear, ask. Use list_projects if you need to check names.
207
- - Context: what's the problem? Include everything: description, error text, steps to reproduce.
208
- - Attachments: summarise any files/screenshots the user shared.
209
- - Support URL: note any ticket/doc/link the user mentioned.
210
- - Identity: always captured automatically from the Slack session.
211
- 3. Populate `findings` with curated evidence only when relevant and concise:
212
- - If you ran search_logs, tail_log, ask_codebase, or get_status before creating the issue,
213
- summarise only the findings directly related to this specific issue.
214
- - Do NOT paste raw tool output. Summarise: which services, how often, key pattern, 1-3 example lines.
215
- - If the search returned nothing relevant, or the issue is purely user-described with no log evidence, leave `findings` empty.
216
- - The fix engine reads only the issue file. Give it signal, not noise — 500 words max.
217
- 4. Before calling the tool, confirm with the user in natural language:
218
- e.g. "I'll create an issue for project *1881* here's what I have: [summary]. Look right?"
219
- Wait for their confirmation before proceeding.
220
- EXCEPTION: if the user's message already contains a clear project + unambiguous description,
221
- skip the confirmation and create immediately don't ask when nothing is unclear.
222
- 5. After creating, tell them the issue was queued and Sentinel will pick it up on the next poll.
223
-
224
- When the engineer's request is fully handled, end your LAST message with the token: [DONE]
225
- IMPORTANT: Always write your actual reply text FIRST, then append [DONE] at the end. Example: "Hello! I'm Sentinel. [DONE]". Never output [DONE] as your only content.
226
- For greetings like "hello" or empty messages, introduce yourself briefly and offer help, then end with [DONE].
227
- If you need a follow-up from them, do NOT include [DONE] — wait for their next message.
228
- """
229
-
230
- # ── Tool definitions ─────────────────────────────────────────────────────────
231
-
232
- _TOOLS = [
233
- {
234
- "name": "get_status",
235
- "description": (
236
- "Get recent errors, fixes applied, fixes pending review, and open PRs. "
237
- "Use for: 'what happened today?', 'any issues?', 'how are things?', "
238
- "'what are the open PRs?', 'did sentinel fix anything?'"
239
- ),
240
- "input_schema": {
241
- "type": "object",
242
- "properties": {
243
- "hours": {
244
- "type": "integer",
245
- "description": "Look-back window in hours (default 24)",
246
- "default": 24,
247
- },
248
- },
249
- },
250
- },
251
- {
252
- "name": "create_issue",
253
- "description": (
254
- "Deliver a confirmed issue/task to a Sentinel project instance. "
255
- "Only call this after you have: (1) confirmed the message is a real issue or task, "
256
- "(2) identified the target project, (3) gathered enough context, and "
257
- "(4) confirmed with the user ('I'll create this issue for project X — does that look right?'). "
258
- "Do NOT call this for status questions, tool queries, or casual chat."
259
- ),
260
- "input_schema": {
261
- "type": "object",
262
- "properties": {
263
- "description": {
264
- "type": "string",
265
- "description": "Full problem/task description — include all context the user gave you",
266
- },
267
- "project": {
268
- "type": "string",
269
- "description": "Project short name (e.g. '1881', 'elprint'). Ask if unclear.",
270
- },
271
- "target_repo": {
272
- "type": "string",
273
- "description": "Specific repo within the project (omit to let Sentinel auto-route)",
274
- },
275
- "support_url": {
276
- "type": "string",
277
- "description": "Any URL the user shared (ticket, doc, screenshot link, etc.)",
278
- },
279
- "attachments_summary": {
280
- "type": "string",
281
- "description": "Summary of any files/screenshots the user attached",
282
- },
283
- "findings": {
284
- "type": "string",
285
- "description": (
286
- "A concise, curated summary of evidence directly relevant to this issue "
287
- "NOT raw tool output. Include only what the fix engine needs: "
288
- "key error patterns, affected services, approximate frequency/timestamps, "
289
- "and 1-3 representative log lines. Omit unrelated results. "
290
- "Keep under 500 words. Leave empty if no tool results are relevant."
291
- ),
292
- },
293
- },
294
- "required": ["description"],
295
- },
296
- },
297
- {
298
- "name": "get_fix_details",
299
- "description": "Get full details of a specific fix by fingerprint (8+ hex chars).",
300
- "input_schema": {
301
- "type": "object",
302
- "properties": {
303
- "fingerprint": {"type": "string"},
304
- },
305
- "required": ["fingerprint"],
306
- },
307
- },
308
- {
309
- "name": "list_pending_prs",
310
- "description": "List all open Sentinel PRs awaiting admin review.",
311
- "input_schema": {"type": "object", "properties": {}},
312
- },
313
- {
314
- "name": "check_auth_status",
315
- "description": (
316
- "Check Claude authentication health, current rate-limit / usage-limit circuit state, "
317
- "and fix engine stats for the last 24 h. "
318
- "Use when someone asks: 'is Claude working?', 'any rate limits?', 'why aren't fixes running?', "
319
- "'is the API key OK?', 'auth issues?', 'fix engine status'."
320
- ),
321
- "input_schema": {"type": "object", "properties": {}},
322
- },
323
- {
324
- "name": "pause_sentinel",
325
- "description": (
326
- "Pause ALL Sentinel fix activity immediately. "
327
- "Use when the engineer says 'pause', 'stop', 'freeze', or 'hold off'."
328
- ),
329
- "input_schema": {"type": "object", "properties": {}},
330
- },
331
- {
332
- "name": "resume_sentinel",
333
- "description": "Resume Sentinel fix activity after a pause.",
334
- "input_schema": {"type": "object", "properties": {}},
335
- },
336
- {
337
- "name": "list_projects",
338
- "description": (
339
- "List all projects (Sentinel instances) in this workspace and the repos "
340
- "each one manages. Use for: 'what projects do you manage?', 'list projects', "
341
- "'what repos are configured?', 'show me all projects'."
342
- ),
343
- "input_schema": {"type": "object", "properties": {}},
344
- },
345
- {
346
- "name": "search_logs",
347
- "description": (
348
- "Search production logs for a keyword or pattern. "
349
- "When a project or source is specified (or can be inferred), performs a LIVE fetch "
350
- "via fetch_log.sh with the query as the grep filter — SSHes directly to the server. "
351
- "Falls back to searching locally-cached log files when no source can be determined. "
352
- "Use for: 'search logs for illegal PIN in 1881', 'find X in SSOLWA logs', "
353
- "'what did user Y do?', 'show entries for appid=Z', 'grep logs for X'."
354
- ),
355
- "input_schema": {
356
- "type": "object",
357
- "properties": {
358
- "query": {
359
- "type": "string",
360
- "description": "Keyword or regex to grep for",
361
- },
362
- "source": {
363
- "type": "string",
364
- "description": "Log source name to search (partial match against log-config filenames, e.g. 'SSOLWA', '1881'). Leave empty to search all sources.",
365
- },
366
- "max_matches": {
367
- "type": "integer",
368
- "description": "Max matching lines to return per source (default 30)",
369
- "default": 30,
370
- },
371
- "tail": {
372
- "type": "integer",
373
- "description": (
374
- "Number of log lines to fetch from the server before grepping (default: config value, typically 500). "
375
- "Increase when the user asks for a longer time window — e.g. 'yesterday up to now' → use 5000-10000. "
376
- "Higher values take longer but cover more history."
377
- ),
378
- },
379
- },
380
- "required": ["query"],
381
- },
382
- },
383
- {
384
- "name": "trigger_poll",
385
- "description": (
386
- "Trigger an immediate log-fetch and error-detection cycle without waiting "
387
- "for the next scheduled interval. Use when: 'check now', 'run now', "
388
- "'poll immediately', 'don't wait'."
389
- ),
390
- "input_schema": {"type": "object", "properties": {}},
391
- },
392
- {
393
- "name": "get_repo_status",
394
- "description": (
395
- "Per-repository breakdown of errors detected and fixes applied. "
396
- "Use for: 'how is repo X doing?', 'which repo has the most issues?', "
397
- "'break down by repo'."
398
- ),
399
- "input_schema": {
400
- "type": "object",
401
- "properties": {
402
- "hours": {
403
- "type": "integer",
404
- "description": "Look-back window in hours (default 24)",
405
- "default": 24,
406
- },
407
- },
408
- },
409
- },
410
- {
411
- "name": "list_recent_commits",
412
- "description": (
413
- "List recent commits made by Sentinel across all managed repos. "
414
- "Use for: 'what did Sentinel commit?', 'show recent auto-fixes', 'what was changed?'."
415
- ),
416
- "input_schema": {
417
- "type": "object",
418
- "properties": {
419
- "limit": {
420
- "type": "integer",
421
- "description": "Max commits per repo (default 5)",
422
- "default": 5,
423
- },
424
- },
425
- },
426
- },
427
- {
428
- "name": "pull_repo",
429
- "description": (
430
- "Run git pull on one or all managed repos to fetch latest changes from GitHub. "
431
- "Use for: 'pull changes', 'git pull', 'update repo X', 'fetch latest code'."
432
- ),
433
- "input_schema": {
434
- "type": "object",
435
- "properties": {
436
- "repo": {
437
- "type": "string",
438
- "description": "Repo name to pull (omit to pull all configured repos)",
439
- },
440
- },
441
- },
442
- },
443
- {
444
- "name": "pull_config",
445
- "description": (
446
- "Run git pull on one or all Sentinel project config directories. "
447
- "Projects are matched by short name ('1881', 'elprint') or full dir name ('sentinel-1881'). "
448
- "Use for: 'pull config for 1881', 'update sentinel config', 'pull all configs'."
449
- ),
450
- "input_schema": {
451
- "type": "object",
452
- "properties": {
453
- "project": {
454
- "type": "string",
455
- "description": "Project short name or dir name to pull (omit for all projects)",
456
- },
457
- },
458
- },
459
- },
460
- {
461
- "name": "fetch_logs",
462
- "description": (
463
- "Run fetch_log.sh for one or all configured log sources to pull the latest logs "
464
- "from remote servers right now. Use for: 'fetch logs', 'run fetch_log.sh', "
465
- "'grab latest logs from SSOLWA', 'try fetch_log.sh for STS', "
466
- "'pull logs from server', 'get fresh logs'."
467
- ),
468
- "input_schema": {
469
- "type": "object",
470
- "properties": {
471
- "source": {
472
- "type": "string",
473
- "description": "Log source name to fetch (partial match, e.g. 'SSOLWA'). Omit to fetch all.",
474
- },
475
- "debug": {
476
- "type": "boolean",
477
- "description": "Run fetch_log.sh with --debug flag to show SSH/grep details",
478
- "default": False,
479
- },
480
- "tail": {
481
- "type": "integer",
482
- "description": "Override TAIL lines (how many log lines to fetch)",
483
- },
484
- "grep_filter": {
485
- "type": "string",
486
- "description": "Override GREP_FILTER (regex). Pass 'none' to disable filtering.",
487
- },
488
- },
489
- },
490
- },
491
- {
492
- "name": "watch_bot",
493
- "description": (
494
- "Tell Sentinel to passively monitor a Slack bot — queuing its messages as issues. "
495
- "Extract all <@UXXXXXX> user IDs from the message and pass them here. "
496
- "Sentinel verifies each is actually a bot (not a human) before adding to the watch list. "
497
- "IMPORTANT: a bot watcher is only useful if its issues can be delivered to a project. "
498
- "Try to infer the project from context (bot name, prior messages, available projects). "
499
- "If it cannot be determined, do NOT call this tool — instead ask the user which project "
500
- "the bot's alerts belong to, then call this tool with the project filled in. "
501
- "Use for: 'listen to @alertbot', 'watch @bot1 @bot2', 'monitor @errorbot'."
502
- ),
503
- "input_schema": {
504
- "type": "object",
505
- "properties": {
506
- "user_ids": {
507
- "type": "array",
508
- "items": {"type": "string"},
509
- "description": "Slack user IDs to watch — extract from <@UXXXXXX> patterns in the message",
510
- },
511
- "project": {
512
- "type": "string",
513
- "description": "Project short name this bot's issues should be routed to (e.g. '1881', 'elprint'). Infer from context or ask user before calling.",
514
- },
515
- },
516
- "required": ["user_ids"],
517
- },
518
- },
519
- {
520
- "name": "unwatch_bot",
521
- "description": (
522
- "Stop Sentinel from monitoring a Slack bot. "
523
- "Use for: 'stop watching @alertbot', 'unwatch @bot', 'remove @errorbot from watchers'."
524
- ),
525
- "input_schema": {
526
- "type": "object",
527
- "properties": {
528
- "user_ids": {
529
- "type": "array",
530
- "items": {"type": "string"},
531
- "description": "Slack user IDs to remove from the watch list",
532
- },
533
- },
534
- "required": ["user_ids"],
535
- },
536
- },
537
- {
538
- "name": "list_watched_bots",
539
- "description": (
540
- "List all Slack bots Sentinel is currently monitoring passively. "
541
- "Use for: 'who are you watching?', 'which bots are you monitoring?', 'list watched bots'."
542
- ),
543
- "input_schema": {"type": "object", "properties": {}},
544
- },
545
- {
546
- "name": "upgrade_sentinel",
547
- "description": (
548
- "Upgrade the Sentinel agent itself: git pull the latest code, update Python deps, "
549
- "then restart the process. Safe to call at any time — if already up to date, "
550
- "no restart is triggered. "
551
- "Use for: 'upgrade sentinel', 'update sentinel', 'upgrade yourself', "
552
- "'pull latest sentinel code', 'restart sentinel after upgrade'."
553
- ),
554
- "input_schema": {"type": "object", "properties": {}},
555
- },
556
- {
557
- "name": "ask_codebase",
558
- "description": (
559
- "Ask any natural-language question about a managed codebase. "
560
- "Accepts a repo name (e.g. 'STS', 'elprint-sales') OR a project name (e.g. '1881', 'elprint') "
561
- "— if a project name is given and it has multiple repos, all are queried. "
562
- "Claude Code answers using its full codebase knowledge — no need to specify how. "
563
- "Use for: 'what does 1881 do?', 'TODOs in 1881', 'find PIN validation in STS', "
564
- "'security issues in elprint-sales?', 'summarize the cairn repo'."
565
- ),
566
- "input_schema": {
567
- "type": "object",
568
- "properties": {
569
- "repo": {
570
- "type": "string",
571
- "description": "Repo name (e.g. 'STS', 'elprint-sales') OR project name (e.g. '1881', 'elprint') — project name queries all its repos",
572
- },
573
- "question": {
574
- "type": "string",
575
- "description": "Natural language question about the codebase",
576
- },
577
- },
578
- "required": ["repo", "question"],
579
- },
580
- },
581
- {
582
- "name": "restart_project",
583
- "description": (
584
- "Stop and restart a specific Sentinel monitoring instance (runs stop.sh then start.sh). "
585
- "This restarts the Sentinel agent process for that project — it does NOT restart the application itself. "
586
- "Use when: 'restart sentinel for 1881', 'reload the 1881 monitor', 'restart elprint sentinel'. "
587
- "Safer than restarting all projects at once."
588
- ),
589
- "input_schema": {
590
- "type": "object",
591
- "properties": {
592
- "project": {
593
- "type": "string",
594
- "description": "Project short name or dir name (e.g. '1881', 'elprint')",
595
- },
596
- },
597
- "required": ["project"],
598
- },
599
- },
600
- {
601
- "name": "my_stats",
602
- "description": (
603
- "Show the current user's personal Sentinel dashboard: "
604
- "conversation history length, issues they submitted, and "
605
- "a summary of Sentinel fix activity (errors caught, fixes applied, "
606
- "fixes pending PR review, fixes confirmed live, fixes failed). "
607
- "Use for: 'what have you done for me?', 'show my stats', "
608
- "'how many issues have been fixed?', 'my history', 'summary', "
609
- "'what did sentinel fix this week?', 'pending fixes', 'open PRs'."
610
- ),
611
- "input_schema": {
612
- "type": "object",
613
- "properties": {
614
- "hours": {
615
- "type": "integer",
616
- "description": "Look-back window in hours (default 168 = 7 days)",
617
- "default": 168,
618
- },
619
- },
620
- },
621
- },
622
- {
623
- "name": "clear_my_history",
624
- "description": (
625
- "Clear the current user's conversation history with Sentinel. "
626
- "After clearing, future sessions start with no memory of past conversations. "
627
- "Use for: 'clear my history', 'forget our conversation', "
628
- "'start fresh', 'reset my context', 'wipe my history'."
629
- ),
630
- "input_schema": {"type": "object", "properties": {}},
631
- },
632
- {
633
- "name": "tail_log",
634
- "description": (
635
- "Fetch the last N lines of a log source's live production logs without any grep filter. "
636
- "Use when: 'show me recent SSOLWA logs', 'tail STS', 'what's happening in 1881 logs right now', "
637
- "'show last 100 lines from SSOLWA'. Different from search_logs — no pattern required."
638
- ),
639
- "input_schema": {
640
- "type": "object",
641
- "properties": {
642
- "source": {
643
- "type": "string",
644
- "description": "Log source name (partial match against log-config filenames, e.g. 'SSOLWA', 'STS')",
645
- },
646
- "lines": {
647
- "type": "integer",
648
- "description": "Number of recent lines to fetch (default 100)",
649
- "default": 100,
650
- },
651
- },
652
- "required": ["source"],
653
- },
654
- },
655
- {
656
- "name": "post_file",
657
- "description": (
658
- "Upload a text file directly to the Slack conversation so the user can read or download it. "
659
- "Use when: output is too large for a chat message, the user asks to 'download', 'export', or "
660
- "'send as a file', or when formatted content (diffs, logs, CSVs, reports) is clearer as a file. "
661
- "e.g. 'give me that as a file', 'export the log', 'send me the diff for PR #41', "
662
- "'download the health report', 'export recent errors as CSV'"
663
- ),
664
- "input_schema": {
665
- "type": "object",
666
- "properties": {
667
- "content": {
668
- "type": "string",
669
- "description": "The full text content of the file to upload",
670
- },
671
- "filename": {
672
- "type": "string",
673
- "description": "Filename with extension, e.g. 'fix-ab12.diff', 'sentinel-report.txt', 'errors.csv', 'ssolwa.log'",
674
- },
675
- "title": {
676
- "type": "string",
677
- "description": "Optional display title shown above the file in Slack (defaults to filename)",
678
- },
679
- },
680
- "required": ["content", "filename"],
681
- },
682
- },
683
- {
684
- "name": "list_all_users",
685
- "description": (
686
- "ADMIN ONLY. List all Slack users who have ever talked to Sentinel, "
687
- "with their issue count and conversation message count. "
688
- "e.g. 'list all users', 'who has talked to you?', 'show user activity'"
689
- ),
690
- "input_schema": {"type": "object", "properties": {}},
691
- },
692
- {
693
- "name": "clear_user_history",
694
- "description": (
695
- "ADMIN ONLY. Clear the conversation history for a specific Slack user. "
696
- "e.g. 'clear history for huy', 'reset bob's conversation'"
697
- ),
698
- "input_schema": {
699
- "type": "object",
700
- "properties": {
701
- "user_id": {
702
- "type": "string",
703
- "description": "Slack user ID to clear (e.g. U01AB2CD3EF)",
704
- },
705
- },
706
- "required": ["user_id"],
707
- },
708
- },
709
- {
710
- "name": "reset_fingerprint",
711
- "description": (
712
- "ADMIN ONLY. Remove the 24h fix lock for an error fingerprint so Sentinel will retry it "
713
- "on the next poll cycle. Use when a fix attempt failed and you want to force a retry. "
714
- "e.g. 'retry fix abc123', 'reset fingerprint abc123de', 'let Sentinel try that error again'"
715
- ),
716
- "input_schema": {
717
- "type": "object",
718
- "properties": {
719
- "fingerprint": {
720
- "type": "string",
721
- "description": "Error fingerprint hash (8+ hex chars, from get_fix_details or list_all_errors)",
722
- },
723
- },
724
- "required": ["fingerprint"],
725
- },
726
- },
727
- {
728
- "name": "list_all_errors",
729
- "description": (
730
- "ADMIN ONLY. Return the full unfiltered error database — all fingerprints, counts, "
731
- "sources, and last-seen times. "
732
- "e.g. 'show all errors', 'full error list', 'dump the error DB'"
733
- ),
734
- "input_schema": {
735
- "type": "object",
736
- "properties": {
737
- "hours": {
738
- "type": "integer",
739
- "description": "Limit to errors seen in the last N hours (0 = all time)",
740
- "default": 0,
741
- },
742
- },
743
- },
744
- },
745
- {
746
- "name": "export_db",
747
- "description": (
748
- "ADMIN ONLY. Export the full Sentinel state (errors, fixes, PRs, users) as a "
749
- "downloadable text file posted to Slack. "
750
- "e.g. 'export the DB', 'download state', 'give me a full report file'"
751
- ),
752
- "input_schema": {"type": "object", "properties": {}},
753
- },
754
- ]
755
-
756
-
757
- # ── Workspace helpers ─────────────────────────────────────────────────────────
758
-
759
- def _workspace_dir() -> Path:
760
- return Path(".").resolve().parent
761
-
762
- def _short_name(dir_name: str) -> str:
763
- """'sentinel-1881' → '1881', 'sentinel-elprint' → 'elprint', others unchanged."""
764
- if dir_name.startswith("sentinel-"):
765
- return dir_name[len("sentinel-"):]
766
- return dir_name
767
-
768
- def _read_project_name(project_dir: Path) -> str:
769
- """Return PROJECT_NAME from sentinel.properties if set, else fall back to _short_name(dir)."""
770
- props = project_dir / "config" / "sentinel.properties"
771
- if props.exists():
772
- try:
773
- for line in props.read_text(encoding="utf-8", errors="ignore").splitlines():
774
- line = line.strip()
775
- if line.startswith("PROJECT_NAME"):
776
- _, _, val = line.partition("=")
777
- val = val.partition("#")[0].strip()
778
- if val:
779
- return val
780
- except Exception:
781
- pass
782
- return _short_name(project_dir.name)
783
-
784
def _find_project_dirs(target: str = "") -> list[Path]:
    """Return project dirs matching target (PROJECT_NAME, short name, or full dir name), or all if target empty."""
    needle = target.lower()

    def _matches(candidate: Path) -> bool:
        # Empty filter accepts everything; otherwise the needle must appear
        # in the dir name, its short name, or its configured PROJECT_NAME.
        if not needle:
            return True
        labels = (candidate.name, _short_name(candidate.name), _read_project_name(candidate))
        return any(needle in label.lower() for label in labels)

    found: list[Path] = []
    try:
        for entry in sorted(_workspace_dir().iterdir()):
            if not entry.is_dir() or entry.name in ("code", ".git"):
                continue
            if not (entry / "config").exists():
                continue  # every Sentinel project has a config/ dir
            if _matches(entry):
                found.append(entry)
    except Exception:
        # Best-effort: an unreadable workspace yields whatever was collected.
        pass
    return found
804
-
805
- def _git_pull(path: Path) -> dict:
806
- try:
807
- r = subprocess.run(
808
- ["git", "pull", "--rebase", "origin"],
809
- cwd=str(path), capture_output=True, text=True, timeout=60,
810
- )
811
- last = r.stdout.strip().splitlines()[-1] if r.stdout.strip() else "already up to date"
812
- return {"status": "ok" if r.returncode == 0 else "error",
813
- "detail": last if r.returncode == 0 else r.stderr.strip()}
814
- except Exception as e:
815
- return {"status": "error", "detail": str(e)}
816
-
817
-
818
- # ── Log-source name resolver ──────────────────────────────────────────────────
819
-
820
- def _filter_log_sources(props_files: list, source_hint: str) -> list:
821
- """
822
- Return the subset of props_files whose log source matches source_hint.
823
-
824
- Matching is tried in order (first match wins per file):
825
- 1. Substring of the filename stem (e.g. "sts" → STS.properties)
826
- 2. Substring of REMOTE_SERVICE_USER (e.g. "ssolwa" → ...SSOLoginWebApp...)
827
- 3. Substring of HOSTS (e.g. hostname fragment)
828
-
829
- Case-insensitive throughout. An empty source_hint returns all files unchanged.
830
- """
831
- if not source_hint:
832
- return props_files
833
- hint = source_hint.lower()
834
-
835
- def _props_contains(path: Path, key: str, hint: str) -> bool:
836
- try:
837
- for line in path.read_text(encoding="utf-8", errors="replace").splitlines():
838
- stripped = line.strip()
839
- if stripped.startswith("#"):
840
- continue
841
- if stripped.upper().startswith(key + "="):
842
- val = stripped.split("=", 1)[1].partition("#")[0].strip().lower()
843
- if hint in val:
844
- return True
845
- except OSError:
846
- pass
847
- return False
848
-
849
- matched = []
850
- for p in props_files:
851
- if hint in p.stem.lower():
852
- matched.append(p)
853
- elif _props_contains(p, "REMOTE_SERVICE_USER", hint):
854
- matched.append(p)
855
- elif _props_contains(p, "HOSTS", hint):
856
- matched.append(p)
857
- return matched
858
-
859
-
860
- # ── Tool execution ────────────────────────────────────────────────────────────
861
-
862
- async def _run_tool(name: str, inputs: dict, cfg_loader, store, slack_client=None, user_id: str = "", channel: str = "", is_admin: bool = False) -> str:
863
- if name == "get_status":
864
- hours = int(inputs.get("hours", 24))
865
- errors = store.get_recent_errors(hours)
866
- fixes = store.get_recent_fixes(hours)
867
- prs = store.get_open_prs()
868
- top_errors = [
869
- {
870
- "message": e["message"][:120],
871
- "count": e["count"],
872
- "source": e["source"],
873
- "last_seen": e["last_seen"],
874
- }
875
- for e in errors[:8]
876
- ]
877
- return json.dumps({
878
- "window_hours": hours,
879
- "errors_detected": len(errors),
880
- "top_errors": top_errors,
881
- "fixes_applied": sum(1 for f in fixes if f["status"] == "applied"),
882
- "fixes_pending": sum(1 for f in fixes if f["status"] == "pending"),
883
- "fixes_failed": sum(1 for f in fixes if f["status"] == "failed"),
884
- "open_prs": [
885
- {
886
- "repo": p["repo_name"],
887
- "branch": p["branch"],
888
- "pr_url": p["pr_url"],
889
- "age": p.get("timestamp", ""),
890
- }
891
- for p in prs
892
- ],
893
- "sentinel_paused": Path("SENTINEL_PAUSE").exists(),
894
- })
895
-
896
- if name == "check_auth_status":
897
- import subprocess as _sp
898
- from .notify import get_circuit_status
899
- cfg = cfg_loader.sentinel
900
-
901
- # Auth configuration
902
- has_key = bool(cfg.anthropic_api_key)
903
- pro_for_tasks = cfg.claude_pro_for_tasks
904
- if pro_for_tasks and has_key:
905
- primary, fallback = "claude_pro_oauth", "api_key"
906
- elif pro_for_tasks:
907
- primary, fallback = "claude_pro_oauth", None
908
- else:
909
- primary, fallback = "api_key", "claude_pro_oauth" if not has_key else "claude_pro_oauth"
910
-
911
- # Claude CLI liveness check
912
- cli_ok, cli_version = False, ""
913
- try:
914
- r = _sp.run(
915
- [cfg.claude_code_bin, "--version"],
916
- capture_output=True, text=True, timeout=10,
917
- )
918
- if r.returncode == 0:
919
- cli_ok = True
920
- cli_version = r.stdout.strip() or r.stderr.strip()
921
- except Exception:
922
- pass
923
-
924
- # Circuit breaker snapshot — only open (unhealthy) circuits appear here
925
- circuits = get_circuit_status()
926
-
927
- # Fix engine stats (last 24 h)
928
- recent = store.get_recent_fixes(hours=24)
929
- counts = {"applied": 0, "failed": 0, "skipped": 0, "pending": 0}
930
- last_success = None
931
- for f in recent:
932
- s = f.get("status", "")
933
- if s in counts:
934
- counts[s] += 1
935
- if s == "applied" and not last_success:
936
- last_success = f.get("timestamp", "")
937
-
938
- overall = "healthy"
939
- if circuits:
940
- overall = "degraded rate/auth limit active on: " + ", ".join(circuits)
941
- elif not cli_ok:
942
- overall = "warning — claude CLI not reachable"
943
-
944
- return json.dumps({
945
- "overall": overall,
946
- "auth": {
947
- "api_key_configured": has_key,
948
- "claude_pro_for_tasks": pro_for_tasks,
949
- "primary_method": primary,
950
- "fallback_method": fallback,
951
- },
952
- "claude_cli": {"available": cli_ok, "version": cli_version},
953
- "rate_limit_circuits": circuits,
954
- "fix_engine_24h": {**counts, "last_successful_fix": last_success},
955
- })
956
-
957
- if name == "create_issue":
958
- description = inputs["description"]
959
- target_repo = inputs.get("target_repo", "")
960
- project_arg = inputs.get("project", "")
961
-
962
- if project_arg:
963
- project_dirs = _find_project_dirs(project_arg)
964
- if not project_dirs:
965
- all_names = [_read_project_name(d) for d in _find_project_dirs()]
966
- return json.dumps({
967
- "error": f"No project found matching '{project_arg}'",
968
- "available_projects": all_names,
969
- "action_needed": "Ask the user which project they meant.",
970
- })
971
- if len(project_dirs) > 1:
972
- matches = [_read_project_name(d) for d in project_dirs]
973
- return json.dumps({
974
- "error": f"Ambiguous project name '{project_arg}' — matches: {matches}",
975
- "action_needed": "Ask the user to clarify which project they mean.",
976
- })
977
- project_dir = project_dirs[0]
978
- else:
979
- project_dir = Path(".")
980
-
981
- support_url = inputs.get("support_url", "").strip()
982
- attachments_summary = inputs.get("attachments_summary", "").strip()
983
- findings = inputs.get("findings", "").strip()
984
-
985
- issues_dir = project_dir / "issues"
986
- issues_dir.mkdir(exist_ok=True)
987
- fname = f"slack-{uuid.uuid4().hex[:8]}.txt"
988
-
989
- submitter_name = store.get_user_name(user_id) if user_id else ""
990
- submitter_line = f"SUBMITTED_BY: {submitter_name} ({user_id})" if user_id else ""
991
- lines = []
992
- if submitter_line:
993
- lines.append(submitter_line)
994
- if target_repo:
995
- lines.append(f"TARGET_REPO: {target_repo}")
996
- if support_url:
997
- lines.append(f"SUPPORT_URL: {support_url}")
998
- lines.append(f"SUBMITTED_AT: {datetime.now(timezone.utc).isoformat()}")
999
- lines.append("")
1000
- lines.append(description)
1001
- if findings:
1002
- lines.append(f"\nEVIDENCE (gathered by Sentinel Boss):\n{findings}")
1003
- if attachments_summary:
1004
- lines.append(f"\nATTACHMENTS:\n{attachments_summary}")
1005
- content = "\n".join(lines)
1006
- (issues_dir / fname).write_text(content, encoding="utf-8")
1007
-
1008
- # Touch SENTINEL_POLL_NOW so the target instance picks it up immediately
1009
- (project_dir / "SENTINEL_POLL_NOW").touch()
1010
-
1011
- project_label = _read_project_name(project_dir.resolve()) if project_arg else "this project"
1012
- logger.info("Boss created issue for %s: %s", project_label, fname)
1013
- if user_id:
1014
- try:
1015
- store.record_submitted_issue(
1016
- user_id=user_id,
1017
- user_name=submitter_name,
1018
- project=project_label,
1019
- fname=fname,
1020
- description=description,
1021
- )
1022
- except Exception as _rec_err:
1023
- logger.debug("Boss: could not record submitted issue: %s", _rec_err)
1024
- return json.dumps({
1025
- "status": "queued",
1026
- "project": project_label,
1027
- "file": fname,
1028
- "note": f"Delivered to '{project_label}'. Sentinel will process it on the next poll cycle.",
1029
- })
1030
-
1031
- if name == "get_fix_details":
1032
- fp = inputs["fingerprint"]
1033
- fix = store.get_confirmed_fix(fp) or store.get_marker_seen_fix(fp)
1034
- if not fix:
1035
- # Fallback: search recent fixes by prefix
1036
- recent = store.get_recent_fixes(hours=72)
1037
- fix = next((f for f in recent if f.get("fingerprint", "").startswith(fp)), None)
1038
- return json.dumps(fix or {"error": "not found"})
1039
-
1040
- if name == "list_pending_prs":
1041
- prs = store.get_open_prs()
1042
- return json.dumps({
1043
- "count": len(prs),
1044
- "open_prs": [
1045
- {
1046
- "repo": p["repo_name"],
1047
- "branch": p["branch"],
1048
- "pr_url": p["pr_url"],
1049
- "timestamp": p.get("timestamp", ""),
1050
- }
1051
- for p in prs
1052
- ],
1053
- })
1054
-
1055
- if name == "pause_sentinel":
1056
- Path("SENTINEL_PAUSE").touch()
1057
- logger.info("Boss: SENTINEL_PAUSE created")
1058
- return json.dumps({"status": "paused"})
1059
-
1060
- if name == "resume_sentinel":
1061
- p = Path("SENTINEL_PAUSE")
1062
- if p.exists():
1063
- p.unlink()
1064
- logger.info("Boss: SENTINEL_PAUSE removed")
1065
- return json.dumps({"status": "resumed"})
1066
-
1067
- if name == "list_projects":
1068
- projects = []
1069
- for d in _find_project_dirs():
1070
- repo_cfg_dir = d / "config" / "repo-configs"
1071
- repos_in_project = []
1072
- if repo_cfg_dir.exists():
1073
- for p in sorted(repo_cfg_dir.glob("*.properties")):
1074
- if p.name.startswith("_"):
1075
- continue
1076
- repo_url = ""
1077
- for line in p.read_text(encoding="utf-8", errors="ignore").splitlines():
1078
- if line.startswith("REPO_URL"):
1079
- repo_url = line.split("=", 1)[-1].strip()
1080
- break
1081
- repos_in_project.append({"repo": p.stem, "url": repo_url})
1082
- projects.append({
1083
- "project": _read_project_name(d),
1084
- "dir": d.name,
1085
- "running": (d / "sentinel.pid").exists(),
1086
- "this": d.resolve() == Path(".").resolve(),
1087
- "repos": repos_in_project,
1088
- })
1089
- return json.dumps({"projects": projects})
1090
-
1091
- if name == "search_logs":
1092
- query = inputs.get("query", "")
1093
- source = inputs.get("source", "").lower()
1094
- max_matches = int(inputs.get("max_matches", 30))
1095
- tail_override = inputs.get("tail")
1096
-
1097
- # ── Live fetch path: SSH to servers and grep in real time ──────────────
1098
- script = Path(__file__).resolve().parent.parent / "scripts" / "fetch_log.sh"
1099
- log_cfg_dir = Path("config") / "log-configs"
1100
- if script.exists() and log_cfg_dir.exists():
1101
- props_files = _filter_log_sources(sorted(log_cfg_dir.glob("*.properties")), source)
1102
- if props_files:
1103
- live_results = []
1104
- for props in props_files:
1105
- env = os.environ.copy()
1106
- env["GREP_FILTER"] = query
1107
- if tail_override:
1108
- env["TAIL"] = str(tail_override)
1109
- try:
1110
- r = subprocess.run(
1111
- ["bash", str(script), str(props)],
1112
- capture_output=True, text=True, timeout=60, env=env,
1113
- )
1114
- try:
1115
- _qpat = re.compile(query, re.IGNORECASE)
1116
- except re.error:
1117
- _qpat = re.compile(re.escape(query), re.IGNORECASE)
1118
- lines = (r.stdout or "").strip().splitlines()
1119
- matches = [ln[:300] for ln in lines if _qpat.search(ln)][:max_matches]
1120
- if matches:
1121
- live_results.append({"source": props.stem, "matches": matches})
1122
- logger.info("Boss search_logs live %s rc=%d found=%d", props.stem, r.returncode, len(matches))
1123
- except subprocess.TimeoutExpired:
1124
- live_results.append({"source": props.stem, "error": "timed out"})
1125
- except Exception as e:
1126
- live_results.append({"source": props.stem, "error": str(e)})
1127
- total = sum(len(r.get("matches", [])) for r in live_results)
1128
- return json.dumps({
1129
- "query": query,
1130
- "mode": "live",
1131
- "total_matches": total,
1132
- "results": live_results,
1133
- "note": (
1134
- "Results already include a per-source breakdown. "
1135
- "Do NOT call search_logs again with a source filter to 'refine' — "
1136
- "use these results directly."
1137
- ) if total > 0 else None,
1138
- })
1139
-
1140
- # ── Fallback: search locally-cached log files ──────────────────────────
1141
- # Reaching here means: live script unavailable OR source filter matched no config files.
1142
- # A result with files_searched=0 means the source name wasn't recognised — NOT that
1143
- # there are no log entries. Do not interpret this as "no results found".
1144
- fetched_dir = Path("workspace/fetched")
1145
- if not fetched_dir.exists():
1146
- return json.dumps({
1147
- "error": "No fetched logs found and fetch_log.sh unavailable",
1148
- "note": "This is a config/setup problem, not a 'no results' answer.",
1149
- })
1150
- try:
1151
- pattern = re.compile(query, re.IGNORECASE)
1152
- except re.error as e:
1153
- return json.dumps({"error": f"Invalid regex: {e}"})
1154
- results = []
1155
- for log_file in sorted(fetched_dir.glob("*.log")):
1156
- if source and source not in log_file.name.lower():
1157
- continue
1158
- try:
1159
- lines = log_file.read_text(encoding="utf-8", errors="ignore").splitlines()
1160
- matches = [
1161
- {"line": i + 1, "text": line[:300]}
1162
- for i, line in enumerate(lines)
1163
- if pattern.search(line)
1164
- ][:max_matches]
1165
- if matches:
1166
- results.append({"file": log_file.name, "matches": matches})
1167
- except Exception:
1168
- pass
1169
- total = sum(len(r["matches"]) for r in results)
1170
- files_searched = len(list(fetched_dir.glob("*.log")))
1171
- result = {
1172
- "query": query,
1173
- "mode": "cached",
1174
- "total_matches": total,
1175
- "files_searched": files_searched,
1176
- "results": results,
1177
- }
1178
- if files_searched == 0:
1179
- result["warning"] = (
1180
- "Source name not recognised in cached files — this is a lookup failure, not 'no results'. "
1181
- "If you already have results from a broader search_logs call, use those. Stop retrying."
1182
- )
1183
- return json.dumps(result)
1184
-
1185
- if name == "trigger_poll":
1186
- Path("SENTINEL_POLL_NOW").touch()
1187
- logger.info("Boss: immediate poll requested")
1188
- return json.dumps({"status": "triggered", "note": "Sentinel will run a poll cycle within seconds"})
1189
-
1190
- if name == "get_repo_status":
1191
- hours = int(inputs.get("hours", 24))
1192
- fixes = store.get_recent_fixes(hours)
1193
- errors = store.get_recent_errors(hours)
1194
- by_repo: dict = {}
1195
- for fix in fixes:
1196
- repo = fix.get("repo_name", "unknown")
1197
- s = by_repo.setdefault(repo, {"applied": 0, "pending": 0, "failed": 0, "skipped": 0})
1198
- key = fix.get("status", "failed")
1199
- s[key] = s.get(key, 0) + 1
1200
- return json.dumps({"window_hours": hours, "total_errors": len(errors), "by_repo": by_repo})
1201
-
1202
- if name == "list_recent_commits":
1203
- limit = int(inputs.get("limit", 5))
1204
- results = []
1205
- for repo_name, repo in cfg_loader.repos.items():
1206
- local = Path(repo.local_path)
1207
- if not local.exists():
1208
- continue
1209
- try:
1210
- r = subprocess.run(
1211
- ["git", "log", "--oneline", "--grep=sentinel", "-n", str(limit)],
1212
- cwd=str(local), capture_output=True, text=True, timeout=10,
1213
- )
1214
- commits = r.stdout.strip().splitlines()
1215
- if commits:
1216
- results.append({"repo": repo_name, "commits": commits})
1217
- except Exception:
1218
- pass
1219
- return json.dumps({"sentinel_commits": results})
1220
-
1221
- if name == "pull_repo":
1222
- target = inputs.get("repo", "").lower()
1223
- results = []
1224
- for repo_name, repo in cfg_loader.repos.items():
1225
- if target and target not in repo_name.lower():
1226
- continue
1227
- local = Path(repo.local_path)
1228
- if not local.exists():
1229
- results.append({"repo": repo_name, "status": "error", "detail": "local path not found"})
1230
- continue
1231
- try:
1232
- r = subprocess.run(
1233
- ["git", "pull", "--rebase", "origin", repo.branch],
1234
- cwd=str(local), capture_output=True, text=True, timeout=60,
1235
- )
1236
- last_line = r.stdout.strip().splitlines()[-1] if r.stdout.strip() else "already up to date"
1237
- if r.returncode == 0:
1238
- results.append({"repo": repo_name, "status": "ok", "detail": last_line})
1239
- else:
1240
- results.append({"repo": repo_name, "status": "error", "detail": r.stderr.strip()})
1241
- except Exception as e:
1242
- results.append({"repo": repo_name, "status": "error", "detail": str(e)})
1243
- return json.dumps({"results": results})
1244
-
1245
- if name == "pull_config":
1246
- target = inputs.get("project", "")
1247
- dirs = _find_project_dirs(target)
1248
- if not dirs:
1249
- return json.dumps({"error": f"No project found matching '{target}'"})
1250
- results = []
1251
- for d in dirs:
1252
- res = _git_pull(d)
1253
- results.append({"project": _read_project_name(d), "dir": d.name, **res})
1254
- logger.info("Boss: pull_config %s → %s", d.name, res["status"])
1255
- return json.dumps({"results": results})
1256
-
1257
- if name == "fetch_logs":
1258
- source_filter = inputs.get("source", "").lower()
1259
- debug = bool(inputs.get("debug", False))
1260
- tail_override = inputs.get("tail")
1261
- grep_override = inputs.get("grep_filter", "")
1262
-
1263
- # Find fetch_log.sh relative to this file
1264
- script = Path(__file__).resolve().parent.parent / "scripts" / "fetch_log.sh"
1265
- if not script.exists():
1266
- return json.dumps({"error": f"fetch_log.sh not found at {script}"})
1267
-
1268
- log_cfg_dir = Path("config") / "log-configs"
1269
- if not log_cfg_dir.exists():
1270
- return json.dumps({"error": "config/log-configs/ not found"})
1271
-
1272
- props_files = _filter_log_sources(sorted(log_cfg_dir.glob("*.properties")), source_filter)
1273
- if not props_files:
1274
- return json.dumps({"error": f"No log-config found matching '{source_filter}'"})
1275
-
1276
- results = []
1277
- for props in props_files:
1278
- env = os.environ.copy()
1279
- if tail_override:
1280
- env["TAIL"] = str(tail_override)
1281
- if grep_override:
1282
- env["GREP_FILTER"] = grep_override
1283
-
1284
- cmd = ["bash", str(script)]
1285
- if debug:
1286
- cmd.append("--debug")
1287
- cmd.append(str(props))
1288
-
1289
- try:
1290
- r = subprocess.run(
1291
- cmd, capture_output=True, text=True, timeout=120, env=env,
1292
- )
1293
- output = (r.stdout or "").strip()
1294
- stderr = (r.stderr or "").strip()
1295
- results.append({
1296
- "source": props.stem,
1297
- "returncode": r.returncode,
1298
- "output": output[-2000:] if output else "",
1299
- "stderr": stderr[-1000:] if stderr else "",
1300
- })
1301
- logger.info("Boss fetch_logs %s rc=%d", props.stem, r.returncode)
1302
- except subprocess.TimeoutExpired:
1303
- results.append({"source": props.stem, "error": "timed out after 120s"})
1304
- except Exception as e:
1305
- results.append({"source": props.stem, "error": str(e)})
1306
-
1307
- return json.dumps({"fetched": len(results), "results": results})
1308
-
1309
- if name == "watch_bot":
1310
- if not is_admin:
1311
- return json.dumps({"error": "Admin access required to register bots for monitoring."})
1312
- user_ids = inputs.get("user_ids", [])
1313
- project_arg = inputs.get("project", "").strip()
1314
- if not user_ids:
1315
- return json.dumps({"error": "No user_ids provided"})
1316
-
1317
- # Resolve + validate project — required for bot issue routing
1318
- resolved_project = ""
1319
- if project_arg:
1320
- project_dirs = _find_project_dirs(project_arg)
1321
- if not project_dirs:
1322
- all_names = [_read_project_name(d) for d in _find_project_dirs()]
1323
- return json.dumps({
1324
- "error": f"No project found matching '{project_arg}'",
1325
- "available_projects": all_names,
1326
- "action_needed": "Ask the user which project these bot alerts belong to.",
1327
- })
1328
- if len(project_dirs) > 1:
1329
- matches = [_read_project_name(d) for d in project_dirs]
1330
- return json.dumps({
1331
- "error": f"Ambiguous project name '{project_arg}' — matches: {matches}",
1332
- "action_needed": "Ask the user to clarify which project.",
1333
- })
1334
- resolved_project = _read_project_name(project_dirs[0])
1335
- else:
1336
- all_projects = _find_project_dirs()
1337
- if len(all_projects) == 1:
1338
- # Single project in workspace — auto-assign
1339
- resolved_project = _read_project_name(all_projects[0])
1340
- elif all_projects:
1341
- all_names = [_read_project_name(d) for d in all_projects]
1342
- return json.dumps({
1343
- "error": "Cannot determine which project these bot alerts belong to.",
1344
- "available_projects": all_names,
1345
- "action_needed": "Ask the user to specify the project, then retry with project filled in.",
1346
- })
1347
-
1348
- results = []
1349
- for uid in user_ids:
1350
- if not slack_client:
1351
- results.append({"user_id": uid, "status": "error", "reason": "no Slack client available"})
1352
- continue
1353
- try:
1354
- info = await slack_client.users_info(user=uid)
1355
- user = info.get("user", {})
1356
- if not user.get("is_bot", False):
1357
- results.append({"user_id": uid, "status": "skipped", "reason": "not a bot — only bots can be watched passively"})
1358
- continue
1359
- bot_name = user.get("real_name") or user.get("name") or uid
1360
- store.add_watched_bot(uid, bot_name, added_by="boss", project_name=resolved_project)
1361
- logger.info("Boss: now watching bot %s (%s) → project '%s'", bot_name, uid, resolved_project or "unset")
1362
- results.append({"user_id": uid, "bot_name": bot_name, "project": resolved_project, "status": "watching"})
1363
- except Exception as e:
1364
- results.append({"user_id": uid, "status": "error", "reason": str(e)})
1365
- return json.dumps({"results": results})
1366
-
1367
- if name == "unwatch_bot":
1368
- if not is_admin:
1369
- return json.dumps({"error": "Admin access required to remove bots from monitoring."})
1370
- user_ids = inputs.get("user_ids", [])
1371
- if not user_ids:
1372
- return json.dumps({"error": "No user_ids provided"})
1373
- results = []
1374
- for uid in user_ids:
1375
- removed = store.remove_watched_bot(uid)
1376
- logger.info("Boss: unwatch bot %s → %s", uid, "removed" if removed else "not found")
1377
- results.append({"user_id": uid, "status": "removed" if removed else "not found"})
1378
- return json.dumps({"results": results})
1379
-
1380
- if name == "list_watched_bots":
1381
- bots = store.get_watched_bots()
1382
- return json.dumps({
1383
- "count": len(bots),
1384
- "bots": [
1385
- {
1386
- "bot_id": b["bot_id"],
1387
- "bot_name": b["bot_name"],
1388
- "project": b.get("project_name") or "",
1389
- "added_by": b["added_by"],
1390
- "added_at": b["added_at"],
1391
- }
1392
- for b in bots
1393
- ],
1394
- })
1395
-
1396
- if name == "upgrade_sentinel":
1397
- if not is_admin:
1398
- return json.dumps({"error": "Admin access required to upgrade Sentinel."})
1399
- import threading
1400
-
1401
- # Sentinel is installed via npm use `sentinel upgrade` which handles
1402
- # npm install + Python bundle copy + restart via stopAll/startAll.
1403
- # Run it in the background after a short delay so the Slack reply is
1404
- # sent before the process is replaced.
1405
- try:
1406
- r = subprocess.run(
1407
- ["sentinel", "--version"],
1408
- capture_output=True, text=True, timeout=10,
1409
- )
1410
- sentinel_bin_ok = r.returncode == 0
1411
- except Exception:
1412
- sentinel_bin_ok = False
1413
-
1414
- if not sentinel_bin_ok:
1415
- return json.dumps({
1416
- "status": "error",
1417
- "note": "`sentinel` CLI not found. Run: npm install -g @misterhuydo/sentinel",
1418
- })
1419
-
1420
- def _do_upgrade():
1421
- import time
1422
- time.sleep(10) # give Slack time to post the reply
1423
- subprocess.Popen(["sentinel", "upgrade"], close_fds=True)
1424
-
1425
- threading.Thread(target=_do_upgrade, daemon=True).start()
1426
- logger.info("Boss: upgrade_sentinel scheduled via `sentinel upgrade`")
1427
- return json.dumps({
1428
- "status": "ok",
1429
- "note": "Upgrade started — pulling latest version via npm and restarting. Give me ~30 seconds then I'll be back.",
1430
- })
1431
-
1432
- if name == "ask_codebase":
1433
- target = inputs.get("repo", "").lower()
1434
- question = inputs.get("question", "")
1435
-
1436
- # 1. Find repos whose name contains the target (e.g. "STS", "elprint-sales")
1437
- matched = [(rn, r) for rn, r in cfg_loader.repos.items() if target in rn.lower()]
1438
-
1439
- # 2. No repo match — check if target is a project name → use ALL repos in cfg_loader
1440
- # (each Sentinel instance is scoped to one project, so all repos belong to it)
1441
- if not matched:
1442
- current_project = _read_project_name(Path("."))
1443
- if target in current_project.lower() or current_project.lower() in target:
1444
- matched = list(cfg_loader.repos.items())
1445
-
1446
- if not matched:
1447
- return json.dumps({
1448
- "error": f"No repo or project found matching '{target}'",
1449
- "available_repos": list(cfg_loader.repos.keys()),
1450
- })
1451
-
1452
- cfg = cfg_loader.sentinel
1453
- env = os.environ.copy()
1454
- # Only inject API key when Claude Pro is NOT preferred for heavy tasks
1455
- if cfg.anthropic_api_key and not cfg.claude_pro_for_tasks:
1456
- env["ANTHROPIC_API_KEY"] = cfg.anthropic_api_key
1457
-
1458
- def _ask_one(repo_name, repo_cfg) -> dict:
1459
- local_path = Path(repo_cfg.local_path)
1460
- if not local_path.exists():
1461
- return {"repo": repo_name, "error": f"not cloned yet at {local_path}"}
1462
- prompt = (
1463
- f"You are a code analyst. Answer the following question about the codebase at: {local_path}\n\n"
1464
- f"Question: {question}\n\n"
1465
- f"Use whatever tools you need to answer accurately. Be concise and direct. Plain text only."
1466
- )
1467
- try:
1468
- r = subprocess.run(
1469
- ([cfg.claude_code_bin, "--dangerously-skip-permissions", "--print", prompt]
1470
- if os.getuid() != 0 else
1471
- [cfg.claude_code_bin, "--print", prompt]),
1472
- capture_output=True, text=True, timeout=180, env=env,
1473
- cwd=str(local_path),
1474
- )
1475
- output = (r.stdout or "").strip()
1476
- logger.info("Boss ask_codebase %s rc=%d len=%d", repo_name, r.returncode, len(output))
1477
- if r.returncode != 0 and not output:
1478
- raw_err = (r.stderr or "")
1479
- alert_if_rate_limited(
1480
- cfg.slack_bot_token, cfg.slack_channel,
1481
- f"ask_codebase/{repo_name}", raw_err,
1482
- )
1483
- return {"repo": repo_name, "error": f"claude --print failed (rc={r.returncode}): {raw_err[:200]}"}
1484
- return {"repo": repo_name, "answer": output[:3000]}
1485
- except subprocess.TimeoutExpired:
1486
- return {"repo": repo_name, "error": "timed out after 180s"}
1487
- except Exception as e:
1488
- return {"repo": repo_name, "error": str(e)}
1489
-
1490
- if len(matched) == 1:
1491
- result = _ask_one(*matched[0])
1492
- # Unwrap single-repo result for cleaner response
1493
- return json.dumps(result)
1494
-
1495
- # Multiple repos — query each and combine
1496
- results = [_ask_one(rn, r) for rn, r in matched]
1497
- return json.dumps({"project": target, "repos_queried": len(results), "results": results})
1498
-
1499
- if name == "restart_project":
1500
- if not is_admin:
1501
- return json.dumps({"error": "Admin access required to restart a project."})
1502
- project_arg = inputs.get("project", "").lower()
1503
- dirs = _find_project_dirs(project_arg)
1504
- if not dirs:
1505
- return json.dumps({"error": f"No project found matching '{project_arg}'"})
1506
- results = []
1507
- for d in dirs:
1508
- stop_sh = d / "stop.sh"
1509
- start_sh = d / "start.sh"
1510
- if not stop_sh.exists() or not start_sh.exists():
1511
- results.append({"project": d.name, "status": "error", "detail": "stop.sh or start.sh not found"})
1512
- continue
1513
- try:
1514
- subprocess.run(["bash", str(stop_sh)], cwd=str(d), timeout=30)
1515
- subprocess.run(["bash", str(start_sh)], cwd=str(d), timeout=30)
1516
- results.append({"project": d.name, "status": "restarted"})
1517
- logger.info("Boss: restarted project %s", d.name)
1518
- except Exception as e:
1519
- results.append({"project": d.name, "status": "error", "detail": str(e)})
1520
- return json.dumps({"results": results})
1521
-
1522
- if name == "tail_log":
1523
- source = inputs.get("source", "").lower()
1524
- lines = int(inputs.get("lines", 100))
1525
- script = Path(__file__).resolve().parent.parent / "scripts" / "fetch_log.sh"
1526
- log_cfg_dir = Path("config") / "log-configs"
1527
-
1528
- if not script.exists():
1529
- return json.dumps({"error": "fetch_log.sh not found"})
1530
- if not log_cfg_dir.exists():
1531
- return json.dumps({"error": "config/log-configs/ not found"})
1532
-
1533
- props_files = sorted(log_cfg_dir.glob("*.properties"))
1534
- if source:
1535
- props_files = [p for p in props_files if source in p.stem.lower()]
1536
- if not props_files:
1537
- return json.dumps({"error": f"No log-config found matching '{source}'"})
1538
-
1539
- results = []
1540
- for props in props_files:
1541
- env = os.environ.copy()
1542
- env["TAIL"] = str(lines)
1543
- env["GREP_FILTER"] = "" # no filter — show everything
1544
- try:
1545
- r = subprocess.run(
1546
- ["bash", str(script), str(props)],
1547
- capture_output=True, text=True, timeout=60, env=env,
1548
- )
1549
- tail_lines = (r.stdout or "").strip().splitlines()[-lines:]
1550
- results.append({
1551
- "source": props.stem,
1552
- "lines": len(tail_lines),
1553
- "content": "\n".join(tail_lines),
1554
- })
1555
- logger.info("Boss tail_log %s rc=%d lines=%d", props.stem, r.returncode, len(tail_lines))
1556
- except subprocess.TimeoutExpired:
1557
- results.append({"source": props.stem, "error": "timed out"})
1558
- except Exception as e:
1559
- results.append({"source": props.stem, "error": str(e)})
1560
- return json.dumps({"results": results})
1561
-
1562
- if name == "post_file":
1563
- if not slack_client or not channel:
1564
- return json.dumps({"error": "No Slack channel context — cannot upload file"})
1565
- content = inputs.get("content", "")
1566
- filename = inputs.get("filename", "sentinel-output.txt")
1567
- title = inputs.get("title", filename)
1568
- if not content:
1569
- return json.dumps({"error": "No content provided"})
1570
- try:
1571
- await slack_client.files_upload_v2(
1572
- channel=channel,
1573
- content=content,
1574
- filename=filename,
1575
- title=title,
1576
- )
1577
- logger.info("Boss post_file: uploaded %s (%d bytes) to %s", filename, len(content), channel)
1578
- return json.dumps({"ok": True, "filename": filename, "bytes": len(content)})
1579
- except Exception as e:
1580
- logger.warning("Boss post_file failed: %s", e)
1581
- return json.dumps({"error": str(e)})
1582
-
1583
- if name == "my_stats":
1584
- hours = int(inputs.get("hours", 168))
1585
- errors = store.get_recent_errors(hours)
1586
- fixes = store.get_recent_fixes(hours)
1587
- prs = store.get_open_prs()
1588
- pending_conf = store.get_fixes_pending_confirmation()
1589
- # Conversation stats
1590
- history = store.load_conversation(user_id) if user_id else []
1591
- hist_len = len(history)
1592
- # Load conversation updated_at from DB
1593
- conv_updated = ""
1594
- try:
1595
- import sqlite3 as _sqlite3
1596
- with _sqlite3.connect(store.db_path) as _db:
1597
- row = _db.execute(
1598
- "SELECT updated_at FROM conversations WHERE user_id=?", (user_id,)
1599
- ).fetchone()
1600
- if row:
1601
- conv_updated = row[0]
1602
- except Exception:
1603
- pass
1604
- # Tally fix statuses
1605
- by_status: dict = {}
1606
- for fix in fixes:
1607
- s = fix.get("status", "unknown")
1608
- by_status[s] = by_status.get(s, 0) + 1
1609
- # Fixes confirmed via sentinel marker in prod
1610
- confirmed = [f for f in fixes if f.get("fix_outcome") == "confirmed"]
1611
- regressed = [f for f in fixes if f.get("fix_outcome") == "regressed"]
1612
- submitted = store.get_submitted_issues(user_id, hours=hours) if user_id else []
1613
- submitted_recent = store.get_submitted_issues(user_id, hours=hours) if user_id else []
1614
- return json.dumps({
1615
- "conversation": {
1616
- "messages_in_history": hist_len,
1617
- "turns": hist_len // 2,
1618
- "last_active": conv_updated or "no history",
1619
- },
1620
- "issues_you_submitted": {
1621
- "total_in_window": len(submitted_recent),
1622
- "all_time": len(store.get_submitted_issues(user_id) if user_id else []),
1623
- "recent": [
1624
- {"project": i["project"], "description": i["description"][:80],
1625
- "submitted_at": i["submitted_at"]}
1626
- for i in submitted_recent[:5]
1627
- ],
1628
- },
1629
- "window_hours": hours,
1630
- "errors_detected": len(errors),
1631
- "fixes": {
1632
- "applied": by_status.get("applied", 0),
1633
- "pending_pr": len(prs),
1634
- "failed": by_status.get("failed", 0),
1635
- "skipped": by_status.get("skipped", 0),
1636
- "error": by_status.get("error", 0),
1637
- },
1638
- "confirmed_in_prod": len(confirmed),
1639
- "regressed_after_fix": len(regressed),
1640
- "awaiting_confirmation": len(pending_conf),
1641
- "open_prs": [
1642
- {"repo": p["repo_name"], "pr_url": p["pr_url"], "timestamp": p["timestamp"]}
1643
- for p in prs
1644
- ],
1645
- "top_errors": [
1646
- {"message": e["message"][:100], "count": e["count"], "source": e["source"]}
1647
- for e in errors[:5]
1648
- ],
1649
- })
1650
- if name == "clear_my_history":
1651
- if user_id:
1652
- store.save_conversation(user_id, [])
1653
- logger.info("Boss: cleared conversation history for user %s", user_id)
1654
- return json.dumps({
1655
- "status": "cleared",
1656
- "note": "Your conversation history has been wiped. Next session starts fresh. [DONE]",
1657
- })
1658
- return json.dumps({"error": "cannot determine user — not clearing"})
1659
-
1660
- # ── Admin-only tools ──────────────────────────────────────────────────────
1661
- _ADMIN_TOOLS = {"list_all_users", "clear_user_history", "reset_fingerprint", "list_all_errors", "export_db"}
1662
- if name in _ADMIN_TOOLS:
1663
- if not is_admin:
1664
- return json.dumps({"error": "Admin access required. You are not in SLACK_ADMIN_USERS."})
1665
-
1666
- if name == "list_all_users":
1667
- stats = store.get_all_user_stats()
1668
- return json.dumps({"users": stats, "total": len(stats)})
1669
-
1670
- if name == "clear_user_history":
1671
- target = inputs.get("target_user_id", "").strip()
1672
- if not target:
1673
- return json.dumps({"error": "target_user_id is required"})
1674
- store.save_conversation(target, [])
1675
- display = store.get_user_name(target)
1676
- logger.info("Boss admin: cleared history for user %s (%s) by admin %s", target, display, user_id)
1677
- return json.dumps({"status": "cleared", "target_user_id": target, "display_name": display})
1678
-
1679
- if name == "reset_fingerprint":
1680
- fp = inputs.get("fingerprint", "").strip()
1681
- if not fp:
1682
- return json.dumps({"error": "fingerprint is required"})
1683
- found = store.reset_fingerprint(fp)
1684
- logger.info("Boss admin: reset fingerprint %s by admin %s (found=%s)", fp, user_id, found)
1685
- return json.dumps({"status": "reset" if found else "not_found", "fingerprint": fp,
1686
- "note": "Sentinel will retry this error on the next poll." if found else "No fix record found for this fingerprint."})
1687
-
1688
- if name == "list_all_errors":
1689
- hours = int(inputs.get("hours", 0))
1690
- errors = store.get_all_errors(hours)
1691
- return json.dumps({"errors": errors[:100], "total": len(errors),
1692
- "window_hours": hours or "all time"})
1693
-
1694
- if name == "export_db":
1695
- if not slack_client or not channel:
1696
- return json.dumps({"error": "No Slack channel context — cannot upload file"})
1697
- try:
1698
- import sqlite3 as _sq
1699
- import io as _io
1700
- lines = []
1701
- with _sq.connect(store.db_path) as _db:
1702
- for tbl in ["errors", "fixes", "reports", "slack_users", "conversations", "submitted_issues"]:
1703
- try:
1704
- rows = _db.execute(f"SELECT * FROM {tbl}").fetchall() # noqa: S608
1705
- cols = [d[0] for d in _db.execute(f"SELECT * FROM {tbl} LIMIT 0").description] # noqa: S608
1706
- lines.append(f"=== {tbl} ({len(rows)} rows) ===")
1707
- lines.append("\t".join(cols))
1708
- for row in rows:
1709
- lines.append("\t".join(str(v) if v is not None else "" for v in row))
1710
- lines.append("")
1711
- except Exception:
1712
- lines.append(f"=== {tbl} (unavailable) ===\n")
1713
- content = "\n".join(lines)
1714
- await slack_client.files_upload_v2(
1715
- channel=channel,
1716
- content=content,
1717
- filename="sentinel-db-export.tsv",
1718
- title="Sentinel DB Export",
1719
- )
1720
- logger.info("Boss admin: exported DB (%d bytes) by admin %s", len(content), user_id)
1721
- return json.dumps({"ok": True, "bytes": len(content)})
1722
- except Exception as e:
1723
- return json.dumps({"error": str(e)})
1724
-
1725
- return json.dumps({"error": f"unknown tool: {name}"})
1726
-
1727
-
1728
- # ── CLI fallback (OAuth / no API key) ────────────────────────────────────────
1729
-
1730
- def _attachments_to_text(attachments: list[dict]) -> str:
1731
- """Produce a plain-text summary of attachments to append to CLI prompts."""
1732
- if not attachments:
1733
- return ""
1734
- parts = []
1735
- for att in attachments:
1736
- if att["type"] == "text":
1737
- parts.append(
1738
- f"[Attached file: {att['name']}]\n{att['content']}"
1739
- )
1740
- elif att["type"] == "image":
1741
- parts.append(
1742
- f"[Attached image: {att['name']}] (saved at {att['path']})"
1743
- )
1744
- else:
1745
- parts.append(
1746
- f"[Attached file: {att['name']}] (saved at {att['path']} — read it if relevant)"
1747
- )
1748
- return "\n\nATTACHMENTS:\n" + "\n---\n".join(parts)
1749
-
1750
-
1751
- def _attachments_to_api_blocks(attachments: list[dict]) -> list[dict]:
1752
- """Convert attachments into Anthropic API message content blocks."""
1753
- blocks: list[dict] = []
1754
- for att in attachments:
1755
- if att["type"] == "image":
1756
- blocks.append({
1757
- "type": "image",
1758
- "source": {
1759
- "type": "base64",
1760
- "media_type": att.get("mime", "image/png"),
1761
- "data": att["content"],
1762
- },
1763
- })
1764
- elif att["type"] == "text":
1765
- blocks.append({
1766
- "type": "text",
1767
- "text": f"[Attached file: {att['name']}]\n{att['content']}",
1768
- })
1769
- else:
1770
- blocks.append({
1771
- "type": "text",
1772
- "text": f"[Attached file: {att['name']}] saved at {att['path']}",
1773
- })
1774
- return blocks
1775
-
1776
-
1777
- _ACTION_RE = re.compile(r"^ACTION:\s*(\{.*\})", re.MULTILINE)
1778
-
1779
-
1780
- async def _handle_with_cli(
1781
- message: str,
1782
- history: list,
1783
- cfg_loader,
1784
- store,
1785
- slack_client=None,
1786
- user_name: str = "",
1787
- user_id: str = "",
1788
- attachments: list | None = None,
1789
- is_admin: bool = False,
1790
- ) -> tuple[str, bool]:
1791
- """Fallback: use `claude --print` for users without an Anthropic API key."""
1792
- status_json = await _run_tool("get_status", {"hours": 24}, cfg_loader, store)
1793
- prs_json = await _run_tool("list_pending_prs", {}, cfg_loader, store)
1794
-
1795
- # Pre-fetch log search if the message is a search request.
1796
- # Use quoted strings as the query, or fall back to the full message.
1797
- # Never hardcode field names — the query is whatever the user said.
1798
- search_json = ""
1799
- _search_kws = ("search", "find", "look for", "show me log", "grep", "entries for")
1800
- if any(kw in message.lower() for kw in _search_kws):
1801
- quoted = re.findall(r'"([^"]+)"', message)
1802
- query = quoted[0] if quoted else message
1803
- search_json = await _run_tool("search_logs", {"query": query}, cfg_loader, store)
1804
-
1805
- paused = Path("SENTINEL_PAUSE").exists()
1806
- repos = list(cfg_loader.repos.keys())
1807
- log_sources = list(cfg_loader.log_sources.keys())
1808
- ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
1809
-
1810
- history_text = ""
1811
- for msg in history[-8:]:
1812
- role = msg["role"].upper()
1813
- content = msg["content"]
1814
- if isinstance(content, list):
1815
- content = " ".join(
1816
- (b.get("text", "") if isinstance(b, dict) else getattr(b, "text", ""))
1817
- for b in content
1818
- if (isinstance(b, dict) and b.get("type") == "text")
1819
- or (hasattr(b, "type") and b.type == "text")
1820
- )
1821
- history_text += f"\n{role}: {content}"
1822
-
1823
- slack_mention = f"<@{user_id}>" if user_id else (user_name or "")
1824
- known_users = store.get_all_users()
1825
- users_hint = ", ".join(f"<@{uid}> = {name}" for uid, name in known_users.items())
1826
- prompt = (
1827
- _SYSTEM
1828
- + (f"\nYou are speaking with: {user_name} (Slack mention: {slack_mention})" if user_name else "")
1829
- + "\nAlways start your reply by addressing the user directly using their Slack mention, e.g. \"<@U123> here is what I found...\"."
1830
- + " Never use their plain name — always use the <@USER_ID> format so Slack highlights it."
1831
- + (f"\nKnown Slack users: {users_hint}" if users_hint else "")
1832
- + f"\n\nCurrent time: {ts}"
1833
- + f"\nSentinel status: {'⏸ PAUSED' if paused else '▶ RUNNING'}"
1834
- + f"\nManaged repos: {', '.join(repos) if repos else '(none configured)'}"
1835
- + (f"\nLog sources: {', '.join(log_sources)}" if log_sources else "")
1836
- + f"\nAdmin access for this user: {'YES — admin tools are available' if is_admin else 'NO — admin tools will be refused'}"
1837
- + "\nNOTE: Running in CLI fallback mode — admin tools and some features are unavailable. Ask user to configure ANTHROPIC_API_KEY for full features."
1838
- + f"\n\nCurrent status (last 24 h):\n{status_json}"
1839
- + f"\n\nOpen PRs:\n{prs_json}"
1840
- + (f"\n\nLog search results:\n{search_json}" if search_json else "")
1841
- + (f"\n\nConversation so far:{history_text}" if history_text else "")
1842
- + _attachments_to_text(attachments or [])
1843
- + f"\n\nUSER: {message}"
1844
- + "\n\nIf you need to take an action, include a line like:\n"
1845
- + " ACTION: {\"action\": \"pause_sentinel\"}\n"
1846
- + " ACTION: {\"action\": \"resume_sentinel\"}\n"
1847
- + " ACTION: {\"action\": \"trigger_poll\"}\n"
1848
- + " ACTION: {\"action\": \"create_issue\", \"description\": \"...\", \"target_repo\": \"\"}\n"
1849
- + " ACTION: {\"action\": \"search_logs\", \"query\": \"<whatever the user asked to find>\"}\n"
1850
- + "End with [DONE] if the request is fully handled."
1851
- )
1852
-
1853
- cfg = cfg_loader.sentinel
1854
- env = os.environ.copy()
1855
- if cfg.anthropic_api_key:
1856
- env["ANTHROPIC_API_KEY"] = cfg.anthropic_api_key
1857
-
1858
- try:
1859
- result = subprocess.run(
1860
- ([cfg.claude_code_bin, "--dangerously-skip-permissions", "--print", prompt]
1861
- if os.getuid() != 0 else
1862
- [cfg.claude_code_bin, "--print", prompt]),
1863
- capture_output=True, text=True, timeout=180, env=env,
1864
- )
1865
- output = (result.stdout or "").strip()
1866
- if result.returncode != 0 or not output:
1867
- stderr = (result.stderr or "").strip()
1868
- logger.error(
1869
- "Boss CLI call failed (rc=%d): stdout=%r stderr=%r",
1870
- result.returncode, output[:200], stderr[:200],
1871
- )
1872
- raw_err = (result.stderr or "").strip()
1873
- if result.returncode != 0 and not output:
1874
- full_err = f"exit {result.returncode}: {raw_err[:300]}"
1875
- cfg = cfg_loader.sentinel
1876
- alert_if_rate_limited(cfg.slack_bot_token, cfg.slack_channel,
1877
- "sentinel_boss/cli", raw_err or full_err)
1878
- return f":warning: `claude --print` failed ({full_err})", True
1879
- except Exception as e:
1880
- logger.error("Boss CLI call failed: %s", e)
1881
- return f":warning: Boss unavailable: {e}", True
1882
-
1883
- for m in _ACTION_RE.finditer(output):
1884
- try:
1885
- action = json.loads(m.group(1))
1886
- name = action.pop("action", "")
1887
- if name:
1888
- result_str = await _run_tool(name, action, cfg_loader, store, user_id=user_id)
1889
- logger.info("Boss CLI action: %s %s", name, result_str[:80])
1890
- except Exception as e:
1891
- logger.warning("Boss action parse error: %s", e)
1892
-
1893
- reply = _ACTION_RE.sub("", output).strip()
1894
- is_done = "[DONE]" in reply
1895
- reply = reply.replace("[DONE]", "").strip()
1896
- if not reply:
1897
- greeting = f"Hi {user_name}! " if user_name else "Hi! "
1898
- reply = f"{greeting}I'm Sentinel, your autonomous DevOps agent. How can I help you?"
1899
-
1900
- history.append({"role": "user", "content": message})
1901
- history.append({"role": "assistant", "content": reply})
1902
- return reply, is_done
1903
-
1904
-
1905
- # ── History serialization helpers ────────────────────────────────────────────
1906
-
1907
- def _serialize_content(content) -> list:
1908
- """Convert Anthropic SDK response content (Pydantic objects) to plain dicts.
1909
-
1910
- The SDK returns TextBlock / ToolUseBlock instances. json.dumps(..., default=str)
1911
- turns them into useless strings like "TextBlock(type='text', text='...')".
1912
- This converts them to proper dicts so history round-trips through SQLite safely.
1913
- """
1914
- if not isinstance(content, list):
1915
- return content
1916
- result = []
1917
- for block in content:
1918
- if isinstance(block, dict):
1919
- result.append(block)
1920
- elif hasattr(block, "model_dump"):
1921
- result.append(block.model_dump())
1922
- elif hasattr(block, "dict"):
1923
- result.append(block.dict())
1924
- elif hasattr(block, "type"):
1925
- if block.type == "text":
1926
- result.append({"type": "text", "text": getattr(block, "text", "")})
1927
- elif block.type == "tool_use":
1928
- result.append({
1929
- "type": "tool_use",
1930
- "id": getattr(block, "id", ""),
1931
- "name": getattr(block, "name", ""),
1932
- "input": getattr(block, "input", {}),
1933
- })
1934
- else:
1935
- result.append({"type": "text", "text": str(block)})
1936
- return result
1937
-
1938
-
1939
- def _clean_history(history: list) -> list:
1940
- """Remove turns that would cause a 400 from the Anthropic API.
1941
-
1942
- Strips orphaned tool_use blocks (assistant turn with tool_use but no
1943
- following tool_result turn) and consecutive same-role turns that result
1944
- from a previous session that crashed mid-tool-loop.
1945
- """
1946
- cleaned = []
1947
- i = 0
1948
- while i < len(history):
1949
- turn = history[i]
1950
- role = turn.get("role", "")
1951
- content = turn.get("content", [])
1952
-
1953
- # Drop assistant turns that contain tool_use if the next turn isn't tool_result
1954
- if role == "assistant" and isinstance(content, list):
1955
- has_tool_use = any(
1956
- (isinstance(b, dict) and b.get("type") == "tool_use")
1957
- for b in content
1958
- )
1959
- if has_tool_use:
1960
- next_turn = history[i + 1] if i + 1 < len(history) else None
1961
- next_content = (next_turn or {}).get("content", [])
1962
- has_result = isinstance(next_content, list) and any(
1963
- (isinstance(b, dict) and b.get("type") == "tool_result")
1964
- for b in next_content
1965
- )
1966
- if not has_result:
1967
- i += 1 # skip orphaned tool_use turn
1968
- continue
1969
-
1970
- # Drop consecutive same-role turns (keep the last one)
1971
- if cleaned and cleaned[-1].get("role") == role:
1972
- cleaned[-1] = turn
1973
- else:
1974
- cleaned.append(turn)
1975
- i += 1
1976
- return cleaned
1977
-
1978
-
1979
- # ── API-key path (structured tools, full agentic loop) ────────────────────────
1980
-
1981
- async def _handle_with_api(
1982
- message: str,
1983
- history: list,
1984
- cfg_loader,
1985
- store,
1986
- slack_client=None,
1987
- user_name: str = "",
1988
- user_id: str = "",
1989
- attachments: list | None = None,
1990
- channel: str = "",
1991
- is_admin: bool = False,
1992
- ) -> tuple[str, bool]:
1993
- import anthropic
1994
-
1995
- api_key = cfg_loader.sentinel.anthropic_api_key or os.environ.get("ANTHROPIC_API_KEY", "")
1996
- client = anthropic.Anthropic(api_key=api_key)
1997
-
1998
- paused = Path("SENTINEL_PAUSE").exists()
1999
- repos = list(cfg_loader.repos.keys())
2000
- ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
2001
- known_projects = [_read_project_name(d) for d in _find_project_dirs()]
2002
- log_sources = list(cfg_loader.log_sources.keys())
2003
- slack_mention = f"<@{user_id}>" if user_id else (user_name or "")
2004
- known_users = store.get_all_users() # {user_id: display_name}
2005
- users_hint = ", ".join(f"<@{uid}> = {name}" for uid, name in known_users.items())
2006
- system = (
2007
- _SYSTEM
2008
- + (f"\nYou are speaking with: {user_name} (Slack mention: {slack_mention})" if user_name else "")
2009
- + "\nAlways start your reply by addressing the user directly using their Slack mention, e.g. \"<@U123> here is what I found...\"."
2010
- + " Never use their plain name — always use the <@USER_ID> format so Slack highlights it."
2011
- + (f"\nKnown Slack users: {users_hint}" if users_hint else "")
2012
- + f"\n\nCurrent time: {ts}"
2013
- + f"\nSentinel status: {'⏸ PAUSED' if paused else '▶ RUNNING'}"
2014
- + f"\nManaged repos: {', '.join(repos) if repos else '(none configured)'}"
2015
- + (f"\nLog sources: {', '.join(log_sources)}" if log_sources else "")
2016
- + (f"\nKnown projects in workspace: {', '.join(known_projects)}" if known_projects else "")
2017
- + f"\nAdmin access for this user: {'YES — admin tools are available' if is_admin else 'NO — admin tools will be refused'}"
2018
- )
2019
-
2020
- # Build user content — include attachment blocks if any
2021
- attach_blocks = _attachments_to_api_blocks(attachments or [])
2022
- if attach_blocks:
2023
- user_content = attach_blocks + [{"type": "text", "text": message}]
2024
- else:
2025
- user_content = message
2026
-
2027
- # Work on a local copy — only commit to history on success to prevent
2028
- # cascading 400s if the API rejects a malformed/corrupted history.
2029
- messages = list(history) + [{"role": "user", "content": user_content}]
2030
-
2031
- while True:
2032
- response = client.messages.create(
2033
- model="claude-opus-4-6",
2034
- max_tokens=2048,
2035
- system=system,
2036
- tools=_TOOLS,
2037
- messages=messages,
2038
- )
2039
-
2040
- text_parts = []
2041
- tool_blocks = []
2042
- for block in response.content:
2043
- if block.type == "text":
2044
- text_parts.append(block.text)
2045
- elif block.type == "tool_use":
2046
- tool_blocks.append(block)
2047
-
2048
- if not tool_blocks:
2049
- reply = " ".join(text_parts).strip()
2050
- is_done = "[DONE]" in reply
2051
- reply = reply.replace("[DONE]", "").strip()
2052
- if not reply:
2053
- greeting = f"Hi {user_name}! " if user_name else "Hi! "
2054
- reply = f"{greeting}I'm Sentinel, your autonomous DevOps agent. How can I help you?"
2055
- # Heuristic override: if reply ends with a question, Claude is waiting for input
2056
- if is_done and re.search(r'\?\s*$', reply):
2057
- is_done = False
2058
- # Commit to history only on success — serialize SDK objects to plain dicts
2059
- history.append({"role": "user", "content": user_content})
2060
- history.append({"role": "assistant", "content": _serialize_content(response.content)})
2061
- return reply, is_done
2062
-
2063
- messages.append({"role": "assistant", "content": _serialize_content(response.content)})
2064
- tool_results = []
2065
- for tc in tool_blocks:
2066
- result = await _run_tool(tc.name, tc.input, cfg_loader, store, slack_client=slack_client, user_id=user_id, channel=channel, is_admin=is_admin)
2067
- logger.info("Boss tool: %s(%s) → %s", tc.name, tc.input, result[:120])
2068
- tool_results.append({
2069
- "type": "tool_result",
2070
- "tool_use_id": tc.id,
2071
- "content": result,
2072
- })
2073
- messages.append({"role": "user", "content": tool_results})
2074
-
2075
-
2076
- # ── Main entry point ──────────────────────────────────────────────────────────
2077
-
2078
- async def handle_message(
2079
- message: str,
2080
- history: list,
2081
- cfg_loader,
2082
- store,
2083
- slack_client=None,
2084
- user_name: str = "",
2085
- user_id: str = "",
2086
- attachments: list | None = None,
2087
- channel: str = "",
2088
- is_admin: bool = False,
2089
- ) -> tuple[str, bool]:
2090
- """
2091
- Process one user message through the Sentinel Boss (Claude with tool use).
2092
-
2093
- Priority:
2094
- 1. Claude Pro / OAuth via `claude --print` (CLI path — no API key needed)
2095
- 2. ANTHROPIC_API_KEY fallback (structured tools, full agentic loop)
2096
-
2097
- Returns:
2098
- (reply_text, is_done)
2099
- is_done=True → session complete, release the Slack queue slot.
2100
- is_done=False waiting for user follow-up, keep the slot.
2101
- """
2102
- api_key = cfg_loader.sentinel.anthropic_api_key or os.environ.get("ANTHROPIC_API_KEY", "")
2103
-
2104
- # 1st priority: ANTHROPIC_API_KEY — full structured tools, cheap per-token for Boss queries
2105
- if api_key:
2106
- try:
2107
- import anthropic # noqa: F401
2108
- return await _handle_with_api(
2109
- message, history, cfg_loader, store, slack_client=slack_client,
2110
- user_name=user_name, user_id=user_id, attachments=attachments, channel=channel,
2111
- is_admin=is_admin,
2112
- )
2113
- except Exception as api_err:
2114
- err_str = str(api_err)
2115
- # Detect rate-limit / auth failure and alert Slack before falling through
2116
- cfg = cfg_loader.sentinel
2117
- if is_rate_limited(err_str):
2118
- from .notify import rate_limit_message
2119
- alert_if_rate_limited(cfg.slack_bot_token, cfg.slack_channel,
2120
- "sentinel_boss/api", err_str)
2121
- logger.warning("Boss: API key path failed (%s), trying CLI fallback", err_str)
2122
-
2123
- # 2nd priority: Claude Pro / OAuth via CLI (limited tools but no API key needed)
2124
- cli_reply, cli_done = await _handle_with_cli(
2125
- message, history, cfg_loader, store, slack_client=slack_client, user_name=user_name,
2126
- user_id=user_id, attachments=attachments, is_admin=is_admin,
2127
- )
2128
- if not cli_reply.startswith(":warning:"):
2129
- return cli_reply, cli_done
2130
-
2131
- # Both paths failed — alert Slack and return error
2132
- cfg = cfg_loader.sentinel
2133
- err_output = cli_reply
2134
- alert_if_rate_limited(cfg.slack_bot_token, cfg.slack_channel,
2135
- "sentinel_boss/cli", err_output)
2136
- if not api_key:
2137
- # No auth at all configured
2138
- no_auth_msg = (
2139
- ":warning: *Sentinel Boss — no Claude auth configured*\n"
2140
- "Configure at least one of:\n"
2141
- " `ANTHROPIC_API_KEY` in `sentinel.properties` — full features\n"
2142
- "• Claude Pro OAuth: run `claude login` on the server — required for fix_engine\n"
2143
- "See: https://github.com/misterhuydo/Sentinel#authentication"
2144
- )
2145
- slack_alert(cfg.slack_bot_token, cfg.slack_channel, no_auth_msg)
2146
- return ":warning: No Claude authentication configured. See Slack for details.", True
2147
- return cli_reply, cli_done
1
+ """
2
+ sentinel_boss.py — Claude-backed Sentinel Boss.
3
+
4
+ Claude acts as the boss: reads project state, decides on actions,
5
+ executes them via tool use, and responds naturally. One agentic loop
6
+ per turn — Claude may call multiple tools before replying.
7
+ """
8
+
9
+ import json
10
+ import logging
11
+ import os
12
+ import re
13
+ import subprocess
14
+ import uuid
15
+ from datetime import datetime, timezone
16
+ from pathlib import Path
17
+ from typing import Optional
18
+
19
+ from .notify import alert_if_rate_limited, slack_alert, is_rate_limited
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+ # ── System prompt ────────────────────────────────────────────────────────────
24
+
25
+ _SYSTEM = """\
26
+ You are Sentinel Boss — the AI interface for Sentinel, a 24/7 autonomous DevOps agent.
27
+
28
+ Sentinel watches production logs, detects errors, generates code fixes via Claude Code,
29
+ and opens GitHub PRs for admin review (or pushes directly if AUTO_PUBLISH=true).
30
+
31
+ Your job:
32
+ - Understand what the DevOps engineer needs in natural language
33
+ - Query Sentinel's live state (errors, fixes, open PRs) on their behalf
34
+ - Deliver tasks/issues to the right project — you know all projects in this workspace
35
+ - Control Sentinel (pause/resume) when asked
36
+ - Give honest, concise answers — you know this system inside out
37
+ - If a project name is unclear or ambiguous, ask the engineer to clarify — never guess
38
+
39
+ What you can do (tools available):
40
+
41
+ 1. get_status — Show recent errors detected, fixes applied/pending, open PRs.
42
+ e.g. "what happened today?", "any issues?", "show open PRs"
43
+
44
+ 2. create_issue — Deliver a fix/task to any project in this workspace by short name.
45
+ You know all project names — use list_projects if you're unsure.
46
+ If the project name is ambiguous or not found, ask to clarify.
47
+ e.g. "tell 1881 to fix X", "look into Y in elprint", "investigate Z"
48
+
49
+ 3. pause_sentinel — Create SENTINEL_PAUSE file to halt all auto-fix activity.
50
+ e.g. "pause sentinel", "stop auto-fixing"
51
+
52
+ 4. resume_sentinel — Remove SENTINEL_PAUSE file to resume normal operation.
53
+ e.g. "resume sentinel", "unpause"
54
+
55
+ 5. list_projects — List all configured repos and log sources in this Sentinel instance.
56
+ e.g. "what projects are you watching?", "list all repos"
57
+
58
+ 6. search_logs — SSH live to servers and grep logs in real time (uses fetch_log.sh with
59
+ the query as GREP_FILTER). Falls back to cached files if unavailable.
60
+ e.g. "search logs for illegal PIN in 1881", "find X in SSOLWA", "grep logs for Z"
61
+
62
+ 6b. filter_logs Instant keyword/regex search on locally-synced logs. No SSH, sub-second.
63
+ Supports time range (since_hours) and case options.
64
+ e.g. "filter logs for TryDig", "find appid=X in STS logs", "errors last 6h"
65
+
66
+ 7. trigger_poll — Trigger an immediate poll cycle without waiting for the schedule.
67
+ e.g. "check now", "poll immediately", "don't wait, run now"
68
+
69
+ 8. get_repo_status Show the current git branch, last commit, and recent fix branches
70
+ for a specific repository.
71
+ e.g. "status of repo X", "what branch is cairn on?"
72
+
73
+ 9. list_recent_commits List the most recent commits in a repo (including Sentinel's auto-fixes).
74
+ e.g. "show me recent commits in elprint-sales", "what did sentinel commit?"
75
+
76
+ 10. get_fix_detail — Get full details of a specific fix: error, patch path, PR URL, status.
77
+ e.g. "show fix abc123", "details on that fix"
78
+
79
+ 11. list_errors — List recent errors from the state store, optionally filtered by repo or source.
80
+ e.g. "show all errors today", "what errors hit elprint this week?"
81
+
82
+ 12. pull_repo — Run git pull on one or all managed application repos.
83
+ e.g. "pull changes", "git pull all repos", "update the code"
84
+
85
+ 13. pull_config — Run git pull on one or all Sentinel project config dirs.
86
+ e.g. "pull config for 1881", "update sentinel config", "pull all configs"
87
+
88
+ 14. fetch_logs — Run fetch_log.sh on demand to pull fresh logs from remote servers right now.
89
+ Supports --debug mode and parameter overrides (tail count, grep filter).
90
+ e.g. "fetch logs", "try fetch_log.sh for SSOLWA", "fetch logs with debug",
91
+ "grab latest logs from STS", "fetch logs without filter"
92
+
93
+ 15. watch_bot — Register a Slack bot for passive monitoring. Every message it posts is
94
+ auto-queued as an issue in the bot's registered project.
95
+ ALWAYS requires a project infer from context or ask the user first.
96
+ e.g. "listen to @alertbot", "watch @bot1 @bot2 for project 1881", "monitor @errorbot"
97
+
98
+ 16. unwatch_bot — Remove a Slack bot from the passive watch list.
99
+ e.g. "stop watching @alertbot", "unwatch @errorbot"
100
+
101
+ 17. list_watched_botsShow all Slack bots currently being passively monitored and which projects
102
+ they are delivering to.
103
+ e.g. "which bots are you watching?", "list monitored bots"
104
+
105
+ 18. upgrade_sentinel Pull the latest Sentinel agent code, update Python deps, and restart the
106
+ process. Safe to run at any time no restart if already up to date.
107
+ e.g. "upgrade sentinel", "update sentinel", "upgrade yourself"
108
+
109
+ 19. ask_codebase — Ask any natural-language question about a managed repo's codebase.
110
+ Claude Code answers using its full knowledge of the code.
111
+ e.g. "what does the 1881 backend do?", "find PIN validation in elprint",
112
+ "any TODOs in cairn?", "are there security issues in elprint-sales?"
113
+
114
+ 20. restart_project Stop and restart a specific Sentinel monitoring instance (stop.sh + start.sh).
115
+ This restarts the Sentinel agent for that project, NOT the application itself.
116
+ e.g. "restart sentinel for 1881", "restart the 1881 monitor", "reload elprint sentinel"
117
+
118
+ 21. tail_log — Fetch the last N lines of a log source live, without a grep filter.
119
+ e.g. "show recent SSOLWA logs", "tail STS", "last 200 lines from 1881 logs"
120
+
121
+ 22. post_file — Upload a text file to the Slack conversation (diff, log excerpt, report, CSV).
122
+ Use when output is too large for chat, or the user asks to download/export something.
123
+ e.g. "give me that as a file", "export the log", "send me the diff"
124
+
125
+ When someone asks what you can do, what you support, what your capabilities are, or how you can help,
126
+ reply with a short summary grouped by category:
127
+
128
+ *Monitoring & status*
129
+ • `get_status` — errors detected, fixes applied/pending/failed, open PRs — "what happened today?"
130
+ • `get_repo_status` — per-repo breakdown of errors and fixes — "how is elprint doing?"
131
+ • `list_recent_commits` — recent Sentinel auto-fix commits — "what did Sentinel commit?"
132
+
133
+ *Log management*
134
+ • `fetch_logs` — pull fresh logs from servers right now — "fetch logs for SSOLWA"
135
+ • `search_logs` — live SSH grep on production servers — "search logs for illegal PIN in 1881"
136
+ • `filter_logs` — instant grep on locally-synced logs (no SSH) — "filter logs for TryDig", "show errors from last 24h"
137
+ • `tail_log` — last N lines of a log source, no filter — "show recent SSOLWA logs"
138
+
139
+ *Codebase questions*
140
+ • `ask_codebase` — any question about a repo's code — "what does 1881 do?", "find PIN validation", "any TODOs?", "security issues?"
141
+
142
+ *Fix management*
143
+ • `get_fix_details` — full details of a specific fix — "show fix abc123"
144
+ • `list_pending_prs` — all open Sentinel PRs awaiting review — "list open PRs"
145
+ • `check_auth_status` — Claude auth health, rate-limit circuit state, fix engine 24 h stats — "is Claude working?", "any rate limits?", "auth issues?"
146
+
147
+ *Project & task delivery*
148
+ • `list_projects` — all projects and repos Sentinel manages — "what projects do you manage?"
149
+ • `create_issue` — deliver a task to any project by name — "tell 1881 to fix X"
150
+ • `trigger_poll` — run a log-fetch + fix cycle right now — "check now"
151
+ • `pause_sentinel` / `resume_sentinel` — halt or resume all auto-fix activity — "pause Sentinel"
152
+
153
+ *Repo & config sync*
154
+ • `pull_repo` — git pull on managed application repos — "pull latest code"
155
+ • `pull_config` — git pull on Sentinel config dirs — "pull config for elprint"
156
+
157
+ *File sharing*
158
+ • `post_file` — upload a file to Slack — "give me that as a file", "export the log", "send me the diff"
159
+
160
+ *Personal*
161
+ • `my_stats` — your activity: issues submitted, fixes, conversation history — "my stats"
162
+ • `clear_my_history` — wipe your conversation history and start fresh — "clear my history"
163
+
164
+ *Slack bot watching*
165
+ • `list_watched_bots` — show all bots currently being monitored — "which bots are you watching?"
166
+
167
+ *Admin* (SLACK_ADMIN_USERS if configured, otherwise all allowed users)
168
+ • `watch_bot` — register a Slack bot for passive monitoring; its messages become issues — "listen to @alertbot"
169
+ • `unwatch_bot` — stop monitoring a bot — "stop watching @errorbot"
170
+ • `restart_project` — stop + restart a Sentinel monitoring instance (not the app) — "restart sentinel for 1881"
171
+ • `upgrade_sentinel` — pull latest Sentinel release and restart — "upgrade sentinel"
172
+ • `list_all_users` — all Slack users who have talked to Sentinel + activity summary
173
+ • `clear_user_history` — wipe a specific user's conversation history
174
+ • `reset_fingerprint` — clear the 24h fix lock so Sentinel retries an error
175
+ • `list_all_errors` — full unfiltered error database
176
+ • `export_db` — dump full Sentinel state as a downloadable file
177
+
178
+ About Sentinel — answer any question someone asks:
179
+
180
+ Sentinel is a 24/7 autonomous DevOps agent deployed per-project. Here is everything you know:
181
+
182
+ Architecture:
183
+ - Poll loop every POLL_INTERVAL_SECONDS (default 120s)
184
+ - Log sources: SSH servers (rsync + live grep) or Cloudflare worker endpoints
185
+ - Local sync: rsync --append-verify copies remote logs to workspace/synced/ every SYNC_INTERVAL_SECONDS (default 300s); full history accumulated locally
186
+ - Error detection: regex-based parsing, multi-line stack trace grouping, fingerprinting (hash of normalised message + top 3 stack frames)
187
+ - Dedup: SQLite state_store.db 24h cooldown per fingerprint, plus git log check before each fix
188
+ - Routing: TARGET_REPO=auto uses PACKAGE_PREFIXES to map stack trace frames to the correct repo; explicit TARGET_REPO overrides
189
+ - Fix engine: Claude Code headless (claude --print) with structured prompt (error + stack trace + Cairn MCP context); unified diff output; max 5 files / 200 lines
190
+ - Commit: git pull --rebase, apply patch, run tests, commit with sentinel/fix-<fp> marker
191
+ - Publish: AUTO_PUBLISH=true push to main + CI/CD trigger; AUTO_PUBLISH=false branch + GitHub PR
192
+ - Fix confirmation: SENTINEL marker injected into every modified method; when marker appears in production logs quiet period starts; after MARKER_CONFIRM_HOURS with no recurrence → fix confirmed
193
+
194
+ Health monitoring (HEALTH_URL per repo):
195
+ - Polls the URL on each cycle; expects JSON with "Status": "true"
196
+ - 502/503/504 or connection refused → status=stopped
197
+ - 200 + Status != true → status=failing
198
+ - stopped + startup failure in synced logs → auto-fix attempt (Spring Boot BeanCreationException, NoSuchMethodError, APPLICATION FAILED TO START, etc.)
199
+ - stopped + no startup errors → asks human ONCE "is this deliberate?", then stays silent (state=pending)
200
+ - Human says "maintenance <repo>" → state=confirmed, fully silent until recovery
201
+ - Recovery (health=healthy again) → clears state, posts "App X is back online"
202
+
203
+ Duplicate / cross-source dedup:
204
+ - Fingerprint-based: same error from monitor bot + log scan → same fingerprint → state_store dedup
205
+ - git log check: before each fix attempt, checks recent commits for the fingerprint — skips if already fixed
206
+ - 24h cooldown per fingerprint prevents retry spam
207
+
208
+ Slack Boss:
209
+ - Socket Mode (xapp-... app-level token + xoxb-... bot token)
210
+ - Per-user sessions with SQLite-persisted history (last 40 messages)
211
+ - Tool-use loop with Anthropic API (cheap per-token, structured tools)
212
+ - Falls back to claude CLI if no API key configured
213
+ - Admin users (SLACK_ADMIN_USERS) can access destructive/sensitive tools
214
+
215
+ Common config questions:
216
+ - ANTHROPIC_API_KEY: used by Boss conversation (structured tool-use, cheap); optional for Fix Engine when CLAUDE_PRO_FOR_TASKS=true
217
+ - CLAUDE_PRO_FOR_TASKS=true (default): Fix Engine calls claude CLI using Claude Pro OAuth billing; falls back to API key on auth error
218
+ - AUTO_PUBLISH=false (default): Sentinel opens a PR for admin review; =true: pushes directly to main
219
+ - SYNC_RETENTION_DAYS (default 30): delete synced log files older than N days
220
+ - SYNC_MAX_FILE_MB (default 200): truncate synced log files exceeding this size (drops oldest half of lines)
221
+ - HEALTH_URL: HTTP endpoint per repo; JSON with "Status": "true" = healthy
222
+ - TARGET_REPO=auto: route errors to repo by longest-matching PACKAGE_PREFIXES; =<name>: always route to that repo
223
+ - SLACK_ALLOWED_USERS: if set, only these Slack user IDs can interact with Boss
224
+ - SLACK_ADMIN_USERS: subset of allowed users with access to admin-only tools (reset_fingerprint, export_db, watch_bot, etc.)
225
+
226
+ Required Slack Bot Token scopes: app_mentions:read, channels:history, groups:history, im:history, chat:write, files:read, reactions:write, users:read, conversations.connect:read
227
+ Required App-Level Token scope (Socket Mode): connections:write
228
+ Events to subscribe: app_mention, message.im, message.channels
229
+
230
+ Tone: direct, professional, like a senior engineer who owns the system.
231
+ Don't pad responses. Don't say "Great question!" or "Certainly!".
232
+ If you don't know something, use a tool to find out before saying you don't know.
233
+
234
+ When to act vs. when to ask:
235
+ - Clear command ("check status", "fetch logs", "pause sentinel") → call the tool immediately, reply with results.
236
+ - Ambiguous or exploratory ("what does get_repo_status do?", "tell me about search_logs") → explain the tool naturally, then ask: "Want me to run it?"
237
+ - Unclear intent (could be either) → use judgment: brief explanation + "Want me to run this now?"
238
+ - Prefer filter_logs over search_logs when synced logs are available — it's instant and never causes session timeout.
239
+ Use search_logs only when the user explicitly wants live/real-time data or synced logs are not yet available.
240
+ - If a tool call will take a moment (search, fetch, pull), prefix your reply with a brief "working" line ending in "..." before the results, e.g. "Searching SSOLWA for TryDig activity..." then the actual output.
241
+ Never just say a working line and stop — always follow it with the results in the same message.
242
+
243
+ Search reasoning — always do this before calling filter_logs or search_logs:
244
+ 1. Interpret intent: what is the user actually looking for? Don't pass the raw message as the query.
245
+ Examples:
246
+ - "TryDig errors" → query="TryDig" (component name; look for it in any context)
247
+ - "payment failures last hour" → query="pay|payment|transaction", since_hours=1
248
+ - "why is the app crashing" → query="Exception|Error|FAILED|crash", look for stack traces
249
+ - "login issues today" → query="login|auth|401|403|session", since_hours=24
250
+ - "slow requests" → query="timeout|slow|latency|took [0-9]+ms|duration"
251
+ - "startup problems" → query="APPLICATION FAILED|BeanCreation|NoSuchMethod|ClassNotFound"
252
+ Use | in the regex to cover synonyms and related terms. Keep it focused — not too broad.
253
+ 2. Choose since_hours if a time window is implied ("last hour", "today", "this morning").
254
+ 3. Pick source if the user mentioned a specific service (SSOLWA, STS, etc.) or server.
255
+
256
+ After getting filter_logs results, always synthesize — never dump raw output:
257
+ - Lead with 1-2 sentences: total count, affected sources, dominant pattern.
258
+ e.g. "Found 47 matches across SSOLWA and STS — mostly NullPointerException in DigService (31 hits)."
259
+ - List the top 3-5 patterns with counts in plain language.
260
+ - Call out any notable time clustering (e.g. "spike between 10:23–10:47 UTC").
261
+ - Show 2-3 example lines at most — only the most informative ones.
262
+ - End with a recommendation if the pattern suggests something actionable:
263
+ e.g. "Looks like a dependency resolution issue — create an issue?" or "Pattern consistent with a null config value at startup."
264
+ - If total_matches=0, say so plainly and suggest what else to try.
265
+
266
+ Session context — critical rules:
267
+ - Loaded conversation history is prior-session background only. It may be hours or days old.
268
+ - NEVER say "the previous search", "I already fetched", "as I found earlier", or any phrase implying you already did part of the current task — unless a tool result appears in THIS response's tool calls.
269
+ - When handling a new request, call the tools fresh. Do not assume any prior tool result is still current or that any prior step "counts" toward the current task.
270
+ - The only exception: if the user explicitly asks about something from the history ("what did you find earlier?"), you may reference it — but note it is from a prior session.
271
+
272
+ Trust your tool results — never contradict them:
273
+ - If any search_logs call in this response returned total_matches > 0, you HAVE results. Report them.
274
+ - Never say "no results found" or "nothing was found" when a tool result shows total_matches > 0.
275
+ - If one source-specific call returns 0 but a broader call returned matches, use the broader results.
276
+ - A cached result with files_searched=0 is a source-name lookup failure, NOT an absence of log data.
277
+ Treat it as "source not recognised" and fall back to the broad search results you already have.
278
+
279
+ Avoid redundant tool calls (within a single response only — always run tools fresh for new requests):
280
+ - If a broad search (e.g. search_logs with no source filter) already returned results in THIS response, do NOT repeat the same search with a source filter to "refine" — use what you already fetched.
281
+ - If a tool call fails in THIS response, do NOT retry the entire search from scratch. Continue with what succeeded and note the failure.
282
+ - One pass per task: gather all needed data in a single round of tool calls, then produce the final answer.
283
+
284
+ Issue identification — before calling create_issue:
285
+ 1. Determine if the message is a REAL issue/task (bug report, feature request, investigation ask)
286
+ vs. a status question, tool query, or casual chat. If not an issue, just answer normally.
287
+ 2. If it IS an issue, gather what's needed before creating:
288
+ - Project: which project? If unclear, ask. Use list_projects if you need to check names.
289
+ - Context: what's the problem? Include everything: description, error text, steps to reproduce.
290
+ - Attachments: summarise any files/screenshots the user shared.
291
+ - Support URL: note any ticket/doc/link the user mentioned.
292
+ - Identity: always captured automatically from the Slack session.
293
+ 3. Populate `findings` with curated evidence — only when relevant and concise:
294
+ - If you ran search_logs, tail_log, ask_codebase, or get_status before creating the issue,
295
+ summarise only the findings directly related to this specific issue.
296
+ - Do NOT paste raw tool output. Summarise: which services, how often, key pattern, 1-3 example lines.
297
+ - If the search returned nothing relevant, or the issue is purely user-described with no log evidence, leave `findings` empty.
298
+ - The fix engine reads only the issue file. Give it signal, not noise — 500 words max.
299
+ 4. Before calling the tool, confirm with the user in natural language:
300
+ e.g. "I'll create an issue for project *1881* — here's what I have: [summary]. Look right?"
301
+ Wait for their confirmation before proceeding.
302
+ EXCEPTION: if the user's message already contains a clear project + unambiguous description,
303
+ skip the confirmation and create immediately — don't ask when nothing is unclear.
304
+ 5. After creating, tell them the issue was queued and Sentinel will pick it up on the next poll.
305
+
306
+ When the engineer's request is fully handled, end your LAST message with the token: [DONE]
307
+ IMPORTANT: Always write your actual reply text FIRST, then append [DONE] at the end. Example: "Hello! I'm Sentinel. [DONE]". Never output [DONE] as your only content.
308
+ For greetings like "hello" or empty messages, introduce yourself briefly and offer help, then end with [DONE].
309
+ If you need a follow-up from them, do NOT include [DONE] — wait for their next message.
310
+ """
311
+
312
+ # ── Tool definitions ─────────────────────────────────────────────────────────
313
+
314
+ _TOOLS = [
315
+ {
316
+ "name": "get_status",
317
+ "description": (
318
+ "Get recent errors, fixes applied, fixes pending review, and open PRs. "
319
+ "Use for: 'what happened today?', 'any issues?', 'how are things?', "
320
+ "'what are the open PRs?', 'did sentinel fix anything?'"
321
+ ),
322
+ "input_schema": {
323
+ "type": "object",
324
+ "properties": {
325
+ "hours": {
326
+ "type": "integer",
327
+ "description": "Look-back window in hours (default 24)",
328
+ "default": 24,
329
+ },
330
+ },
331
+ },
332
+ },
333
+ {
334
+ "name": "create_issue",
335
+ "description": (
336
+ "Deliver a confirmed issue/task to a Sentinel project instance. "
337
+ "Only call this after you have: (1) confirmed the message is a real issue or task, "
338
+ "(2) identified the target project, (3) gathered enough context, and "
339
+ "(4) confirmed with the user ('I'll create this issue for project X — does that look right?'). "
340
+ "Do NOT call this for status questions, tool queries, or casual chat."
341
+ ),
342
+ "input_schema": {
343
+ "type": "object",
344
+ "properties": {
345
+ "description": {
346
+ "type": "string",
347
+ "description": "Full problem/task description — include all context the user gave you",
348
+ },
349
+ "project": {
350
+ "type": "string",
351
+ "description": "Project short name (e.g. '1881', 'elprint'). Ask if unclear.",
352
+ },
353
+ "target_repo": {
354
+ "type": "string",
355
+ "description": "Specific repo within the project (omit to let Sentinel auto-route)",
356
+ },
357
+ "support_url": {
358
+ "type": "string",
359
+ "description": "Any URL the user shared (ticket, doc, screenshot link, etc.)",
360
+ },
361
+ "attachments_summary": {
362
+ "type": "string",
363
+ "description": "Summary of any files/screenshots the user attached",
364
+ },
365
+ "findings": {
366
+ "type": "string",
367
+ "description": (
368
+ "A concise, curated summary of evidence directly relevant to this issue "
369
+ "NOT raw tool output. Include only what the fix engine needs: "
370
+ "key error patterns, affected services, approximate frequency/timestamps, "
371
+ "and 1-3 representative log lines. Omit unrelated results. "
372
+ "Keep under 500 words. Leave empty if no tool results are relevant."
373
+ ),
374
+ },
375
+ },
376
+ "required": ["description"],
377
+ },
378
+ },
379
+ {
380
+ "name": "get_fix_details",
381
+ "description": "Get full details of a specific fix by fingerprint (8+ hex chars).",
382
+ "input_schema": {
383
+ "type": "object",
384
+ "properties": {
385
+ "fingerprint": {"type": "string"},
386
+ },
387
+ "required": ["fingerprint"],
388
+ },
389
+ },
390
+ {
391
+ "name": "list_pending_prs",
392
+ "description": "List all open Sentinel PRs awaiting admin review.",
393
+ "input_schema": {"type": "object", "properties": {}},
394
+ },
395
+ {
396
+ "name": "check_auth_status",
397
+ "description": (
398
+ "Check Claude authentication health, current rate-limit / usage-limit circuit state, "
399
+ "and fix engine stats for the last 24 h. "
400
+ "Use when someone asks: 'is Claude working?', 'any rate limits?', 'why aren't fixes running?', "
401
+ "'is the API key OK?', 'auth issues?', 'fix engine status'."
402
+ ),
403
+ "input_schema": {"type": "object", "properties": {}},
404
+ },
405
+ {
406
+ "name": "pause_sentinel",
407
+ "description": (
408
+ "Pause ALL Sentinel fix activity immediately. "
409
+ "Use when the engineer says 'pause', 'stop', 'freeze', or 'hold off'."
410
+ ),
411
+ "input_schema": {"type": "object", "properties": {}},
412
+ },
413
+ {
414
+ "name": "resume_sentinel",
415
+ "description": "Resume Sentinel fix activity after a pause.",
416
+ "input_schema": {"type": "object", "properties": {}},
417
+ },
418
+ {
419
+ "name": "list_projects",
420
+ "description": (
421
+ "List all projects (Sentinel instances) in this workspace and the repos "
422
+ "each one manages. Use for: 'what projects do you manage?', 'list projects', "
423
+ "'what repos are configured?', 'show me all projects'."
424
+ ),
425
+ "input_schema": {"type": "object", "properties": {}},
426
+ },
427
+ {
428
+ "name": "search_logs",
429
+ "description": (
430
+ "Search production logs for a keyword or pattern. "
431
+ "When a project or source is specified (or can be inferred), performs a LIVE fetch "
432
+ "via fetch_log.sh with the query as the grep filter — SSHes directly to the server. "
433
+ "Falls back to searching locally-cached log files when no source can be determined. "
434
+ "Use for: 'search logs for illegal PIN in 1881', 'find X in SSOLWA logs', "
435
+ "'what did user Y do?', 'show entries for appid=Z', 'grep logs for X'."
436
+ ),
437
+ "input_schema": {
438
+ "type": "object",
439
+ "properties": {
440
+ "query": {
441
+ "type": "string",
442
+ "description": "Keyword or regex to grep for",
443
+ },
444
+ "source": {
445
+ "type": "string",
446
+ "description": "Log source name to search (partial match against log-config filenames, e.g. 'SSOLWA', '1881'). Leave empty to search all sources.",
447
+ },
448
+ "max_matches": {
449
+ "type": "integer",
450
+ "description": "Max matching lines to return per source (default 30)",
451
+ "default": 30,
452
+ },
453
+ "tail": {
454
+ "type": "integer",
455
+ "description": (
456
+ "Number of log lines to fetch from the server before grepping (default: config value, typically 500). "
457
+ "Increase when the user asks for a longer time window — e.g. 'yesterday up to now' → use 5000-10000. "
458
+ "Higher values take longer but cover more history."
459
+ ),
460
+ },
461
+ },
462
+ "required": ["query"],
463
+ },
464
+ },
465
+ {
466
+ "name": "filter_logs",
467
+ "description": (
468
+ "Search locally-synced log files by keyword or regex — instant, no SSH required. "
469
+ "Use this for fast queries once logs are synced (check with list_projects if unsure). "
470
+ "Supports time-range filtering and case options. "
471
+ "Use for: 'find TryDig in synced logs', 'show errors from last 24h', "
472
+ "'filter logs for appid=X', 'search local logs for Y'."
473
+ ),
474
+ "input_schema": {
475
+ "type": "object",
476
+ "properties": {
477
+ "query": {
478
+ "type": "string",
479
+ "description": "Keyword or regex to search for",
480
+ },
481
+ "source": {
482
+ "type": "string",
483
+ "description": "Log source name (partial match, e.g. 'STS', 'SSOLWA'). Leave empty to search all synced sources.",
484
+ },
485
+ "since_hours": {
486
+ "type": "integer",
487
+ "description": "Only return lines from the last N hours (uses log line timestamps). Omit for all available history.",
488
+ },
489
+ "max_matches": {
490
+ "type": "integer",
491
+ "description": "Max matching lines to return per source file (default 50)",
492
+ "default": 50,
493
+ },
494
+ "case_sensitive": {
495
+ "type": "boolean",
496
+ "description": "Case-sensitive match (default false)",
497
+ "default": False,
498
+ },
499
+ },
500
+ "required": ["query"],
501
+ },
502
+ },
503
+ {
504
+ "name": "trigger_poll",
505
+ "description": (
506
+ "Trigger an immediate log-fetch and error-detection cycle without waiting "
507
+ "for the next scheduled interval. Use when: 'check now', 'run now', "
508
+ "'poll immediately', 'don't wait'."
509
+ ),
510
+ "input_schema": {"type": "object", "properties": {}},
511
+ },
512
+ {
513
+ "name": "get_repo_status",
514
+ "description": (
515
+ "Per-repository breakdown of errors detected and fixes applied. "
516
+ "Use for: 'how is repo X doing?', 'which repo has the most issues?', "
517
+ "'break down by repo'."
518
+ ),
519
+ "input_schema": {
520
+ "type": "object",
521
+ "properties": {
522
+ "hours": {
523
+ "type": "integer",
524
+ "description": "Look-back window in hours (default 24)",
525
+ "default": 24,
526
+ },
527
+ },
528
+ },
529
+ },
530
+ {
531
+ "name": "list_recent_commits",
532
+ "description": (
533
+ "List recent commits made by Sentinel across all managed repos. "
534
+ "Use for: 'what did Sentinel commit?', 'show recent auto-fixes', 'what was changed?'."
535
+ ),
536
+ "input_schema": {
537
+ "type": "object",
538
+ "properties": {
539
+ "limit": {
540
+ "type": "integer",
541
+ "description": "Max commits per repo (default 5)",
542
+ "default": 5,
543
+ },
544
+ },
545
+ },
546
+ },
547
+ {
548
+ "name": "pull_repo",
549
+ "description": (
550
+ "Run git pull on one or all managed repos to fetch latest changes from GitHub. "
551
+ "Use for: 'pull changes', 'git pull', 'update repo X', 'fetch latest code'."
552
+ ),
553
+ "input_schema": {
554
+ "type": "object",
555
+ "properties": {
556
+ "repo": {
557
+ "type": "string",
558
+ "description": "Repo name to pull (omit to pull all configured repos)",
559
+ },
560
+ },
561
+ },
562
+ },
563
+ {
564
+ "name": "pull_config",
565
+ "description": (
566
+ "Run git pull on one or all Sentinel project config directories. "
567
+ "Projects are matched by short name ('1881', 'elprint') or full dir name ('sentinel-1881'). "
568
+ "Use for: 'pull config for 1881', 'update sentinel config', 'pull all configs'."
569
+ ),
570
+ "input_schema": {
571
+ "type": "object",
572
+ "properties": {
573
+ "project": {
574
+ "type": "string",
575
+ "description": "Project short name or dir name to pull (omit for all projects)",
576
+ },
577
+ },
578
+ },
579
+ },
580
+ {
581
+ "name": "fetch_logs",
582
+ "description": (
583
+ "Run fetch_log.sh for one or all configured log sources to pull the latest logs "
584
+ "from remote servers right now. Use for: 'fetch logs', 'run fetch_log.sh', "
585
+ "'grab latest logs from SSOLWA', 'try fetch_log.sh for STS', "
586
+ "'pull logs from server', 'get fresh logs'."
587
+ ),
588
+ "input_schema": {
589
+ "type": "object",
590
+ "properties": {
591
+ "source": {
592
+ "type": "string",
593
+ "description": "Log source name to fetch (partial match, e.g. 'SSOLWA'). Omit to fetch all.",
594
+ },
595
+ "debug": {
596
+ "type": "boolean",
597
+ "description": "Run fetch_log.sh with --debug flag to show SSH/grep details",
598
+ "default": False,
599
+ },
600
+ "tail": {
601
+ "type": "integer",
602
+ "description": "Override TAIL lines (how many log lines to fetch)",
603
+ },
604
+ "grep_filter": {
605
+ "type": "string",
606
+ "description": "Override GREP_FILTER (regex). Pass 'none' to disable filtering.",
607
+ },
608
+ },
609
+ },
610
+ },
611
+ {
612
+ "name": "watch_bot",
613
+ "description": (
614
+ "Tell Sentinel to passively monitor a Slack bot — queuing its messages as issues. "
615
+ "Extract all <@UXXXXXX> user IDs from the message and pass them here. "
616
+ "Sentinel verifies each is actually a bot (not a human) before adding to the watch list. "
617
+ "IMPORTANT: a bot watcher is only useful if its issues can be delivered to a project. "
618
+ "Try to infer the project from context (bot name, prior messages, available projects). "
619
+ "If it cannot be determined, do NOT call this tool — instead ask the user which project "
620
+ "the bot's alerts belong to, then call this tool with the project filled in. "
621
+ "Use for: 'listen to @alertbot', 'watch @bot1 @bot2', 'monitor @errorbot'."
622
+ ),
623
+ "input_schema": {
624
+ "type": "object",
625
+ "properties": {
626
+ "user_ids": {
627
+ "type": "array",
628
+ "items": {"type": "string"},
629
+ "description": "Slack user IDs to watch — extract from <@UXXXXXX> patterns in the message",
630
+ },
631
+ "project": {
632
+ "type": "string",
633
+ "description": "Project short name this bot's issues should be routed to (e.g. '1881', 'elprint'). Infer from context or ask user before calling.",
634
+ },
635
+ },
636
+ "required": ["user_ids"],
637
+ },
638
+ },
639
+ {
640
+ "name": "unwatch_bot",
641
+ "description": (
642
+ "Stop Sentinel from monitoring a Slack bot. "
643
+ "Use for: 'stop watching @alertbot', 'unwatch @bot', 'remove @errorbot from watchers'."
644
+ ),
645
+ "input_schema": {
646
+ "type": "object",
647
+ "properties": {
648
+ "user_ids": {
649
+ "type": "array",
650
+ "items": {"type": "string"},
651
+ "description": "Slack user IDs to remove from the watch list",
652
+ },
653
+ },
654
+ "required": ["user_ids"],
655
+ },
656
+ },
657
+ {
658
+ "name": "list_watched_bots",
659
+ "description": (
660
+ "List all Slack bots Sentinel is currently monitoring passively. "
661
+ "Use for: 'who are you watching?', 'which bots are you monitoring?', 'list watched bots'."
662
+ ),
663
+ "input_schema": {"type": "object", "properties": {}},
664
+ },
665
+ {
666
+ "name": "upgrade_sentinel",
667
+ "description": (
668
+ "Upgrade the Sentinel agent itself: git pull the latest code, update Python deps, "
669
+ "then restart the process. Safe to call at any time — if already up to date, "
670
+ "no restart is triggered. "
671
+ "Use for: 'upgrade sentinel', 'update sentinel', 'upgrade yourself', "
672
+ "'pull latest sentinel code', 'restart sentinel after upgrade'."
673
+ ),
674
+ "input_schema": {"type": "object", "properties": {}},
675
+ },
676
+ {
677
+ "name": "ask_codebase",
678
+ "description": (
679
+ "Ask any natural-language question about a managed codebase. "
680
+ "Accepts a repo name (e.g. 'STS', 'elprint-sales') OR a project name (e.g. '1881', 'elprint') "
681
+ "— if a project name is given and it has multiple repos, all are queried. "
682
+ "Claude Code answers using its full codebase knowledge — no need to specify how. "
683
+ "Use for: 'what does 1881 do?', 'TODOs in 1881', 'find PIN validation in STS', "
684
+ "'security issues in elprint-sales?', 'summarize the cairn repo'."
685
+ ),
686
+ "input_schema": {
687
+ "type": "object",
688
+ "properties": {
689
+ "repo": {
690
+ "type": "string",
691
+ "description": "Repo name (e.g. 'STS', 'elprint-sales') OR project name (e.g. '1881', 'elprint') — project name queries all its repos",
692
+ },
693
+ "question": {
694
+ "type": "string",
695
+ "description": "Natural language question about the codebase",
696
+ },
697
+ },
698
+ "required": ["repo", "question"],
699
+ },
700
+ },
701
+ {
702
+ "name": "restart_project",
703
+ "description": (
704
+ "Stop and restart a specific Sentinel monitoring instance (runs stop.sh then start.sh). "
705
+ "This restarts the Sentinel agent process for that project — it does NOT restart the application itself. "
706
+ "Use when: 'restart sentinel for 1881', 'reload the 1881 monitor', 'restart elprint sentinel'. "
707
+ "Safer than restarting all projects at once."
708
+ ),
709
+ "input_schema": {
710
+ "type": "object",
711
+ "properties": {
712
+ "project": {
713
+ "type": "string",
714
+ "description": "Project short name or dir name (e.g. '1881', 'elprint')",
715
+ },
716
+ },
717
+ "required": ["project"],
718
+ },
719
+ },
720
+ {
721
+ "name": "my_stats",
722
+ "description": (
723
+ "Show the current user's personal Sentinel dashboard: "
724
+ "conversation history length, issues they submitted, and "
725
+ "a summary of Sentinel fix activity (errors caught, fixes applied, "
726
+ "fixes pending PR review, fixes confirmed live, fixes failed). "
727
+ "Use for: 'what have you done for me?', 'show my stats', "
728
+ "'how many issues have been fixed?', 'my history', 'summary', "
729
+ "'what did sentinel fix this week?', 'pending fixes', 'open PRs'."
730
+ ),
731
+ "input_schema": {
732
+ "type": "object",
733
+ "properties": {
734
+ "hours": {
735
+ "type": "integer",
736
+ "description": "Look-back window in hours (default 168 = 7 days)",
737
+ "default": 168,
738
+ },
739
+ },
740
+ },
741
+ },
742
+ {
743
+ "name": "clear_my_history",
744
+ "description": (
745
+ "Clear the current user's conversation history with Sentinel. "
746
+ "After clearing, future sessions start with no memory of past conversations. "
747
+ "Use for: 'clear my history', 'forget our conversation', "
748
+ "'start fresh', 'reset my context', 'wipe my history'."
749
+ ),
750
+ "input_schema": {"type": "object", "properties": {}},
751
+ },
752
+ {
753
+ "name": "tail_log",
754
+ "description": (
755
+ "Fetch the last N lines of a log source's live production logs without any grep filter. "
756
+ "Use when: 'show me recent SSOLWA logs', 'tail STS', 'what's happening in 1881 logs right now', "
757
+ "'show last 100 lines from SSOLWA'. Different from search_logs — no pattern required."
758
+ ),
759
+ "input_schema": {
760
+ "type": "object",
761
+ "properties": {
762
+ "source": {
763
+ "type": "string",
764
+ "description": "Log source name (partial match against log-config filenames, e.g. 'SSOLWA', 'STS')",
765
+ },
766
+ "lines": {
767
+ "type": "integer",
768
+ "description": "Number of recent lines to fetch (default 100)",
769
+ "default": 100,
770
+ },
771
+ },
772
+ "required": ["source"],
773
+ },
774
+ },
775
+ {
776
+ "name": "post_file",
777
+ "description": (
778
+ "Upload a text file directly to the Slack conversation so the user can read or download it. "
779
+ "Use when: output is too large for a chat message, the user asks to 'download', 'export', or "
780
+ "'send as a file', or when formatted content (diffs, logs, CSVs, reports) is clearer as a file. "
781
+ "e.g. 'give me that as a file', 'export the log', 'send me the diff for PR #41', "
782
+ "'download the health report', 'export recent errors as CSV'"
783
+ ),
784
+ "input_schema": {
785
+ "type": "object",
786
+ "properties": {
787
+ "content": {
788
+ "type": "string",
789
+ "description": "The full text content of the file to upload",
790
+ },
791
+ "filename": {
792
+ "type": "string",
793
+ "description": "Filename with extension, e.g. 'fix-ab12.diff', 'sentinel-report.txt', 'errors.csv', 'ssolwa.log'",
794
+ },
795
+ "title": {
796
+ "type": "string",
797
+ "description": "Optional display title shown above the file in Slack (defaults to filename)",
798
+ },
799
+ },
800
+ "required": ["content", "filename"],
801
+ },
802
+ },
803
+ {
804
+ "name": "list_all_users",
805
+ "description": (
806
+ "ADMIN ONLY. List all Slack users who have ever talked to Sentinel, "
807
+ "with their issue count and conversation message count. "
808
+ "e.g. 'list all users', 'who has talked to you?', 'show user activity'"
809
+ ),
810
+ "input_schema": {"type": "object", "properties": {}},
811
+ },
812
+ {
813
+ "name": "clear_user_history",
814
+ "description": (
815
+ "ADMIN ONLY. Clear the conversation history for a specific Slack user. "
816
+ "e.g. 'clear history for huy', 'reset bob's conversation'"
817
+ ),
818
+ "input_schema": {
819
+ "type": "object",
820
+ "properties": {
821
+ "user_id": {
822
+ "type": "string",
823
+ "description": "Slack user ID to clear (e.g. U01AB2CD3EF)",
824
+ },
825
+ },
826
+ "required": ["user_id"],
827
+ },
828
+ },
829
+ {
830
+ "name": "reset_fingerprint",
831
+ "description": (
832
+ "ADMIN ONLY. Remove the 24h fix lock for an error fingerprint so Sentinel will retry it "
833
+ "on the next poll cycle. Use when a fix attempt failed and you want to force a retry. "
834
+ "e.g. 'retry fix abc123', 'reset fingerprint abc123de', 'let Sentinel try that error again'"
835
+ ),
836
+ "input_schema": {
837
+ "type": "object",
838
+ "properties": {
839
+ "fingerprint": {
840
+ "type": "string",
841
+ "description": "Error fingerprint hash (8+ hex chars, from get_fix_details or list_all_errors)",
842
+ },
843
+ },
844
+ "required": ["fingerprint"],
845
+ },
846
+ },
847
+ {
848
+ "name": "list_all_errors",
849
+ "description": (
850
+ "ADMIN ONLY. Return the full unfiltered error database — all fingerprints, counts, "
851
+ "sources, and last-seen times. "
852
+ "e.g. 'show all errors', 'full error list', 'dump the error DB'"
853
+ ),
854
+ "input_schema": {
855
+ "type": "object",
856
+ "properties": {
857
+ "hours": {
858
+ "type": "integer",
859
+ "description": "Limit to errors seen in the last N hours (0 = all time)",
860
+ "default": 0,
861
+ },
862
+ },
863
+ },
864
+ },
865
+ {
866
+ "name": "export_db",
867
+ "description": (
868
+ "ADMIN ONLY. Export the full Sentinel state (errors, fixes, PRs, users) as a "
869
+ "downloadable text file posted to Slack. "
870
+ "e.g. 'export the DB', 'download state', 'give me a full report file'"
871
+ ),
872
+ "input_schema": {"type": "object", "properties": {}},
873
+ },
874
+ {
875
+ "name": "set_maintenance",
876
+ "description": (
877
+ "Confirm that a repo/app is deliberately stopped for maintenance. "
878
+ "Sentinel will silently monitor the health URL and notify when it comes back online. "
879
+ "Use when Sentinel asked if a 502/503 is deliberate. "
880
+ "e.g. 'yes it's maintenance', 'maintenance ssolwa', 'confirm ssolwa is down for maintenance'"
881
+ ),
882
+ "input_schema": {
883
+ "type": "object",
884
+ "properties": {
885
+ "repo_name": {
886
+ "type": "string",
887
+ "description": "Repo name as configured (from repo-configs/*.properties)",
888
+ },
889
+ "note": {
890
+ "type": "string",
891
+ "description": "Optional reason e.g. 'scheduled maintenance', 'dependency update'",
892
+ },
893
+ },
894
+ "required": ["repo_name"],
895
+ },
896
+ },
897
+ ]
898
+
899
+
900
+ # ── Workspace helpers ─────────────────────────────────────────────────────────
901
+
902
+ def _workspace_dir() -> Path:
903
+ return Path(".").resolve().parent
904
+
905
+ def _short_name(dir_name: str) -> str:
906
+ """'sentinel-1881' → '1881', 'sentinel-elprint' → 'elprint', others unchanged."""
907
+ if dir_name.startswith("sentinel-"):
908
+ return dir_name[len("sentinel-"):]
909
+ return dir_name
910
+
911
+ def _read_project_name(project_dir: Path) -> str:
912
+ """Return PROJECT_NAME from sentinel.properties if set, else fall back to _short_name(dir)."""
913
+ props = project_dir / "config" / "sentinel.properties"
914
+ if props.exists():
915
+ try:
916
+ for line in props.read_text(encoding="utf-8", errors="ignore").splitlines():
917
+ line = line.strip()
918
+ if line.startswith("PROJECT_NAME"):
919
+ _, _, val = line.partition("=")
920
+ val = val.partition("#")[0].strip()
921
+ if val:
922
+ return val
923
+ except Exception:
924
+ pass
925
+ return _short_name(project_dir.name)
926
+
927
def _find_project_dirs(target: str = "") -> list[Path]:
    """Locate sentinel project directories in the workspace.

    A project directory is any workspace entry (excluding 'code' and
    '.git') that contains a 'config' subdirectory. When *target* is
    non-empty, only directories whose full name, short name, or
    PROJECT_NAME contains it (case-insensitive) are kept; an empty
    *target* returns every project. Filesystem errors yield an empty /
    partial list rather than raising.
    """
    hits: list[Path] = []
    needle = target.lower()
    try:
        for entry in sorted(_workspace_dir().iterdir()):
            if entry.name in ("code", ".git") or not entry.is_dir():
                continue
            if not (entry / "config").exists():
                continue
            if needle:
                candidates = (
                    entry.name,
                    _short_name(entry.name),
                    _read_project_name(entry),
                )
                if not any(needle in c.lower() for c in candidates):
                    continue
            hits.append(entry)
    except Exception:
        pass
    return hits
947
+
948
+ def _git_pull(path: Path) -> dict:
949
+ try:
950
+ r = subprocess.run(
951
+ ["git", "pull", "--rebase", "origin"],
952
+ cwd=str(path), capture_output=True, text=True, timeout=60,
953
+ )
954
+ last = r.stdout.strip().splitlines()[-1] if r.stdout.strip() else "already up to date"
955
+ return {"status": "ok" if r.returncode == 0 else "error",
956
+ "detail": last if r.returncode == 0 else r.stderr.strip()}
957
+ except Exception as e:
958
+ return {"status": "error", "detail": str(e)}
959
+
960
+
961
+ # ── Log-source name resolver ──────────────────────────────────────────────────
962
+
963
+ def _filter_log_sources(props_files: list, source_hint: str) -> list:
964
+ """
965
+ Return the subset of props_files whose log source matches source_hint.
966
+
967
+ Matching is tried in order (first match wins per file):
968
+ 1. Substring of the filename stem (e.g. "sts" STS.properties)
969
+ 2. Substring of REMOTE_SERVICE_USER (e.g. "ssolwa" ...SSOLoginWebApp...)
970
+ 3. Substring of HOSTS (e.g. hostname fragment)
971
+
972
+ Case-insensitive throughout. An empty source_hint returns all files unchanged.
973
+ """
974
+ if not source_hint:
975
+ return props_files
976
+ hint = source_hint.lower()
977
+
978
+ def _props_contains(path: Path, key: str, hint: str) -> bool:
979
+ try:
980
+ for line in path.read_text(encoding="utf-8", errors="replace").splitlines():
981
+ stripped = line.strip()
982
+ if stripped.startswith("#"):
983
+ continue
984
+ if stripped.upper().startswith(key + "="):
985
+ val = stripped.split("=", 1)[1].partition("#")[0].strip().lower()
986
+ if hint in val:
987
+ return True
988
+ except OSError:
989
+ pass
990
+ return False
991
+
992
+ matched = []
993
+ for p in props_files:
994
+ if hint in p.stem.lower():
995
+ matched.append(p)
996
+ elif _props_contains(p, "REMOTE_SERVICE_USER", hint):
997
+ matched.append(p)
998
+ elif _props_contains(p, "HOSTS", hint):
999
+ matched.append(p)
1000
+ return matched
1001
+
1002
+
1003
+ # ── Tool execution ────────────────────────────────────────────────────────────
1004
+
1005
+ async def _run_tool(name: str, inputs: dict, cfg_loader, store, slack_client=None, user_id: str = "", channel: str = "", is_admin: bool = False) -> str:
1006
+ if name == "get_status":
1007
+ hours = int(inputs.get("hours", 24))
1008
+ errors = store.get_recent_errors(hours)
1009
+ fixes = store.get_recent_fixes(hours)
1010
+ prs = store.get_open_prs()
1011
+ top_errors = [
1012
+ {
1013
+ "message": e["message"][:120],
1014
+ "count": e["count"],
1015
+ "source": e["source"],
1016
+ "last_seen": e["last_seen"],
1017
+ }
1018
+ for e in errors[:8]
1019
+ ]
1020
+ return json.dumps({
1021
+ "window_hours": hours,
1022
+ "errors_detected": len(errors),
1023
+ "top_errors": top_errors,
1024
+ "fixes_applied": sum(1 for f in fixes if f["status"] == "applied"),
1025
+ "fixes_pending": sum(1 for f in fixes if f["status"] == "pending"),
1026
+ "fixes_failed": sum(1 for f in fixes if f["status"] == "failed"),
1027
+ "open_prs": [
1028
+ {
1029
+ "repo": p["repo_name"],
1030
+ "branch": p["branch"],
1031
+ "pr_url": p["pr_url"],
1032
+ "age": p.get("timestamp", ""),
1033
+ }
1034
+ for p in prs
1035
+ ],
1036
+ "sentinel_paused": Path("SENTINEL_PAUSE").exists(),
1037
+ })
1038
+
1039
+ if name == "check_auth_status":
1040
+ import subprocess as _sp
1041
+ from .notify import get_circuit_status
1042
+ cfg = cfg_loader.sentinel
1043
+
1044
+ # Auth configuration
1045
+ has_key = bool(cfg.anthropic_api_key)
1046
+ pro_for_tasks = cfg.claude_pro_for_tasks
1047
+ if pro_for_tasks and has_key:
1048
+ primary, fallback = "claude_pro_oauth", "api_key"
1049
+ elif pro_for_tasks:
1050
+ primary, fallback = "claude_pro_oauth", None
1051
+ else:
1052
+ primary, fallback = "api_key", "claude_pro_oauth" if not has_key else "claude_pro_oauth"
1053
+
1054
+ # Claude CLI liveness check
1055
+ cli_ok, cli_version = False, ""
1056
+ try:
1057
+ r = _sp.run(
1058
+ [cfg.claude_code_bin, "--version"],
1059
+ capture_output=True, text=True, timeout=10,
1060
+ )
1061
+ if r.returncode == 0:
1062
+ cli_ok = True
1063
+ cli_version = r.stdout.strip() or r.stderr.strip()
1064
+ except Exception:
1065
+ pass
1066
+
1067
+ # Circuit breaker snapshot — only open (unhealthy) circuits appear here
1068
+ circuits = get_circuit_status()
1069
+
1070
+ # Fix engine stats (last 24 h)
1071
+ recent = store.get_recent_fixes(hours=24)
1072
+ counts = {"applied": 0, "failed": 0, "skipped": 0, "pending": 0}
1073
+ last_success = None
1074
+ for f in recent:
1075
+ s = f.get("status", "")
1076
+ if s in counts:
1077
+ counts[s] += 1
1078
+ if s == "applied" and not last_success:
1079
+ last_success = f.get("timestamp", "")
1080
+
1081
+ overall = "healthy"
1082
+ if circuits:
1083
+ overall = "degraded — rate/auth limit active on: " + ", ".join(circuits)
1084
+ elif not cli_ok:
1085
+ overall = "warning claude CLI not reachable"
1086
+
1087
+ return json.dumps({
1088
+ "overall": overall,
1089
+ "auth": {
1090
+ "api_key_configured": has_key,
1091
+ "claude_pro_for_tasks": pro_for_tasks,
1092
+ "primary_method": primary,
1093
+ "fallback_method": fallback,
1094
+ },
1095
+ "claude_cli": {"available": cli_ok, "version": cli_version},
1096
+ "rate_limit_circuits": circuits,
1097
+ "fix_engine_24h": {**counts, "last_successful_fix": last_success},
1098
+ })
1099
+
1100
+ if name == "create_issue":
1101
+ description = inputs["description"]
1102
+ target_repo = inputs.get("target_repo", "")
1103
+ project_arg = inputs.get("project", "")
1104
+
1105
+ if project_arg:
1106
+ project_dirs = _find_project_dirs(project_arg)
1107
+ if not project_dirs:
1108
+ all_names = [_read_project_name(d) for d in _find_project_dirs()]
1109
+ return json.dumps({
1110
+ "error": f"No project found matching '{project_arg}'",
1111
+ "available_projects": all_names,
1112
+ "action_needed": "Ask the user which project they meant.",
1113
+ })
1114
+ if len(project_dirs) > 1:
1115
+ matches = [_read_project_name(d) for d in project_dirs]
1116
+ return json.dumps({
1117
+ "error": f"Ambiguous project name '{project_arg}' — matches: {matches}",
1118
+ "action_needed": "Ask the user to clarify which project they mean.",
1119
+ })
1120
+ project_dir = project_dirs[0]
1121
+ else:
1122
+ project_dir = Path(".")
1123
+
1124
+ support_url = inputs.get("support_url", "").strip()
1125
+ attachments_summary = inputs.get("attachments_summary", "").strip()
1126
+ findings = inputs.get("findings", "").strip()
1127
+
1128
+ issues_dir = project_dir / "issues"
1129
+ issues_dir.mkdir(exist_ok=True)
1130
+ fname = f"slack-{uuid.uuid4().hex[:8]}.txt"
1131
+
1132
+ submitter_name = store.get_user_name(user_id) if user_id else ""
1133
+ submitter_line = f"SUBMITTED_BY: {submitter_name} ({user_id})" if user_id else ""
1134
+ lines = []
1135
+ if submitter_line:
1136
+ lines.append(submitter_line)
1137
+ if target_repo:
1138
+ lines.append(f"TARGET_REPO: {target_repo}")
1139
+ if support_url:
1140
+ lines.append(f"SUPPORT_URL: {support_url}")
1141
+ lines.append(f"SUBMITTED_AT: {datetime.now(timezone.utc).isoformat()}")
1142
+ lines.append("")
1143
+ lines.append(description)
1144
+ if findings:
1145
+ lines.append(f"\nEVIDENCE (gathered by Sentinel Boss):\n{findings}")
1146
+ if attachments_summary:
1147
+ lines.append(f"\nATTACHMENTS:\n{attachments_summary}")
1148
+ content = "\n".join(lines)
1149
+ (issues_dir / fname).write_text(content, encoding="utf-8")
1150
+
1151
+ # Touch SENTINEL_POLL_NOW so the target instance picks it up immediately
1152
+ (project_dir / "SENTINEL_POLL_NOW").touch()
1153
+
1154
+ project_label = _read_project_name(project_dir.resolve()) if project_arg else "this project"
1155
+ logger.info("Boss created issue for %s: %s", project_label, fname)
1156
+ if user_id:
1157
+ try:
1158
+ store.record_submitted_issue(
1159
+ user_id=user_id,
1160
+ user_name=submitter_name,
1161
+ project=project_label,
1162
+ fname=fname,
1163
+ description=description,
1164
+ )
1165
+ except Exception as _rec_err:
1166
+ logger.debug("Boss: could not record submitted issue: %s", _rec_err)
1167
+ return json.dumps({
1168
+ "status": "queued",
1169
+ "project": project_label,
1170
+ "file": fname,
1171
+ "note": f"Delivered to '{project_label}'. Sentinel will process it on the next poll cycle.",
1172
+ })
1173
+
1174
+ if name == "get_fix_details":
1175
+ fp = inputs["fingerprint"]
1176
+ fix = store.get_confirmed_fix(fp) or store.get_marker_seen_fix(fp)
1177
+ if not fix:
1178
+ # Fallback: search recent fixes by prefix
1179
+ recent = store.get_recent_fixes(hours=72)
1180
+ fix = next((f for f in recent if f.get("fingerprint", "").startswith(fp)), None)
1181
+ return json.dumps(fix or {"error": "not found"})
1182
+
1183
+ if name == "list_pending_prs":
1184
+ prs = store.get_open_prs()
1185
+ return json.dumps({
1186
+ "count": len(prs),
1187
+ "open_prs": [
1188
+ {
1189
+ "repo": p["repo_name"],
1190
+ "branch": p["branch"],
1191
+ "pr_url": p["pr_url"],
1192
+ "timestamp": p.get("timestamp", ""),
1193
+ }
1194
+ for p in prs
1195
+ ],
1196
+ })
1197
+
1198
+ if name == "pause_sentinel":
1199
+ Path("SENTINEL_PAUSE").touch()
1200
+ logger.info("Boss: SENTINEL_PAUSE created")
1201
+ return json.dumps({"status": "paused"})
1202
+
1203
+ if name == "resume_sentinel":
1204
+ p = Path("SENTINEL_PAUSE")
1205
+ if p.exists():
1206
+ p.unlink()
1207
+ logger.info("Boss: SENTINEL_PAUSE removed")
1208
+ return json.dumps({"status": "resumed"})
1209
+
1210
+ if name == "list_projects":
1211
+ projects = []
1212
+ for d in _find_project_dirs():
1213
+ repo_cfg_dir = d / "config" / "repo-configs"
1214
+ repos_in_project = []
1215
+ if repo_cfg_dir.exists():
1216
+ for p in sorted(repo_cfg_dir.glob("*.properties")):
1217
+ if p.name.startswith("_"):
1218
+ continue
1219
+ repo_url = ""
1220
+ for line in p.read_text(encoding="utf-8", errors="ignore").splitlines():
1221
+ if line.startswith("REPO_URL"):
1222
+ repo_url = line.split("=", 1)[-1].strip()
1223
+ break
1224
+ repos_in_project.append({"repo": p.stem, "url": repo_url})
1225
+ projects.append({
1226
+ "project": _read_project_name(d),
1227
+ "dir": d.name,
1228
+ "running": (d / "sentinel.pid").exists(),
1229
+ "this": d.resolve() == Path(".").resolve(),
1230
+ "repos": repos_in_project,
1231
+ })
1232
+ return json.dumps({"projects": projects})
1233
+
1234
+ if name == "search_logs":
1235
+ query = inputs.get("query", "")
1236
+ source = inputs.get("source", "").lower()
1237
+ max_matches = int(inputs.get("max_matches", 30))
1238
+ tail_override = inputs.get("tail")
1239
+
1240
+ # ── Preferred path: search locally-synced files (instant, no SSH) ──────
1241
+ synced_base = Path("workspace/synced")
1242
+ if synced_base.exists():
1243
+ log_cfg_dir_s = Path("config") / "log-configs"
1244
+ candidate_sources = (
1245
+ [p.stem for p in _filter_log_sources(sorted(log_cfg_dir_s.glob("*.properties")), source)]
1246
+ if log_cfg_dir_s.exists() else
1247
+ [d.name for d in sorted(synced_base.iterdir()) if d.is_dir()]
1248
+ )
1249
+ synced_results = []
1250
+ try:
1251
+ qpat_s = re.compile(query, re.IGNORECASE)
1252
+ except re.error:
1253
+ qpat_s = re.compile(re.escape(query), re.IGNORECASE)
1254
+ for src_name in candidate_sources:
1255
+ src_dir = synced_base / src_name
1256
+ if not src_dir.is_dir():
1257
+ continue
1258
+ for log_file in sorted(src_dir.glob("*")):
1259
+ try:
1260
+ lines = log_file.read_text(encoding="utf-8", errors="replace").splitlines()
1261
+ matches = [ln[:300] for ln in lines if qpat_s.search(ln)][:max_matches]
1262
+ if matches:
1263
+ synced_results.append({"source": src_name, "file": log_file.name, "matches": matches})
1264
+ except Exception:
1265
+ pass
1266
+ if synced_results:
1267
+ total = sum(len(r["matches"]) for r in synced_results)
1268
+ return json.dumps({
1269
+ "query": query,
1270
+ "mode": "synced",
1271
+ "total_matches": total,
1272
+ "results": synced_results,
1273
+ "note": "Results from locally-synced files. No SSH needed.",
1274
+ })
1275
+
1276
+ # ── Live fetch path: SSH to servers and grep in real time ──────────────
1277
+ script = Path(__file__).resolve().parent.parent / "scripts" / "fetch_log.sh"
1278
+ log_cfg_dir = Path("config") / "log-configs"
1279
+ if script.exists() and log_cfg_dir.exists():
1280
+ props_files = _filter_log_sources(sorted(log_cfg_dir.glob("*.properties")), source)
1281
+ if props_files:
1282
+ live_results = []
1283
+ for props in props_files:
1284
+ env = os.environ.copy()
1285
+ env["GREP_FILTER"] = query
1286
+ if tail_override:
1287
+ env["TAIL"] = str(tail_override)
1288
+ try:
1289
+ r = subprocess.run(
1290
+ ["bash", str(script), str(props)],
1291
+ capture_output=True, text=True, timeout=60, env=env,
1292
+ )
1293
+ try:
1294
+ _qpat = re.compile(query, re.IGNORECASE)
1295
+ except re.error:
1296
+ _qpat = re.compile(re.escape(query), re.IGNORECASE)
1297
+ lines = (r.stdout or "").strip().splitlines()
1298
+ matches = [ln[:300] for ln in lines if _qpat.search(ln)][:max_matches]
1299
+ if matches:
1300
+ live_results.append({"source": props.stem, "matches": matches})
1301
+ logger.info("Boss search_logs live %s rc=%d found=%d", props.stem, r.returncode, len(matches))
1302
+ except subprocess.TimeoutExpired:
1303
+ live_results.append({"source": props.stem, "error": "timed out"})
1304
+ except Exception as e:
1305
+ live_results.append({"source": props.stem, "error": str(e)})
1306
+ total = sum(len(r.get("matches", [])) for r in live_results)
1307
+ return json.dumps({
1308
+ "query": query,
1309
+ "mode": "live",
1310
+ "total_matches": total,
1311
+ "results": live_results,
1312
+ "note": (
1313
+ "Results already include a per-source breakdown. "
1314
+ "Do NOT call search_logs again with a source filter to 'refine' — "
1315
+ "use these results directly."
1316
+ ) if total > 0 else None,
1317
+ })
1318
+
1319
+ # ── Fallback: search locally-cached log files ──────────────────────────
1320
+ # Reaching here means: live script unavailable OR source filter matched no config files.
1321
+ # A result with files_searched=0 means the source name wasn't recognised — NOT that
1322
+ # there are no log entries. Do not interpret this as "no results found".
1323
+ fetched_dir = Path("workspace/fetched")
1324
+ if not fetched_dir.exists():
1325
+ return json.dumps({
1326
+ "error": "No fetched logs found and fetch_log.sh unavailable",
1327
+ "note": "This is a config/setup problem, not a 'no results' answer.",
1328
+ })
1329
+ try:
1330
+ pattern = re.compile(query, re.IGNORECASE)
1331
+ except re.error as e:
1332
+ return json.dumps({"error": f"Invalid regex: {e}"})
1333
+ results = []
1334
+ for log_file in sorted(fetched_dir.glob("*.log")):
1335
+ if source and source not in log_file.name.lower():
1336
+ continue
1337
+ try:
1338
+ lines = log_file.read_text(encoding="utf-8", errors="ignore").splitlines()
1339
+ matches = [
1340
+ {"line": i + 1, "text": line[:300]}
1341
+ for i, line in enumerate(lines)
1342
+ if pattern.search(line)
1343
+ ][:max_matches]
1344
+ if matches:
1345
+ results.append({"file": log_file.name, "matches": matches})
1346
+ except Exception:
1347
+ pass
1348
+ total = sum(len(r["matches"]) for r in results)
1349
+ files_searched = len(list(fetched_dir.glob("*.log")))
1350
+ result = {
1351
+ "query": query,
1352
+ "mode": "cached",
1353
+ "total_matches": total,
1354
+ "files_searched": files_searched,
1355
+ "results": results,
1356
+ }
1357
+ if files_searched == 0:
1358
+ result["warning"] = (
1359
+ "Source name not recognised in cached files — this is a lookup failure, not 'no results'. "
1360
+ "If you already have results from a broader search_logs call, use those. Stop retrying."
1361
+ )
1362
+ return json.dumps(result)
1363
+
1364
+
1365
+ if name == "filter_logs":
1366
+ import re as _re
1367
+ from collections import Counter as _Counter
1368
+ from datetime import datetime, timedelta, timezone as _tz
1369
+
1370
+ # Extract a short grouping key from a log line for pattern analysis
1371
+ _EXC_PAT = _re.compile(r'([A-Z][a-zA-Z]+(?:Exception|Error|Failure|Fault|Warning))')
1372
+ _LVL_PAT = _re.compile(r'\b(ERROR|WARN(?:ING)?|CRITICAL|FATAL|SEVERE)\b', _re.IGNORECASE)
1373
+
1374
+ def _signature(line):
1375
+ exc = _EXC_PAT.search(line)
1376
+ if exc:
1377
+ return exc.group(1)
1378
+ m = _LVL_PAT.search(line)
1379
+ if m:
1380
+ after = line[m.end():].strip()
1381
+ token = after.split()[0].rstrip(':.,') if after.split() else ''
1382
+ if token and len(token) > 2:
1383
+ return m.group(1).upper() + ' ' + token[:40]
1384
+ return line.strip()[:40]
1385
+
1386
+ query_f = inputs.get("query", "")
1387
+ source_f = inputs.get("source", "").lower()
1388
+ since_hours = inputs.get("since_hours")
1389
+ max_matches = int(inputs.get("max_matches", 300))
1390
+ case_flag = 0 if inputs.get("case_sensitive") else _re.IGNORECASE
1391
+ try:
1392
+ pat = _re.compile(query_f, case_flag)
1393
+ except _re.error as e:
1394
+ return json.dumps({"error": f"Invalid regex: {e}"})
1395
+
1396
+ synced_base = Path("workspace/synced")
1397
+ if not synced_base.exists():
1398
+ return json.dumps({
1399
+ "error": "No synced logs found.",
1400
+ "hint": "Log sync runs every SYNC_INTERVAL_SECONDS (default 300s). "
1401
+ "If just started, wait a minute then try again.",
1402
+ })
1403
+
1404
+ # Build cutoff timestamp for since_hours filter
1405
+ cutoff = None
1406
+ if since_hours:
1407
+ cutoff = datetime.now(_tz.utc) - timedelta(hours=int(since_hours))
1408
+
1409
+ # Determine which source directories to search
1410
+ if source_f:
1411
+ src_dirs = [d for d in sorted(synced_base.iterdir())
1412
+ if d.is_dir() and source_f in d.name.lower()]
1413
+ else:
1414
+ src_dirs = [d for d in sorted(synced_base.iterdir()) if d.is_dir()]
1415
+
1416
+ if not src_dirs:
1417
+ available = [d.name for d in synced_base.iterdir() if d.is_dir()]
1418
+ return json.dumps({
1419
+ "error": f"No synced source matching '{source_f}'",
1420
+ "available_sources": available,
1421
+ })
1422
+
1423
+ results = []
1424
+ total_matches = 0
1425
+ for src_dir in src_dirs:
1426
+ for log_file in sorted(src_dir.glob("*")):
1427
+ try:
1428
+ lines = log_file.read_text(encoding="utf-8", errors="replace").splitlines()
1429
+ matches = []
1430
+ for line in lines:
1431
+ if not pat.search(line):
1432
+ continue
1433
+ if cutoff:
1434
+ # Try to parse timestamp from line
1435
+ from .log_fetcher import _parse_line_ts
1436
+ ts = _parse_line_ts(line)
1437
+ if ts and ts < cutoff:
1438
+ continue
1439
+ matches.append(line[:300])
1440
+ if len(matches) >= max_matches:
1441
+ break
1442
+ if matches:
1443
+ results.append({
1444
+ "source": src_dir.name,
1445
+ "file": log_file.name,
1446
+ "matches": matches,
1447
+ })
1448
+ total_matches += len(matches)
1449
+ except Exception:
1450
+ pass
1451
+
1452
+ if not results:
1453
+ return json.dumps({
1454
+ "query": query_f,
1455
+ "total_matches": 0,
1456
+ "sources_searched": [d.name for d in src_dirs],
1457
+ "note": "No matches found in synced logs.",
1458
+ })
1459
+
1460
+
1461
+ try:
1462
+ pat = _re.compile(query_f, case_flag)
1463
+ except _re.error as e:
1464
+ return json.dumps({"error": f"Invalid regex: {e}"})
1465
+
1466
+ synced_base = Path("workspace/synced")
1467
+ if not synced_base.exists():
1468
+ return json.dumps({
1469
+ "error": "No synced logs found.",
1470
+ "hint": "Log sync runs every SYNC_INTERVAL_SECONDS (default 300s). "
1471
+ "If just started, wait a minute then try again.",
1472
+ })
1473
+
1474
+ cutoff = None
1475
+ if since_hours:
1476
+ cutoff = datetime.now(_tz.utc) - timedelta(hours=int(since_hours))
1477
+
1478
+ if source_f:
1479
+ src_dirs = [d for d in sorted(synced_base.iterdir())
1480
+ if d.is_dir() and source_f in d.name.lower()]
1481
+ else:
1482
+ src_dirs = [d for d in sorted(synced_base.iterdir()) if d.is_dir()]
1483
+
1484
+ if not src_dirs:
1485
+ available = [d.name for d in synced_base.iterdir() if d.is_dir()]
1486
+ return json.dumps({
1487
+ "error": f"No synced source matching '{source_f}'",
1488
+ "available_sources": available,
1489
+ })
1490
+
1491
+ all_matches = [] # list of (source_name, line)
1492
+ sources_hit = set()
1493
+ for src_dir in src_dirs:
1494
+ for log_file in sorted(src_dir.glob("*")):
1495
+ try:
1496
+ lines = log_file.read_text(encoding="utf-8", errors="replace").splitlines()
1497
+ for line in lines:
1498
+ if not pat.search(line):
1499
+ continue
1500
+ if cutoff:
1501
+ from .log_fetcher import _parse_line_ts
1502
+ ts = _parse_line_ts(line)
1503
+ if ts and ts < cutoff:
1504
+ continue
1505
+ all_matches.append((src_dir.name, line[:300]))
1506
+ sources_hit.add(src_dir.name)
1507
+ if len(all_matches) >= max_matches:
1508
+ break
1509
+ except Exception:
1510
+ pass
1511
+ if len(all_matches) >= max_matches:
1512
+ break
1513
+
1514
+ total = len(all_matches)
1515
+ if total == 0:
1516
+ return json.dumps({
1517
+ "query": query_f,
1518
+ "total_matches": 0,
1519
+ "sources_searched": [d.name for d in src_dirs],
1520
+ "note": "No matches found in synced logs.",
1521
+ })
1522
+
1523
+ # Pattern grouping: count occurrences of each error signature
1524
+ sig_counter = _Counter()
1525
+ sig_examples = {}
1526
+ for src, line in all_matches:
1527
+ sig = _signature(line)
1528
+ sig_counter[sig] += 1
1529
+ if sig not in sig_examples:
1530
+ sig_examples[sig] = f"[{src}] {line}"
1531
+
1532
+ top_patterns = [
1533
+ {"pattern": sig, "count": cnt, "example": sig_examples[sig][:250]}
1534
+ for sig, cnt in sig_counter.most_common(10)
1535
+ ]
1536
+
1537
+ # Sample: first unique-signature line from each source
1538
+ sample_lines = []
1539
+ seen_sigs = set()
1540
+ for src, line in all_matches:
1541
+ sig = _signature(line)
1542
+ if sig not in seen_sigs:
1543
+ sample_lines.append(f"[{src}] {line}")
1544
+ seen_sigs.add(sig)
1545
+ if len(sample_lines) >= 10:
1546
+ break
1547
+
1548
+ # Time span
1549
+ time_span = {}
1550
+ try:
1551
+ from .log_fetcher import _parse_line_ts
1552
+ timestamps = [_parse_line_ts(ln) for _, ln in all_matches]
1553
+ timestamps = [t for t in timestamps if t]
1554
+ if timestamps:
1555
+ time_span = {
1556
+ "earliest": min(timestamps).strftime("%Y-%m-%d %H:%M:%S UTC"),
1557
+ "latest": max(timestamps).strftime("%Y-%m-%d %H:%M:%S UTC"),
1558
+ }
1559
+ except Exception:
1560
+ pass
1561
+
1562
+ return json.dumps({
1563
+ "query": query_f,
1564
+ "total_matches": total,
1565
+ "sources_hit": sorted(sources_hit),
1566
+ "sources_searched": [d.name for d in src_dirs],
1567
+ "top_patterns": top_patterns,
1568
+ "sample_lines": sample_lines,
1569
+ "time_span": time_span,
1570
+ "capped": total >= max_matches,
1571
+ })
1572
+
1573
+ if name == "trigger_poll":
1574
+ Path("SENTINEL_POLL_NOW").touch()
1575
+ logger.info("Boss: immediate poll requested")
1576
+ return json.dumps({"status": "triggered", "note": "Sentinel will run a poll cycle within seconds"})
1577
+
1578
+ if name == "get_repo_status":
1579
+ hours = int(inputs.get("hours", 24))
1580
+ fixes = store.get_recent_fixes(hours)
1581
+ errors = store.get_recent_errors(hours)
1582
+ by_repo: dict = {}
1583
+ for fix in fixes:
1584
+ repo = fix.get("repo_name", "unknown")
1585
+ s = by_repo.setdefault(repo, {"applied": 0, "pending": 0, "failed": 0, "skipped": 0})
1586
+ key = fix.get("status", "failed")
1587
+ s[key] = s.get(key, 0) + 1
1588
+ return json.dumps({"window_hours": hours, "total_errors": len(errors), "by_repo": by_repo})
1589
+
1590
+ if name == "list_recent_commits":
1591
+ limit = int(inputs.get("limit", 5))
1592
+ results = []
1593
+ for repo_name, repo in cfg_loader.repos.items():
1594
+ local = Path(repo.local_path)
1595
+ if not local.exists():
1596
+ continue
1597
+ try:
1598
+ r = subprocess.run(
1599
+ ["git", "log", "--oneline", "--grep=sentinel", "-n", str(limit)],
1600
+ cwd=str(local), capture_output=True, text=True, timeout=10,
1601
+ )
1602
+ commits = r.stdout.strip().splitlines()
1603
+ if commits:
1604
+ results.append({"repo": repo_name, "commits": commits})
1605
+ except Exception:
1606
+ pass
1607
+ return json.dumps({"sentinel_commits": results})
1608
+
1609
+ if name == "pull_repo":
1610
+ target = inputs.get("repo", "").lower()
1611
+ results = []
1612
+ for repo_name, repo in cfg_loader.repos.items():
1613
+ if target and target not in repo_name.lower():
1614
+ continue
1615
+ local = Path(repo.local_path)
1616
+ if not local.exists():
1617
+ results.append({"repo": repo_name, "status": "error", "detail": "local path not found"})
1618
+ continue
1619
+ try:
1620
+ r = subprocess.run(
1621
+ ["git", "pull", "--rebase", "origin", repo.branch],
1622
+ cwd=str(local), capture_output=True, text=True, timeout=60,
1623
+ )
1624
+ last_line = r.stdout.strip().splitlines()[-1] if r.stdout.strip() else "already up to date"
1625
+ if r.returncode == 0:
1626
+ results.append({"repo": repo_name, "status": "ok", "detail": last_line})
1627
+ else:
1628
+ results.append({"repo": repo_name, "status": "error", "detail": r.stderr.strip()})
1629
+ except Exception as e:
1630
+ results.append({"repo": repo_name, "status": "error", "detail": str(e)})
1631
+ return json.dumps({"results": results})
1632
+
1633
+ if name == "pull_config":
1634
+ target = inputs.get("project", "")
1635
+ dirs = _find_project_dirs(target)
1636
+ if not dirs:
1637
+ return json.dumps({"error": f"No project found matching '{target}'"})
1638
+ results = []
1639
+ for d in dirs:
1640
+ res = _git_pull(d)
1641
+ results.append({"project": _read_project_name(d), "dir": d.name, **res})
1642
+ logger.info("Boss: pull_config %s → %s", d.name, res["status"])
1643
+ return json.dumps({"results": results})
1644
+
1645
+ if name == "fetch_logs":
1646
+ source_filter = inputs.get("source", "").lower()
1647
+ debug = bool(inputs.get("debug", False))
1648
+ tail_override = inputs.get("tail")
1649
+ grep_override = inputs.get("grep_filter", "")
1650
+
1651
+ # Find fetch_log.sh relative to this file
1652
+ script = Path(__file__).resolve().parent.parent / "scripts" / "fetch_log.sh"
1653
+ if not script.exists():
1654
+ return json.dumps({"error": f"fetch_log.sh not found at {script}"})
1655
+
1656
+ log_cfg_dir = Path("config") / "log-configs"
1657
+ if not log_cfg_dir.exists():
1658
+ return json.dumps({"error": "config/log-configs/ not found"})
1659
+
1660
+ props_files = _filter_log_sources(sorted(log_cfg_dir.glob("*.properties")), source_filter)
1661
+ if not props_files:
1662
+ return json.dumps({"error": f"No log-config found matching '{source_filter}'"})
1663
+
1664
+ results = []
1665
+ for props in props_files:
1666
+ env = os.environ.copy()
1667
+ if tail_override:
1668
+ env["TAIL"] = str(tail_override)
1669
+ if grep_override:
1670
+ env["GREP_FILTER"] = grep_override
1671
+
1672
+ cmd = ["bash", str(script)]
1673
+ if debug:
1674
+ cmd.append("--debug")
1675
+ cmd.append(str(props))
1676
+
1677
+ try:
1678
+ r = subprocess.run(
1679
+ cmd, capture_output=True, text=True, timeout=120, env=env,
1680
+ )
1681
+ output = (r.stdout or "").strip()
1682
+ stderr = (r.stderr or "").strip()
1683
+ results.append({
1684
+ "source": props.stem,
1685
+ "returncode": r.returncode,
1686
+ "output": output[-2000:] if output else "",
1687
+ "stderr": stderr[-1000:] if stderr else "",
1688
+ })
1689
+ logger.info("Boss fetch_logs %s rc=%d", props.stem, r.returncode)
1690
+ except subprocess.TimeoutExpired:
1691
+ results.append({"source": props.stem, "error": "timed out after 120s"})
1692
+ except Exception as e:
1693
+ results.append({"source": props.stem, "error": str(e)})
1694
+
1695
+ return json.dumps({"fetched": len(results), "results": results})
1696
+
1697
+ if name == "watch_bot":
1698
+ if not is_admin:
1699
+ return json.dumps({"error": "Admin access required to register bots for monitoring."})
1700
+ user_ids = inputs.get("user_ids", [])
1701
+ project_arg = inputs.get("project", "").strip()
1702
+ if not user_ids:
1703
+ return json.dumps({"error": "No user_ids provided"})
1704
+
1705
+ # Resolve + validate project required for bot issue routing
1706
+ resolved_project = ""
1707
+ if project_arg:
1708
+ project_dirs = _find_project_dirs(project_arg)
1709
+ if not project_dirs:
1710
+ all_names = [_read_project_name(d) for d in _find_project_dirs()]
1711
+ return json.dumps({
1712
+ "error": f"No project found matching '{project_arg}'",
1713
+ "available_projects": all_names,
1714
+ "action_needed": "Ask the user which project these bot alerts belong to.",
1715
+ })
1716
+ if len(project_dirs) > 1:
1717
+ matches = [_read_project_name(d) for d in project_dirs]
1718
+ return json.dumps({
1719
+ "error": f"Ambiguous project name '{project_arg}' — matches: {matches}",
1720
+ "action_needed": "Ask the user to clarify which project.",
1721
+ })
1722
+ resolved_project = _read_project_name(project_dirs[0])
1723
+ else:
1724
+ all_projects = _find_project_dirs()
1725
+ if len(all_projects) == 1:
1726
+ # Single project in workspace — auto-assign
1727
+ resolved_project = _read_project_name(all_projects[0])
1728
+ elif all_projects:
1729
+ all_names = [_read_project_name(d) for d in all_projects]
1730
+ return json.dumps({
1731
+ "error": "Cannot determine which project these bot alerts belong to.",
1732
+ "available_projects": all_names,
1733
+ "action_needed": "Ask the user to specify the project, then retry with project filled in.",
1734
+ })
1735
+
1736
+ results = []
1737
+ for uid in user_ids:
1738
+ if not slack_client:
1739
+ results.append({"user_id": uid, "status": "error", "reason": "no Slack client available"})
1740
+ continue
1741
+ try:
1742
+ info = await slack_client.users_info(user=uid)
1743
+ user = info.get("user", {})
1744
+ if not user.get("is_bot", False):
1745
+ results.append({"user_id": uid, "status": "skipped", "reason": "not a bot — only bots can be watched passively"})
1746
+ continue
1747
+ bot_name = user.get("real_name") or user.get("name") or uid
1748
+ store.add_watched_bot(uid, bot_name, added_by="boss", project_name=resolved_project)
1749
+ logger.info("Boss: now watching bot %s (%s) → project '%s'", bot_name, uid, resolved_project or "unset")
1750
+ results.append({"user_id": uid, "bot_name": bot_name, "project": resolved_project, "status": "watching"})
1751
+ except Exception as e:
1752
+ results.append({"user_id": uid, "status": "error", "reason": str(e)})
1753
+ return json.dumps({"results": results})
1754
+
1755
+ if name == "unwatch_bot":
1756
+ if not is_admin:
1757
+ return json.dumps({"error": "Admin access required to remove bots from monitoring."})
1758
+ user_ids = inputs.get("user_ids", [])
1759
+ if not user_ids:
1760
+ return json.dumps({"error": "No user_ids provided"})
1761
+ results = []
1762
+ for uid in user_ids:
1763
+ removed = store.remove_watched_bot(uid)
1764
+ logger.info("Boss: unwatch bot %s → %s", uid, "removed" if removed else "not found")
1765
+ results.append({"user_id": uid, "status": "removed" if removed else "not found"})
1766
+ return json.dumps({"results": results})
1767
+
1768
+ if name == "list_watched_bots":
1769
+ bots = store.get_watched_bots()
1770
+ return json.dumps({
1771
+ "count": len(bots),
1772
+ "bots": [
1773
+ {
1774
+ "bot_id": b["bot_id"],
1775
+ "bot_name": b["bot_name"],
1776
+ "project": b.get("project_name") or "",
1777
+ "added_by": b["added_by"],
1778
+ "added_at": b["added_at"],
1779
+ }
1780
+ for b in bots
1781
+ ],
1782
+ })
1783
+
1784
+ if name == "upgrade_sentinel":
1785
+ if not is_admin:
1786
+ return json.dumps({"error": "Admin access required to upgrade Sentinel."})
1787
+ import threading
1788
+
1789
+ # Sentinel is installed via npm — use `sentinel upgrade` which handles
1790
+ # npm install + Python bundle copy + restart via stopAll/startAll.
1791
+ # Run it in the background after a short delay so the Slack reply is
1792
+ # sent before the process is replaced.
1793
+ try:
1794
+ r = subprocess.run(
1795
+ ["sentinel", "--version"],
1796
+ capture_output=True, text=True, timeout=10,
1797
+ )
1798
+ sentinel_bin_ok = r.returncode == 0
1799
+ except Exception:
1800
+ sentinel_bin_ok = False
1801
+
1802
+ if not sentinel_bin_ok:
1803
+ return json.dumps({
1804
+ "status": "error",
1805
+ "note": "`sentinel` CLI not found. Run: npm install -g @misterhuydo/sentinel",
1806
+ })
1807
+
1808
+ def _do_upgrade():
1809
+ import time
1810
+ time.sleep(10) # give Slack time to post the reply
1811
+ subprocess.Popen(["sentinel", "upgrade"], close_fds=True)
1812
+
1813
+ threading.Thread(target=_do_upgrade, daemon=True).start()
1814
+ logger.info("Boss: upgrade_sentinel scheduled via `sentinel upgrade`")
1815
+ return json.dumps({
1816
+ "status": "ok",
1817
+ "note": "Upgrade started pulling latest version via npm and restarting. Give me ~30 seconds then I'll be back.",
1818
+ })
1819
+
1820
+ if name == "ask_codebase":
1821
+ target = inputs.get("repo", "").lower()
1822
+ question = inputs.get("question", "")
1823
+
1824
+ # 1. Find repos whose name contains the target (e.g. "STS", "elprint-sales")
1825
+ matched = [(rn, r) for rn, r in cfg_loader.repos.items() if target in rn.lower()]
1826
+
1827
+ # 2. No repo match — check if target is a project name → use ALL repos in cfg_loader
1828
+ # (each Sentinel instance is scoped to one project, so all repos belong to it)
1829
+ if not matched:
1830
+ current_project = _read_project_name(Path("."))
1831
+ if target in current_project.lower() or current_project.lower() in target:
1832
+ matched = list(cfg_loader.repos.items())
1833
+
1834
+ if not matched:
1835
+ return json.dumps({
1836
+ "error": f"No repo or project found matching '{target}'",
1837
+ "available_repos": list(cfg_loader.repos.keys()),
1838
+ })
1839
+
1840
+ cfg = cfg_loader.sentinel
1841
+ env = os.environ.copy()
1842
+ # Only inject API key when Claude Pro is NOT preferred for heavy tasks
1843
+ if cfg.anthropic_api_key and not cfg.claude_pro_for_tasks:
1844
+ env["ANTHROPIC_API_KEY"] = cfg.anthropic_api_key
1845
+
1846
+ def _ask_one(repo_name, repo_cfg) -> dict:
1847
+ local_path = Path(repo_cfg.local_path)
1848
+ if not local_path.exists():
1849
+ return {"repo": repo_name, "error": f"not cloned yet at {local_path}"}
1850
+ prompt = (
1851
+ f"You are a code analyst. Answer the following question about the codebase at: {local_path}\n\n"
1852
+ f"Question: {question}\n\n"
1853
+ f"Use whatever tools you need to answer accurately. Be concise and direct. Plain text only."
1854
+ )
1855
+ try:
1856
+ r = subprocess.run(
1857
+ ([cfg.claude_code_bin, "--dangerously-skip-permissions", "--print", prompt]
1858
+ if os.getuid() != 0 else
1859
+ [cfg.claude_code_bin, "--print", prompt]),
1860
+ capture_output=True, text=True, timeout=180, env=env,
1861
+ cwd=str(local_path),
1862
+ )
1863
+ output = (r.stdout or "").strip()
1864
+ logger.info("Boss ask_codebase %s rc=%d len=%d", repo_name, r.returncode, len(output))
1865
+ if r.returncode != 0 and not output:
1866
+ raw_err = (r.stderr or "")
1867
+ alert_if_rate_limited(
1868
+ cfg.slack_bot_token, cfg.slack_channel,
1869
+ f"ask_codebase/{repo_name}", raw_err,
1870
+ )
1871
+ return {"repo": repo_name, "error": f"claude --print failed (rc={r.returncode}): {raw_err[:200]}"}
1872
+ return {"repo": repo_name, "answer": output[:3000]}
1873
+ except subprocess.TimeoutExpired:
1874
+ return {"repo": repo_name, "error": "timed out after 180s"}
1875
+ except Exception as e:
1876
+ return {"repo": repo_name, "error": str(e)}
1877
+
1878
+ if len(matched) == 1:
1879
+ result = _ask_one(*matched[0])
1880
+ # Unwrap single-repo result for cleaner response
1881
+ return json.dumps(result)
1882
+
1883
+ # Multiple repos — query each and combine
1884
+ results = [_ask_one(rn, r) for rn, r in matched]
1885
+ return json.dumps({"project": target, "repos_queried": len(results), "results": results})
1886
+
1887
+ if name == "restart_project":
1888
+ if not is_admin:
1889
+ return json.dumps({"error": "Admin access required to restart a project."})
1890
+ project_arg = inputs.get("project", "").lower()
1891
+ dirs = _find_project_dirs(project_arg)
1892
+ if not dirs:
1893
+ return json.dumps({"error": f"No project found matching '{project_arg}'"})
1894
+ results = []
1895
+ for d in dirs:
1896
+ stop_sh = d / "stop.sh"
1897
+ start_sh = d / "start.sh"
1898
+ if not stop_sh.exists() or not start_sh.exists():
1899
+ results.append({"project": d.name, "status": "error", "detail": "stop.sh or start.sh not found"})
1900
+ continue
1901
+ try:
1902
+ subprocess.run(["bash", str(stop_sh)], cwd=str(d), timeout=30)
1903
+ subprocess.run(["bash", str(start_sh)], cwd=str(d), timeout=30)
1904
+ results.append({"project": d.name, "status": "restarted"})
1905
+ logger.info("Boss: restarted project %s", d.name)
1906
+ except Exception as e:
1907
+ results.append({"project": d.name, "status": "error", "detail": str(e)})
1908
+ return json.dumps({"results": results})
1909
+
1910
+ if name == "tail_log":
1911
+ source = inputs.get("source", "").lower()
1912
+ lines = int(inputs.get("lines", 100))
1913
+ script = Path(__file__).resolve().parent.parent / "scripts" / "fetch_log.sh"
1914
+ log_cfg_dir = Path("config") / "log-configs"
1915
+
1916
+ if not script.exists():
1917
+ return json.dumps({"error": "fetch_log.sh not found"})
1918
+ if not log_cfg_dir.exists():
1919
+ return json.dumps({"error": "config/log-configs/ not found"})
1920
+
1921
+ props_files = sorted(log_cfg_dir.glob("*.properties"))
1922
+ if source:
1923
+ props_files = [p for p in props_files if source in p.stem.lower()]
1924
+ if not props_files:
1925
+ return json.dumps({"error": f"No log-config found matching '{source}'"})
1926
+
1927
+ results = []
1928
+ for props in props_files:
1929
+ env = os.environ.copy()
1930
+ env["TAIL"] = str(lines)
1931
+ env["GREP_FILTER"] = "" # no filter — show everything
1932
+ try:
1933
+ r = subprocess.run(
1934
+ ["bash", str(script), str(props)],
1935
+ capture_output=True, text=True, timeout=60, env=env,
1936
+ )
1937
+ tail_lines = (r.stdout or "").strip().splitlines()[-lines:]
1938
+ results.append({
1939
+ "source": props.stem,
1940
+ "lines": len(tail_lines),
1941
+ "content": "\n".join(tail_lines),
1942
+ })
1943
+ logger.info("Boss tail_log %s rc=%d lines=%d", props.stem, r.returncode, len(tail_lines))
1944
+ except subprocess.TimeoutExpired:
1945
+ results.append({"source": props.stem, "error": "timed out"})
1946
+ except Exception as e:
1947
+ results.append({"source": props.stem, "error": str(e)})
1948
+ return json.dumps({"results": results})
1949
+
1950
+ if name == "post_file":
1951
+ if not slack_client or not channel:
1952
+ return json.dumps({"error": "No Slack channel context — cannot upload file"})
1953
+ content = inputs.get("content", "")
1954
+ filename = inputs.get("filename", "sentinel-output.txt")
1955
+ title = inputs.get("title", filename)
1956
+ if not content:
1957
+ return json.dumps({"error": "No content provided"})
1958
+ try:
1959
+ await slack_client.files_upload_v2(
1960
+ channel=channel,
1961
+ content=content,
1962
+ filename=filename,
1963
+ title=title,
1964
+ )
1965
+ logger.info("Boss post_file: uploaded %s (%d bytes) to %s", filename, len(content), channel)
1966
+ return json.dumps({"ok": True, "filename": filename, "bytes": len(content)})
1967
+ except Exception as e:
1968
+ logger.warning("Boss post_file failed: %s", e)
1969
+ return json.dumps({"error": str(e)})
1970
+
1971
+ if name == "my_stats":
1972
+ hours = int(inputs.get("hours", 168))
1973
+ errors = store.get_recent_errors(hours)
1974
+ fixes = store.get_recent_fixes(hours)
1975
+ prs = store.get_open_prs()
1976
+ pending_conf = store.get_fixes_pending_confirmation()
1977
+ # Conversation stats
1978
+ history = store.load_conversation(user_id) if user_id else []
1979
+ hist_len = len(history)
1980
+ # Load conversation updated_at from DB
1981
+ conv_updated = ""
1982
+ try:
1983
+ import sqlite3 as _sqlite3
1984
+ with _sqlite3.connect(store.db_path) as _db:
1985
+ row = _db.execute(
1986
+ "SELECT updated_at FROM conversations WHERE user_id=?", (user_id,)
1987
+ ).fetchone()
1988
+ if row:
1989
+ conv_updated = row[0]
1990
+ except Exception:
1991
+ pass
1992
+ # Tally fix statuses
1993
+ by_status: dict = {}
1994
+ for fix in fixes:
1995
+ s = fix.get("status", "unknown")
1996
+ by_status[s] = by_status.get(s, 0) + 1
1997
+ # Fixes confirmed via sentinel marker in prod
1998
+ confirmed = [f for f in fixes if f.get("fix_outcome") == "confirmed"]
1999
+ regressed = [f for f in fixes if f.get("fix_outcome") == "regressed"]
2000
+ submitted = store.get_submitted_issues(user_id, hours=hours) if user_id else []
2001
+ submitted_recent = store.get_submitted_issues(user_id, hours=hours) if user_id else []
2002
+ return json.dumps({
2003
+ "conversation": {
2004
+ "messages_in_history": hist_len,
2005
+ "turns": hist_len // 2,
2006
+ "last_active": conv_updated or "no history",
2007
+ },
2008
+ "issues_you_submitted": {
2009
+ "total_in_window": len(submitted_recent),
2010
+ "all_time": len(store.get_submitted_issues(user_id) if user_id else []),
2011
+ "recent": [
2012
+ {"project": i["project"], "description": i["description"][:80],
2013
+ "submitted_at": i["submitted_at"]}
2014
+ for i in submitted_recent[:5]
2015
+ ],
2016
+ },
2017
+ "window_hours": hours,
2018
+ "errors_detected": len(errors),
2019
+ "fixes": {
2020
+ "applied": by_status.get("applied", 0),
2021
+ "pending_pr": len(prs),
2022
+ "failed": by_status.get("failed", 0),
2023
+ "skipped": by_status.get("skipped", 0),
2024
+ "error": by_status.get("error", 0),
2025
+ },
2026
+ "confirmed_in_prod": len(confirmed),
2027
+ "regressed_after_fix": len(regressed),
2028
+ "awaiting_confirmation": len(pending_conf),
2029
+ "open_prs": [
2030
+ {"repo": p["repo_name"], "pr_url": p["pr_url"], "timestamp": p["timestamp"]}
2031
+ for p in prs
2032
+ ],
2033
+ "top_errors": [
2034
+ {"message": e["message"][:100], "count": e["count"], "source": e["source"]}
2035
+ for e in errors[:5]
2036
+ ],
2037
+ })
2038
+ if name == "clear_my_history":
2039
+ if user_id:
2040
+ store.save_conversation(user_id, [])
2041
+ logger.info("Boss: cleared conversation history for user %s", user_id)
2042
+ return json.dumps({
2043
+ "status": "cleared",
2044
+ "note": "Your conversation history has been wiped. Next session starts fresh. [DONE]",
2045
+ })
2046
+ return json.dumps({"error": "cannot determine user — not clearing"})
2047
+
2048
+ # ── Admin-only tools ──────────────────────────────────────────────────────
2049
+ _ADMIN_TOOLS = {"list_all_users", "clear_user_history", "reset_fingerprint", "list_all_errors", "export_db"}
2050
+ if name in _ADMIN_TOOLS:
2051
+ if not is_admin:
2052
+ return json.dumps({"error": "Admin access required. You are not in SLACK_ADMIN_USERS."})
2053
+
2054
+ if name == "list_all_users":
2055
+ stats = store.get_all_user_stats()
2056
+ return json.dumps({"users": stats, "total": len(stats)})
2057
+
2058
+ if name == "clear_user_history":
2059
+ target = inputs.get("target_user_id", "").strip()
2060
+ if not target:
2061
+ return json.dumps({"error": "target_user_id is required"})
2062
+ store.save_conversation(target, [])
2063
+ display = store.get_user_name(target)
2064
+ logger.info("Boss admin: cleared history for user %s (%s) by admin %s", target, display, user_id)
2065
+ return json.dumps({"status": "cleared", "target_user_id": target, "display_name": display})
2066
+
2067
+ if name == "set_maintenance":
2068
+ repo_name = inputs.get("repo_name", "").strip()
2069
+ note = inputs.get("note", "").strip()
2070
+ if not repo_name:
2071
+ return json.dumps({"error": "repo_name is required"})
2072
+ store.set_health_state(repo_name, "confirmed", note=note)
2073
+ logger.info("Boss: maintenance confirmed for %s by %s (note: %s)", repo_name, user_id, note or "none")
2074
+ return json.dumps({
2075
+ "status": "confirmed",
2076
+ "repo": repo_name,
2077
+ "note": note or "none",
2078
+ "message": (
2079
+ f"Got it. I'll silently monitor {repo_name}'s health URL and "
2080
+ f"notify you as soon as it comes back online."
2081
+ ),
2082
+ })
2083
+
2084
+ if name == "reset_fingerprint":
2085
+ fp = inputs.get("fingerprint", "").strip()
2086
+ if not fp:
2087
+ return json.dumps({"error": "fingerprint is required"})
2088
+ found = store.reset_fingerprint(fp)
2089
+ logger.info("Boss admin: reset fingerprint %s by admin %s (found=%s)", fp, user_id, found)
2090
+ return json.dumps({"status": "reset" if found else "not_found", "fingerprint": fp,
2091
+ "note": "Sentinel will retry this error on the next poll." if found else "No fix record found for this fingerprint."})
2092
+
2093
+ if name == "list_all_errors":
2094
+ hours = int(inputs.get("hours", 0))
2095
+ errors = store.get_all_errors(hours)
2096
+ return json.dumps({"errors": errors[:100], "total": len(errors),
2097
+ "window_hours": hours or "all time"})
2098
+
2099
+ if name == "export_db":
2100
+ if not slack_client or not channel:
2101
+ return json.dumps({"error": "No Slack channel context — cannot upload file"})
2102
+ try:
2103
+ import sqlite3 as _sq
2104
+ import io as _io
2105
+ lines = []
2106
+ with _sq.connect(store.db_path) as _db:
2107
+ for tbl in ["errors", "fixes", "reports", "slack_users", "conversations", "submitted_issues"]:
2108
+ try:
2109
+ rows = _db.execute(f"SELECT * FROM {tbl}").fetchall() # noqa: S608
2110
+ cols = [d[0] for d in _db.execute(f"SELECT * FROM {tbl} LIMIT 0").description] # noqa: S608
2111
+ lines.append(f"=== {tbl} ({len(rows)} rows) ===")
2112
+ lines.append("\t".join(cols))
2113
+ for row in rows:
2114
+ lines.append("\t".join(str(v) if v is not None else "" for v in row))
2115
+ lines.append("")
2116
+ except Exception:
2117
+ lines.append(f"=== {tbl} (unavailable) ===\n")
2118
+ content = "\n".join(lines)
2119
+ await slack_client.files_upload_v2(
2120
+ channel=channel,
2121
+ content=content,
2122
+ filename="sentinel-db-export.tsv",
2123
+ title="Sentinel DB Export",
2124
+ )
2125
+ logger.info("Boss admin: exported DB (%d bytes) by admin %s", len(content), user_id)
2126
+ return json.dumps({"ok": True, "bytes": len(content)})
2127
+ except Exception as e:
2128
+ return json.dumps({"error": str(e)})
2129
+
2130
+ return json.dumps({"error": f"unknown tool: {name}"})
2131
+
2132
+
2133
+ # ── CLI fallback (OAuth / no API key) ────────────────────────────────────────
2134
+
2135
+ def _attachments_to_text(attachments: list[dict]) -> str:
2136
+ """Produce a plain-text summary of attachments to append to CLI prompts."""
2137
+ if not attachments:
2138
+ return ""
2139
+ parts = []
2140
+ for att in attachments:
2141
+ if att["type"] == "text":
2142
+ parts.append(
2143
+ f"[Attached file: {att['name']}]\n{att['content']}"
2144
+ )
2145
+ elif att["type"] == "image":
2146
+ parts.append(
2147
+ f"[Attached image: {att['name']}] (saved at {att['path']})"
2148
+ )
2149
+ else:
2150
+ parts.append(
2151
+ f"[Attached file: {att['name']}] (saved at {att['path']} — read it if relevant)"
2152
+ )
2153
+ return "\n\nATTACHMENTS:\n" + "\n---\n".join(parts)
2154
+
2155
+
2156
+ def _attachments_to_api_blocks(attachments: list[dict]) -> list[dict]:
2157
+ """Convert attachments into Anthropic API message content blocks."""
2158
+ blocks: list[dict] = []
2159
+ for att in attachments:
2160
+ if att["type"] == "image":
2161
+ blocks.append({
2162
+ "type": "image",
2163
+ "source": {
2164
+ "type": "base64",
2165
+ "media_type": att.get("mime", "image/png"),
2166
+ "data": att["content"],
2167
+ },
2168
+ })
2169
+ elif att["type"] == "text":
2170
+ blocks.append({
2171
+ "type": "text",
2172
+ "text": f"[Attached file: {att['name']}]\n{att['content']}",
2173
+ })
2174
+ else:
2175
+ blocks.append({
2176
+ "type": "text",
2177
+ "text": f"[Attached file: {att['name']}] saved at {att['path']}",
2178
+ })
2179
+ return blocks
2180
+
2181
+
2182
# Matches "ACTION: {...}" directives at the start of a line in CLI-mode model
# output; group(1) captures the JSON payload passed to _run_tool.
_ACTION_RE = re.compile(r"^ACTION:\s*(\{.*\})", re.MULTILINE)
2183
+
2184
+
2185
async def _handle_with_cli(
    message: str,
    history: list,
    cfg_loader,
    store,
    slack_client=None,
    user_name: str = "",
    user_id: str = "",
    attachments: list | None = None,
    is_admin: bool = False,
) -> tuple[str, bool]:
    """Fallback: use `claude --print` for users without an Anthropic API key.

    Builds one large prompt (system text + live status + history + the user
    message), shells out to the Claude Code CLI, executes any ``ACTION: {...}``
    directives found in the output via ``_run_tool``, and returns
    ``(reply_text, is_done)``. Mutates ``history`` in place on success.
    """
    # Pre-fetch baseline context the model always gets.
    status_json = await _run_tool("get_status", {"hours": 24}, cfg_loader, store)
    prs_json = await _run_tool("list_pending_prs", {}, cfg_loader, store)

    # Pre-fetch log search if the message is a search request.
    # Use quoted strings as the query, or fall back to the full message.
    # Never hardcode field names — the query is whatever the user said.
    search_json = ""
    _search_kws = ("search", "find", "look for", "show me log", "grep", "entries for")
    if any(kw in message.lower() for kw in _search_kws):
        quoted = re.findall(r'"([^"]+)"', message)
        query = quoted[0] if quoted else message
        search_json = await _run_tool("search_logs", {"query": query}, cfg_loader, store)

    # Presence of the SENTINEL_PAUSE marker file means polling is paused.
    paused = Path("SENTINEL_PAUSE").exists()
    repos = list(cfg_loader.repos.keys())
    log_sources = list(cfg_loader.log_sources.keys())
    ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")

    # Flatten the last 8 turns into plain text; content may be a string or a
    # list of API content blocks (dicts or SDK objects) — keep text blocks only.
    history_text = ""
    for msg in history[-8:]:
        role = msg["role"].upper()
        content = msg["content"]
        if isinstance(content, list):
            content = " ".join(
                (b.get("text", "") if isinstance(b, dict) else getattr(b, "text", ""))
                for b in content
                if (isinstance(b, dict) and b.get("type") == "text")
                or (hasattr(b, "type") and b.type == "text")
            )
        history_text += f"\n{role}: {content}"

    slack_mention = f"<@{user_id}>" if user_id else (user_name or "")
    known_users = store.get_all_users()
    users_hint = ", ".join(f"<@{uid}> = {name}" for uid, name in known_users.items())
    # Assemble the full one-shot prompt. The ACTION examples at the end teach
    # the model the directive format that _ACTION_RE parses below.
    prompt = (
        _SYSTEM
        + (f"\nYou are speaking with: {user_name} (Slack mention: {slack_mention})" if user_name else "")
        + "\nAlways start your reply by addressing the user directly using their Slack mention, e.g. \"<@U123> here is what I found...\"."
        + " Never use their plain name — always use the <@USER_ID> format so Slack highlights it."
        + (f"\nKnown Slack users: {users_hint}" if users_hint else "")
        + f"\n\nCurrent time: {ts}"
        + f"\nSentinel status: {'⏸ PAUSED' if paused else '▶ RUNNING'}"
        + f"\nManaged repos: {', '.join(repos) if repos else '(none configured)'}"
        + (f"\nLog sources: {', '.join(log_sources)}" if log_sources else "")
        + f"\nAdmin access for this user: {'YES — admin tools are available' if is_admin else 'NO — admin tools will be refused'}"
        + "\nNOTE: Running in CLI fallback mode — admin tools and some features are unavailable. Ask user to configure ANTHROPIC_API_KEY for full features."
        + f"\n\nCurrent status (last 24 h):\n{status_json}"
        + f"\n\nOpen PRs:\n{prs_json}"
        + (f"\n\nLog search results:\n{search_json}" if search_json else "")
        + (f"\n\nConversation so far:{history_text}" if history_text else "")
        + _attachments_to_text(attachments or [])
        + f"\n\nUSER: {message}"
        + "\n\nIf you need to take an action, include a line like:\n"
        + " ACTION: {\"action\": \"pause_sentinel\"}\n"
        + " ACTION: {\"action\": \"resume_sentinel\"}\n"
        + " ACTION: {\"action\": \"trigger_poll\"}\n"
        + " ACTION: {\"action\": \"create_issue\", \"description\": \"...\", \"target_repo\": \"\"}\n"
        + " ACTION: {\"action\": \"search_logs\", \"query\": \"<whatever the user asked to find>\"}\n"
        + "End with [DONE] if the request is fully handled."
    )

    cfg = cfg_loader.sentinel
    env = os.environ.copy()
    if cfg.anthropic_api_key:
        env["ANTHROPIC_API_KEY"] = cfg.anthropic_api_key

    try:
        # --dangerously-skip-permissions is refused by the CLI when running as
        # root, hence the getuid() branch.
        result = subprocess.run(
            ([cfg.claude_code_bin, "--dangerously-skip-permissions", "--print", prompt]
             if os.getuid() != 0 else
             [cfg.claude_code_bin, "--print", prompt]),
            capture_output=True, text=True, timeout=180, env=env,
        )
        output = (result.stdout or "").strip()
        if result.returncode != 0 or not output:
            stderr = (result.stderr or "").strip()
            logger.error(
                "Boss CLI call failed (rc=%d): stdout=%r stderr=%r",
                result.returncode, output[:200], stderr[:200],
            )
            raw_err = (result.stderr or "").strip()
            # Only bail out when BOTH the exit code is nonzero and there is no
            # stdout; a nonzero rc with usable output still proceeds below.
            if result.returncode != 0 and not output:
                full_err = f"exit {result.returncode}: {raw_err[:300]}"
                cfg = cfg_loader.sentinel
                alert_if_rate_limited(cfg.slack_bot_token, cfg.slack_channel,
                                      "sentinel_boss/cli", raw_err or full_err)
                return f":warning: `claude --print` failed ({full_err})", True
    except Exception as e:
        logger.error("Boss CLI call failed: %s", e)
        return f":warning: Boss unavailable: {e}", True

    # Execute every ACTION directive the model emitted; parse errors on one
    # directive do not stop the others.
    for m in _ACTION_RE.finditer(output):
        try:
            action = json.loads(m.group(1))
            name = action.pop("action", "")
            if name:
                result_str = await _run_tool(name, action, cfg_loader, store, user_id=user_id)
                logger.info("Boss CLI action: %s → %s", name, result_str[:80])
        except Exception as e:
            logger.warning("Boss action parse error: %s", e)

    # Strip directives and the [DONE] marker from the user-visible reply.
    reply = _ACTION_RE.sub("", output).strip()
    is_done = "[DONE]" in reply
    reply = reply.replace("[DONE]", "").strip()
    if not reply:
        greeting = f"Hi {user_name}! " if user_name else "Hi! "
        reply = f"{greeting}I'm Sentinel, your autonomous DevOps agent. How can I help you?"

    # Persist the exchange into the caller-owned history list.
    history.append({"role": "user", "content": message})
    history.append({"role": "assistant", "content": reply})
    return reply, is_done
2308
+
2309
+
2310
+ # ── History serialization helpers ────────────────────────────────────────────
2311
+
2312
+ def _serialize_content(content) -> list:
2313
+ """Convert Anthropic SDK response content (Pydantic objects) to plain dicts.
2314
+
2315
+ The SDK returns TextBlock / ToolUseBlock instances. json.dumps(..., default=str)
2316
+ turns them into useless strings like "TextBlock(type='text', text='...')".
2317
+ This converts them to proper dicts so history round-trips through SQLite safely.
2318
+ """
2319
+ if not isinstance(content, list):
2320
+ return content
2321
+ result = []
2322
+ for block in content:
2323
+ if isinstance(block, dict):
2324
+ result.append(block)
2325
+ elif hasattr(block, "model_dump"):
2326
+ result.append(block.model_dump())
2327
+ elif hasattr(block, "dict"):
2328
+ result.append(block.dict())
2329
+ elif hasattr(block, "type"):
2330
+ if block.type == "text":
2331
+ result.append({"type": "text", "text": getattr(block, "text", "")})
2332
+ elif block.type == "tool_use":
2333
+ result.append({
2334
+ "type": "tool_use",
2335
+ "id": getattr(block, "id", ""),
2336
+ "name": getattr(block, "name", ""),
2337
+ "input": getattr(block, "input", {}),
2338
+ })
2339
+ else:
2340
+ result.append({"type": "text", "text": str(block)})
2341
+ return result
2342
+
2343
+
2344
+ def _clean_history(history: list) -> list:
2345
+ """Remove turns that would cause a 400 from the Anthropic API.
2346
+
2347
+ Strips orphaned tool_use blocks (assistant turn with tool_use but no
2348
+ following tool_result turn) and consecutive same-role turns that result
2349
+ from a previous session that crashed mid-tool-loop.
2350
+ """
2351
+ cleaned = []
2352
+ i = 0
2353
+ while i < len(history):
2354
+ turn = history[i]
2355
+ role = turn.get("role", "")
2356
+ content = turn.get("content", [])
2357
+
2358
+ # Drop assistant turns that contain tool_use if the next turn isn't tool_result
2359
+ if role == "assistant" and isinstance(content, list):
2360
+ has_tool_use = any(
2361
+ (isinstance(b, dict) and b.get("type") == "tool_use")
2362
+ for b in content
2363
+ )
2364
+ if has_tool_use:
2365
+ next_turn = history[i + 1] if i + 1 < len(history) else None
2366
+ next_content = (next_turn or {}).get("content", [])
2367
+ has_result = isinstance(next_content, list) and any(
2368
+ (isinstance(b, dict) and b.get("type") == "tool_result")
2369
+ for b in next_content
2370
+ )
2371
+ if not has_result:
2372
+ i += 1 # skip orphaned tool_use turn
2373
+ continue
2374
+
2375
+ # Drop consecutive same-role turns (keep the last one)
2376
+ if cleaned and cleaned[-1].get("role") == role:
2377
+ cleaned[-1] = turn
2378
+ else:
2379
+ cleaned.append(turn)
2380
+ i += 1
2381
+ return cleaned
2382
+
2383
+
2384
+ # ── API-key path (structured tools, full agentic loop) ────────────────────────
2385
+
2386
async def _handle_with_api(
    message: str,
    history: list,
    cfg_loader,
    store,
    slack_client=None,
    user_name: str = "",
    user_id: str = "",
    attachments: list | None = None,
    channel: str = "",
    is_admin: bool = False,
) -> tuple[str, bool]:
    """
    Answer one user message via the Anthropic API with structured tool use.

    Runs a full agentic loop: Claude may emit tool_use blocks, each of which
    is executed through _run_tool and fed back as a tool_result, repeating
    until Claude replies with plain text.

    Args:
        message: The user's message text.
        history: Prior conversation turns; mutated in place (appended to)
            only after a successful API exchange.
        cfg_loader: Config accessor (provides .sentinel, .repos, .log_sources).
        store: Persistence layer; .get_all_users() supplies the Slack roster.
        slack_client: Optional Slack client passed through to tools.
        user_name / user_id: Identity of the Slack user being served.
        attachments: Optional Slack file attachments converted to API blocks.
        channel: Slack channel id, passed through to tools.
        is_admin: Gates admin-only tools inside _run_tool.

    Returns:
        (reply_text, is_done) — is_done=True means the session is complete.

    Raises:
        Propagates any anthropic API error to the caller (handle_message
        catches it and falls back to the CLI path).
    """
    # Imported lazily so the module loads even when the SDK is absent
    # (the caller only enters this path when an API key is configured).
    import anthropic

    api_key = cfg_loader.sentinel.anthropic_api_key or os.environ.get("ANTHROPIC_API_KEY", "")
    client = anthropic.Anthropic(api_key=api_key)

    # Gather live context for the system prompt: pause flag (sentinel file on
    # disk), managed repos, known projects/log sources, and the Slack roster.
    paused = Path("SENTINEL_PAUSE").exists()
    repos = list(cfg_loader.repos.keys())
    ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    known_projects = [_read_project_name(d) for d in _find_project_dirs()]
    log_sources = list(cfg_loader.log_sources.keys())
    slack_mention = f"<@{user_id}>" if user_id else (user_name or "")
    known_users = store.get_all_users()  # {user_id: display_name}
    users_hint = ", ".join(f"<@{uid}> = {name}" for uid, name in known_users.items())
    system = (
        _SYSTEM
        + (f"\nYou are speaking with: {user_name} (Slack mention: {slack_mention})" if user_name else "")
        + "\nAlways start your reply by addressing the user directly using their Slack mention, e.g. \"<@U123> here is what I found...\"."
        + " Never use their plain name — always use the <@USER_ID> format so Slack highlights it."
        + (f"\nKnown Slack users: {users_hint}" if users_hint else "")
        + f"\n\nCurrent time: {ts}"
        + f"\nSentinel status: {'⏸ PAUSED' if paused else '▶ RUNNING'}"
        + f"\nManaged repos: {', '.join(repos) if repos else '(none configured)'}"
        + (f"\nLog sources: {', '.join(log_sources)}" if log_sources else "")
        + (f"\nKnown projects in workspace: {', '.join(known_projects)}" if known_projects else "")
        + f"\nAdmin access for this user: {'YES — admin tools are available' if is_admin else 'NO — admin tools will be refused'}"
    )

    # Build user content — include attachment blocks if any
    attach_blocks = _attachments_to_api_blocks(attachments or [])
    if attach_blocks:
        user_content = attach_blocks + [{"type": "text", "text": message}]
    else:
        user_content = message

    # Work on a local copy — only commit to history on success to prevent
    # cascading 400s if the API rejects a malformed/corrupted history.
    messages = list(history) + [{"role": "user", "content": user_content}]

    # Agentic loop: keep calling the API until Claude stops requesting tools.
    while True:
        response = client.messages.create(
            model="claude-opus-4-6",
            max_tokens=2048,
            system=system,
            tools=_TOOLS,
            messages=messages,
        )

        # Split the response into text and tool_use blocks.
        text_parts = []
        tool_blocks = []
        for block in response.content:
            if block.type == "text":
                text_parts.append(block.text)
            elif block.type == "tool_use":
                tool_blocks.append(block)

        if not tool_blocks:
            # Final (text-only) turn: strip the [DONE] sentinel that signals
            # the session can release its Slack queue slot.
            reply = " ".join(text_parts).strip()
            is_done = "[DONE]" in reply
            reply = reply.replace("[DONE]", "").strip()
            if not reply:
                greeting = f"Hi {user_name}! " if user_name else "Hi! "
                reply = f"{greeting}I'm Sentinel, your autonomous DevOps agent. How can I help you?"
            # Heuristic override: if reply ends with a question, Claude is waiting for input
            if is_done and re.search(r'\?\s*$', reply):
                is_done = False
            # Commit to history only on success — serialize SDK objects to plain dicts
            history.append({"role": "user", "content": user_content})
            history.append({"role": "assistant", "content": _serialize_content(response.content)})
            return reply, is_done

        # Tool turn: record the assistant turn, run every requested tool,
        # and answer with matching tool_result blocks (paired by tool_use_id).
        messages.append({"role": "assistant", "content": _serialize_content(response.content)})
        tool_results = []
        for tc in tool_blocks:
            result = await _run_tool(tc.name, tc.input, cfg_loader, store, slack_client=slack_client, user_id=user_id, channel=channel, is_admin=is_admin)
            logger.info("Boss tool: %s(%s) → %s", tc.name, tc.input, result[:120])
            tool_results.append({
                "type": "tool_result",
                "tool_use_id": tc.id,
                "content": result,
            })
        messages.append({"role": "user", "content": tool_results})
2479
+
2480
+
2481
+ # ── Main entry point ──────────────────────────────────────────────────────────
2482
+
2483
async def handle_message(
    message: str,
    history: list,
    cfg_loader,
    store,
    slack_client=None,
    user_name: str = "",
    user_id: str = "",
    attachments: list | None = None,
    channel: str = "",
    is_admin: bool = False,
) -> tuple[str, bool]:
    """
    Process one user message through the Sentinel Boss (Claude with tool use).

    Priority (matches the code below):
      1. ANTHROPIC_API_KEY — structured tools, full agentic loop
      2. Claude Pro / OAuth via `claude --print` (CLI path — no API key needed)

    Args:
        message: The user's message text.
        history: Conversation history, passed through to the chosen backend.
        cfg_loader: Config accessor (provides .sentinel and friends).
        store: Persistence layer, passed through to the chosen backend.
        slack_client / user_name / user_id / attachments / channel / is_admin:
            Forwarded to the backend handlers.

    Returns:
        (reply_text, is_done)
        is_done=True  → session complete, release the Slack queue slot.
        is_done=False → waiting for user follow-up, keep the slot.
    """
    api_key = cfg_loader.sentinel.anthropic_api_key or os.environ.get("ANTHROPIC_API_KEY", "")

    # 1st priority: ANTHROPIC_API_KEY — full structured tools, cheap per-token for Boss queries
    if api_key:
        try:
            import anthropic  # noqa: F401
            return await _handle_with_api(
                message, history, cfg_loader, store, slack_client=slack_client,
                user_name=user_name, user_id=user_id, attachments=attachments, channel=channel,
                is_admin=is_admin,
            )
        except Exception as api_err:
            err_str = str(api_err)
            # Detect rate-limit / auth failure and alert Slack before falling through
            cfg = cfg_loader.sentinel
            if is_rate_limited(err_str):
                alert_if_rate_limited(cfg.slack_bot_token, cfg.slack_channel,
                                      "sentinel_boss/api", err_str)
            logger.warning("Boss: API key path failed (%s), trying CLI fallback", err_str)

    # 2nd priority: Claude Pro / OAuth via CLI (limited tools but no API key needed)
    cli_reply, cli_done = await _handle_with_cli(
        message, history, cfg_loader, store, slack_client=slack_client, user_name=user_name,
        user_id=user_id, attachments=attachments, is_admin=is_admin,
    )
    # CLI errors are surfaced as replies starting with ":warning:" — anything
    # else is a genuine answer and ends the fallback chain here.
    if not cli_reply.startswith(":warning:"):
        return cli_reply, cli_done

    # Both paths failed — alert Slack and return error
    cfg = cfg_loader.sentinel
    alert_if_rate_limited(cfg.slack_bot_token, cfg.slack_channel,
                          "sentinel_boss/cli", cli_reply)
    if not api_key:
        # No auth at all configured
        no_auth_msg = (
            ":warning: *Sentinel Boss — no Claude auth configured*\n"
            "Configure at least one of:\n"
            "• `ANTHROPIC_API_KEY` in `sentinel.properties` — full features\n"
            "• Claude Pro OAuth: run `claude login` on the server — required for fix_engine\n"
            "See: https://github.com/misterhuydo/Sentinel#authentication"
        )
        slack_alert(cfg.slack_bot_token, cfg.slack_channel, no_auth_msg)
        return ":warning: No Claude authentication configured. See Slack for details.", True
    return cli_reply, cli_done