switchroom 0.5.0 → 0.7.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. package/README.md +142 -121
  2. package/bin/autoaccept.exp +29 -6
  3. package/dist/agent-scheduler/index.js +12261 -0
  4. package/dist/cli/autoaccept-poll.js +10 -0
  5. package/dist/cli/switchroom.js +27250 -25324
  6. package/dist/vault/approvals/kernel-server.js +12709 -0
  7. package/dist/vault/broker/server.js +15724 -0
  8. package/package.json +4 -3
  9. package/profiles/_base/start.sh.hbs +133 -0
  10. package/profiles/_shared/telegram-style.md.hbs +3 -3
  11. package/profiles/default/CLAUDE.md +3 -3
  12. package/profiles/default/CLAUDE.md.hbs +2 -2
  13. package/profiles/default/workspace/CLAUDE.md.hbs +9 -0
  14. package/skills/docx/VENDORED.md +1 -1
  15. package/skills/mcp-builder/VENDORED.md +1 -1
  16. package/skills/pdf/VENDORED.md +1 -1
  17. package/skills/pptx/VENDORED.md +1 -1
  18. package/skills/skill-creator/VENDORED.md +1 -1
  19. package/skills/switchroom-architecture/SKILL.md +8 -7
  20. package/skills/switchroom-cli/SKILL.md +23 -15
  21. package/skills/switchroom-health/SKILL.md +7 -7
  22. package/skills/switchroom-install/SKILL.md +36 -39
  23. package/skills/switchroom-manage/SKILL.md +4 -4
  24. package/skills/switchroom-status/SKILL.md +1 -1
  25. package/skills/webapp-testing/VENDORED.md +1 -1
  26. package/skills/xlsx/VENDORED.md +1 -1
  27. package/telegram-plugin/admin-commands/dispatch.test.ts +119 -1
  28. package/telegram-plugin/admin-commands/index.ts +71 -0
  29. package/telegram-plugin/ask-user.ts +1 -0
  30. package/telegram-plugin/card-event-log.ts +138 -0
  31. package/telegram-plugin/dist/bridge/bridge.js +178 -31
  32. package/telegram-plugin/dist/foreman/foreman.js +6875 -6526
  33. package/telegram-plugin/dist/gateway/gateway.js +13862 -11834
  34. package/telegram-plugin/dist/server.js +202 -40
  35. package/telegram-plugin/fleet-state.ts +25 -10
  36. package/telegram-plugin/foreman/foreman.ts +38 -3
  37. package/telegram-plugin/gateway/approval-callback.ts +126 -0
  38. package/telegram-plugin/gateway/approval-card.test.ts +90 -0
  39. package/telegram-plugin/gateway/approval-card.ts +127 -0
  40. package/telegram-plugin/gateway/approvals-commands.ts +126 -0
  41. package/telegram-plugin/gateway/boot-card.ts +31 -6
  42. package/telegram-plugin/gateway/boot-probes.ts +503 -72
  43. package/telegram-plugin/gateway/gateway.ts +822 -94
  44. package/telegram-plugin/gateway/ipc-protocol.ts +34 -1
  45. package/telegram-plugin/gateway/ipc-server.ts +35 -0
  46. package/telegram-plugin/gateway/startup-mutex.ts +110 -2
  47. package/telegram-plugin/hooks/hooks.json +19 -0
  48. package/telegram-plugin/hooks/tool-label-pretool.mjs +216 -0
  49. package/telegram-plugin/hooks/tool-label-stop.mjs +63 -0
  50. package/telegram-plugin/package.json +4 -1
  51. package/telegram-plugin/plugin-logger.ts +20 -1
  52. package/telegram-plugin/progress-card-driver.ts +202 -13
  53. package/telegram-plugin/progress-card.ts +2 -2
  54. package/telegram-plugin/quota-check.ts +1 -0
  55. package/telegram-plugin/registry/subagents-schema.ts +37 -0
  56. package/telegram-plugin/registry/subagents.test.ts +64 -0
  57. package/telegram-plugin/session-tail.ts +58 -5
  58. package/telegram-plugin/shared/bot-runtime.ts +48 -2
  59. package/telegram-plugin/subagent-watcher.ts +139 -7
  60. package/telegram-plugin/tests/_progress-card-harness.ts +4 -0
  61. package/telegram-plugin/tests/bg-agent-progress-card-757.test.ts +201 -0
  62. package/telegram-plugin/tests/boot-card-probe-target.test.ts +10 -34
  63. package/telegram-plugin/tests/boot-card-render.test.ts +6 -5
  64. package/telegram-plugin/tests/boot-probes.test.ts +558 -0
  65. package/telegram-plugin/tests/card-event-log.test.ts +145 -0
  66. package/telegram-plugin/tests/gateway-startup-mutex.test.ts +102 -0
  67. package/telegram-plugin/tests/ipc-server-validate-inject-inbound.test.ts +134 -0
  68. package/telegram-plugin/tests/progress-card-delay-842.test.ts +160 -0
  69. package/telegram-plugin/tests/quota-check.test.ts +37 -1
  70. package/telegram-plugin/tests/subagent-registry-bugs.test.ts +5 -0
  71. package/telegram-plugin/tests/subagent-watcher-stall-notification.test.ts +104 -1
  72. package/telegram-plugin/tests/subagent-watcher.test.ts +5 -0
  73. package/telegram-plugin/tests/tool-label-sidecar.test.ts +114 -0
  74. package/telegram-plugin/tests/two-zone-bg-done-when-all-terminal.test.ts +5 -3
  75. package/telegram-plugin/tests/two-zone-card-header-phases.test.ts +10 -0
  76. package/telegram-plugin/tests/two-zone-snapshot-extras.test.ts +58 -14
  77. package/telegram-plugin/tests/welcome-text.test.ts +57 -0
  78. package/telegram-plugin/tool-label-sidecar.ts +140 -0
  79. package/telegram-plugin/tool-labels.ts +55 -0
  80. package/telegram-plugin/two-zone-card.ts +27 -7
  81. package/telegram-plugin/uat/SETUP.md +160 -0
  82. package/telegram-plugin/uat/assertions.ts +140 -0
  83. package/telegram-plugin/uat/driver.ts +174 -0
  84. package/telegram-plugin/uat/harness.ts +161 -0
  85. package/telegram-plugin/uat/login.ts +134 -0
  86. package/telegram-plugin/uat/port-allocator.ts +71 -0
  87. package/telegram-plugin/uat/scenarios/smoke-clerk-reply.test.ts +61 -0
  88. package/telegram-plugin/welcome-text.ts +44 -2
  89. package/bin/bridge-watchdog.sh +0 -967
@@ -1,967 +0,0 @@
1
- #!/usr/bin/env bash
2
- # Watchdog: restarts switchroom agent services whose Telegram bridge has
3
- # disconnected from the gateway, OR whose journal output has been silent
4
- # for too long (indicating an internally-frozen agent that systemd still
5
- # reports as "active (running)"). Designed to run on a systemd timer.
6
- #
7
- # For each agent, checks whether the gateway is up and has an active bridge.
8
- # If the gateway is healthy but the bridge is disconnected (or never connected),
9
- # restarts the agent service so Claude Code gets a fresh MCP server.
10
- #
11
- # Journal-silence check (2026-04-26, issue #116): Three klanker hangs in
12
- # 10 hours exposed a class of failure where the agent process is
13
- # "active (running)" to systemd but internally frozen — no journal output
14
- # for many minutes, manual restart the only recovery. Two hangs were on the
15
- # Stop-hook ladder ("running stop hooks 0/N"); one was mid-task at 1.0 GB
16
- # RSS. The watchdog now also checks journal-output freshness per-agent and
17
- # restarts via `switchroom agent restart <agent>` when an agent has been
18
- # silent for JOURNAL_SILENCE_SECS (default 600s) and has cleared the uptime
19
- # grace. Sustained suspicion via a state file under
20
- # /run/user/<uid>/switchroom-watchdog/ prevents transient quiet from
21
- # triggering.
22
- #
23
- # Agent discovery: enumerates ALL active switchroom-*-gateway.service units
24
- # and derives the agent name + gateway-log path from each. This replaces the
25
- # previous hardcoded (agent, log) list which rotted any time an agent was
26
- # renamed or added — e.g. on 2026-04-21 the old list still held "assistant"
27
- # (since renamed to "clerk") and silently skipped the new "lawgpt" agent
28
- # entirely, leaving both in a stale-bridge state for hours while klanker
29
- # (still on the list) kept getting healed.
30
- #
31
- # False-restart fix (2026-04-22): the bridge IPC flaps `registered ↔
32
- # disconnected` rapidly across Claude Code turn boundaries. The old
33
- # `tail -1` heuristic caught transient disconnect states and restarted
34
- # otherwise-healthy agents. On 2026-04-21 20:12–20:26 AEST this produced
35
- # 3 spurious restarts of klanker mid-CPU-heavy-work. The watchdog now
36
- # requires SUSTAINED disconnection (>= DISCONNECT_GRACE_SECS across
37
- # consecutive ticks) and an uptime grace (>= UPTIME_GRACE_SECS since
38
- # the agent service started) before acting.
39
-
40
- set -euo pipefail
41
-
42
- # Tunables. Expressed as env-overridable so the test harness can drive
43
- # edge cases without mutating the script.
44
- : "${UPTIME_GRACE_SECS:=90}" # skip checks for this long after agent (re)start
45
- : "${DISCONNECT_GRACE_SECS:=600}" # require disconnection to persist this long before restarting
46
- : "${LIVENESS_GRACE_SECS:=30}" # liveness file mtime must be recent before we treat bridge as dead
47
- # Journal-silence thresholds. Defaults raised from 600s to 4000s on
48
- # 2026-04-30 (issue #405). The previous 600s default opened a trap zone
49
- # where any agent whose latest journal entry sat between
50
- # JOURNAL_SILENCE_SECS (600s) and RECENT_ACTIVITY_WINDOW_SECS (3600s)
51
- # was eligible for restart. Normal chat-cadence agents (10–60 min between
52
- # user messages) land in that zone every cycle, producing ~208 false
53
- # restarts/24h on a typical host. With both defaults at 4000s (> the
54
- # 3600s recent-activity window), the trap zone closes: by the time
55
- # silence reaches 4000s, the latest entry is already past the
56
- # recent-activity gate and gets treated as idle. The hang detector is
57
- # effectively inert under defaults — operators who want it active must
58
- # opt in by lowering these values via env, and `Restart=on-failure` in
59
- # the unit file still catches actual crashes. See issue #405 for the
60
- # worked example showing the 21.5-min restart cadence the trap zone
61
- # produced.
62
- : "${JOURNAL_SILENCE_SECS:=4000}" # seconds of journal silence before suspecting a hang
63
- : "${JOURNAL_SILENCE_HARD_SECS:=4000}" # seconds the silence_since marker must predate before restarting
64
- # Recent-activity gate: only treat journal-silence as suspect-hang when the
65
- # agent had ANY log activity within this window. Distinguishes "hung mid-task"
66
- # (last log moments ago, then silence) from "genuinely idle" (no logs in
67
- # hours/days — agent waiting for the next user message). Default 1h: long
68
- # enough to span a normal session but short enough that a long overnight idle
69
- # doesn't get falsely flagged.
70
- : "${RECENT_ACTIVITY_WINDOW_SECS:=3600}"
71
- # Turn-active marker check (issue #412): the gateway writes a per-agent
72
- # `turn-active.json` at turn-start, touches its mtime on every tool_use,
73
- # and removes it on turn_complete. If the file exists AND its mtime
74
- # hasn't advanced in TURN_HANG_SECS, the agent is wedged mid-turn —
75
- # distinguishable from "legitimately idle" because legitimate idle
76
- # leaves no marker file at all. Default 5 min: bigger than the slowest
77
- # legitimate single-tool turn (a long Bash compile maybe) but tight
78
- # enough to catch Stop-hook deadlocks before the user notices.
79
- : "${TURN_HANG_SECS:=300}"
80
- # Forward-progress liveness window. The gateway only bumps
81
- # `turn-active.json` mtime on PARENT-stream tool_use events; when the
82
- # parent dispatches a Task() to a sub-agent, the marker goes stale
83
- # even while real work is happening. The bridge can also flap
84
- # (transient socket close, MCP plugin restart) while a sub-agent
85
- # keeps working. Before any restart path acts, probe the agent's
86
- # `.claude/projects/**/*.jsonl` AND `.claude/tasks/**/*.json` files:
87
- # if EITHER was modified within JSONL_LIVENESS_SECS, the agent is
88
- # making forward progress and the restart is a false positive.
89
- #
90
- # Two independent fingerprints means a wedged agent has to be silent
91
- # on BOTH to be killed — much stronger evidence than a single signal.
92
- # 60s matches the in-flight detector's "recent" semantics in
93
- # src/agents/in-flight.ts (30s window + 60s tick spread).
94
- #
95
- # (Name kept as JSONL_LIVENESS_SECS for back-compat with operators
96
- # who already set it via env; the value gates both fingerprints.)
97
- : "${JSONL_LIVENESS_SECS:=60}"
98
-
99
- # Per-agent watchdog state lives under /run/user/$UID/switchroom-watchdog/
100
- # (tmpfs, cleared on logout — correct: we don't want stale silence markers
101
- # surviving restarts). mkdir -p is idempotent.
102
- # WATCHDOG_STATE_DIR is env-overridable for the test harness.
103
- UID_VAL="${UID:-$(id -u)}"
104
- : "${WATCHDOG_STATE_DIR:=/run/user/${UID_VAL}/switchroom-watchdog}"
105
- mkdir -p "$WATCHDOG_STATE_DIR" 2>/dev/null || true
106
-
107
- now_epoch() { date +%s; }
108
-
109
- # Unified logging — every decision goes to journalctl with the
110
- # `switchroom-watchdog` tag AND to the unit's own stdout (which is
111
- # also captured by journal via StandardOutput=journal). Use level tags
112
- # (`detect`, `restart`, `skip`, `error`) so `journalctl -t
113
- # switchroom-watchdog | grep '\[restart\]'` is a clean audit trail of
114
- # every action this watchdog took.
115
- wd_log() {
116
- local level="$1"
117
- shift
118
- local msg="$*"
119
- logger -t switchroom-watchdog "[$level] $msg" 2>/dev/null || true
120
- # Stdout (not stderr) matches the prior `echo` lines so existing
121
- # systemd journal capture (StandardOutput=journal) and the test
122
- # harness that reads stdout from execFileSync both see the line.
123
- echo "$(date -Iseconds) watchdog [$level] $msg"
124
- }
125
-
126
- # Returns 0 (true) iff the agent shows ANY of two independent
127
- # forward-progress fingerprints within the last `$2` seconds:
128
- #
129
- # 1. `.claude/projects/**/*.jsonl` — Claude Code appends to these
130
- # transcripts on every event (model output, tool_use, sub-agent
131
- # activity). Fresh mtime ⇒ the model or a sub-agent is alive.
132
- # 2. `.claude/tasks/<session>/*.json` — TodoWrite / Task-tool state
133
- # files. Updated independently of the transcript stream when the
134
- # agent is iterating on a task list. Catches the case where the
135
- # transcript momentarily quiets (large model thinking pause)
136
- # while the agent is still progressing through todos.
137
- #
138
- # OR semantics: a wedged agent has to be silent on BOTH to be
139
- # declared dead. Two uncorrelated fingerprints make false positives
140
- # (kill while still working) much rarer than a single signal.
141
- #
142
- # `find -mmin` minimum granularity is minutes; round up to be
143
- # conservative (better to defer a restart by an extra minute than to
144
- # kill a live sub-agent). Both probes are bounded — quit on first
145
- # match — so this stays O(1)-ish even on busy projects.
146
- agent_has_recent_progress() {
147
- local agent_name="$1"
148
- local within_secs="$2"
149
- local agent_root="${HOME}/.switchroom/agents/${agent_name}/.claude"
150
- [[ -d "$agent_root" ]] || return 1
151
- local mmin=$(( (within_secs + 59) / 60 ))
152
- [[ "$mmin" -lt 1 ]] && mmin=1
153
-
154
- # Signal 1: transcript JSONL writes (parent or sub-agent).
155
- local hit
156
- hit=$(find "${agent_root}/projects" -name '*.jsonl' -mmin "-${mmin}" -print -quit 2>/dev/null)
157
- [[ -n "$hit" ]] && return 0
158
-
159
- # Signal 2: TodoWrite/Task state JSON updates.
160
- hit=$(find "${agent_root}/tasks" -name '*.json' -mmin "-${mmin}" -print -quit 2>/dev/null)
161
- [[ -n "$hit" ]] && return 0
162
-
163
- return 1
164
- }
165
-
166
- # ─── Forensic observation helpers ──────────────────────────────────────
167
- # Composed at every restart/detect/skip log line so `journalctl -t
168
- # switchroom-watchdog` carries enough context to reconstruct WHY any
169
- # action was taken without re-deriving from kernel/process state
170
- # after the fact (which is impossible — the process is gone after a
171
- # restart). Each helper is best-effort, returns a compact key=value
172
- # fragment, and never fails the script.
173
-
174
- # Resolve the most-interesting PID in the agent's systemd cgroup —
175
- # i.e., the actual `claude` process, not the start.sh / `script -qfc`
176
- # PTY wrappers that systemd reports as MainPID. Strategy:
177
- #
178
- # 1. Look up the unit's cgroup path via systemctl.
179
- # 2. Read `/sys/fs/cgroup/<cgroup>/cgroup.procs` for the full list.
180
- # 3. Pick the PID with the largest RSS — claude is reliably the
181
- # memory-heaviest member of the cgroup (start.sh: ~2MB, script:
182
- # ~1MB, claude: hundreds of MB to multiple GB).
183
- #
184
- # Falls back to MainPID if the cgroup walk fails (rare — only when
185
- # cgroup v2 isn't mounted at /sys/fs/cgroup or systemd reports an
186
- # unusual unit layout). Returns 0 when nothing resolvable.
187
- agent_main_pid() {
188
- local name="$1"
189
- local unit="switchroom-${name}.service"
190
- local cgroup
191
- cgroup=$(systemctl --user show "$unit" -p ControlGroup --value 2>/dev/null)
192
- if [[ -n "$cgroup" && -r "/sys/fs/cgroup${cgroup}/cgroup.procs" ]]; then
193
- # Pick the PID whose RSS (in KB) is largest. ps -o rss= prints
194
- # just the rss column; pair with -p PID-list to score them.
195
- local pids
196
- pids=$(tr '\n' ' ' < "/sys/fs/cgroup${cgroup}/cgroup.procs" 2>/dev/null)
197
- if [[ -n "$pids" ]]; then
198
- local heaviest
199
- heaviest=$(ps -o pid=,rss= -p $pids 2>/dev/null \
200
- | awk 'BEGIN{best_pid=0; best_rss=0} {if ($2+0 > best_rss) {best_rss=$2+0; best_pid=$1+0}} END{print best_pid}')
201
- if [[ "${heaviest:-0}" -gt 0 ]]; then
202
- echo "$heaviest"
203
- return 0
204
- fi
205
- fi
206
- fi
207
- systemctl --user show "$unit" -p MainPID --value 2>/dev/null || echo 0
208
- }
209
-
210
- # Process-state snapshot: state letter (R running, S sleeping, D
211
- # uninterruptible sleep — usually I/O wait or kernel stuck, Z zombie,
212
- # T stopped), CPU%, RSS in MB. State `D` for >30s is the smoking-gun
213
- # signature of a genuinely wedged process (the original #116 hangs).
214
- # Reads /proc/<pid>/stat for the state letter (field 3) and uses
215
- # `ps -o` for CPU/RSS — both cheap and free of GNU/BSD portability
216
- # pitfalls on Linux.
217
- agent_proc_snapshot() {
218
- local pid="$1"
219
- if [[ -z "$pid" || "$pid" == "0" ]]; then
220
- echo "pid=0 state=missing"
221
- return 0
222
- fi
223
- if [[ ! -r "/proc/${pid}/stat" ]]; then
224
- echo "pid=${pid} state=gone"
225
- return 0
226
- fi
227
- # /proc/<pid>/stat field 3 is the state letter. The comm field
228
- # (field 2) is parenthesized and may contain spaces — strip it
229
- # before splitting so awk indexing is reliable.
230
- local stat_state
231
- stat_state=$(awk '{
232
- line=$0;
233
- sub(/.*\) /, "", line);
234
- split(line, a, " ");
235
- print a[1];
236
- }' "/proc/${pid}/stat" 2>/dev/null || echo "?")
237
- local cpu rss
238
- read -r cpu rss < <(ps -o pcpu=,rss= -p "$pid" 2>/dev/null | awk '{print $1, $2}')
239
- cpu="${cpu:-?}"
240
- rss="${rss:-0}"
241
- local rss_mb=$(( rss / 1024 ))
242
- echo "pid=${pid} state=${stat_state} cpu=${cpu}% rss_mb=${rss_mb}"
243
- }
244
-
245
- # Per-fingerprint freshness summary. Reports the age (in seconds) of
246
- # the newest JSONL transcript and tasks-state file under the agent's
247
- # `.claude/` tree. A wedged process shows both ages climbing past the
248
- # threshold; a working sub-agent shows at least one stays small.
249
- agent_progress_snapshot() {
250
- local name="$1"
251
- local agent_root="${HOME}/.switchroom/agents/${name}/.claude"
252
- if [[ ! -d "$agent_root" ]]; then
253
- echo "jsonl_age=- tasks_age=-"
254
- return 0
255
- fi
256
- local now
257
- now=$(now_epoch)
258
- # Newest JSONL mtime (may be empty if no project history yet).
259
- local newest_jsonl_mtime
260
- newest_jsonl_mtime=$(find "${agent_root}/projects" -name '*.jsonl' \
261
- -printf '%T@\n' 2>/dev/null | awk 'BEGIN{m=0} {if ($1+0 > m) m=$1+0} END{print int(m)}')
262
- local newest_tasks_mtime
263
- newest_tasks_mtime=$(find "${agent_root}/tasks" -name '*.json' \
264
- -printf '%T@\n' 2>/dev/null | awk 'BEGIN{m=0} {if ($1+0 > m) m=$1+0} END{print int(m)}')
265
- local jsonl_age="-"
266
- local tasks_age="-"
267
- if [[ "${newest_jsonl_mtime:-0}" -gt 0 ]]; then
268
- jsonl_age=$(( now - newest_jsonl_mtime ))s
269
- fi
270
- if [[ "${newest_tasks_mtime:-0}" -gt 0 ]]; then
271
- tasks_age=$(( now - newest_tasks_mtime ))s
272
- fi
273
- echo "jsonl_age=${jsonl_age} tasks_age=${tasks_age}"
274
- }
275
-
276
- # Compose the full forensic observation line: process state + the
277
- # two progress fingerprints. Embedded in every action log message so
278
- # the journal entry is self-contained — operators don't need to
279
- # re-run probes after the fact (which would be useless if the
280
- # process has been restarted in the meantime).
281
- agent_observation() {
282
- local name="$1"
283
- local pid
284
- pid=$(agent_main_pid "$name")
285
- local proc progress
286
- proc=$(agent_proc_snapshot "$pid")
287
- progress=$(agent_progress_snapshot "$name")
288
- echo "${proc} ${progress}"
289
- }
290
-
291
- # Stamp a clean-shutdown.json marker into the agent's telegram state
292
- # dir BEFORE issuing a restart, so the next greeting card can render
293
- # "Restarted <reason>". Mirrors the inline jq/printf logic that lived
294
- # in the bridge-disconnect path; pulled into a function so every
295
- # restart path stamps consistently. Best-effort: never fails.
296
- stamp_restart_reason() {
297
- local marker="$1"
298
- local reason="$2"
299
- local ts_ms
300
- ts_ms=$(( $(date +%s) * 1000 ))
301
- local tmp="${marker}.tmp-$$"
302
- if command -v jq >/dev/null 2>&1; then
303
- jq -n --argjson ts "$ts_ms" --arg reason "$reason" \
304
- '{ts: $ts, signal: "SIGTERM", reason: $reason}' > "$tmp" 2>/dev/null \
305
- && mv -f "$tmp" "$marker" 2>/dev/null || rm -f "$tmp" 2>/dev/null || true
306
- else
307
- local esc_reason
308
- esc_reason=$(printf '%s' "$reason" | sed 's/\\/\\\\/g; s/"/\\"/g')
309
- printf '{"ts":%s,"signal":"SIGTERM","reason":"%s"}' "$ts_ms" "$esc_reason" > "$tmp" 2>/dev/null \
310
- && mv -f "$tmp" "$marker" 2>/dev/null || rm -f "$tmp" 2>/dev/null || true
311
- fi
312
- }
313
-
314
- # ─── Crash-time tmux pane capture (#725 PR-2) ──────────────────────────
315
- #
316
- # Snapshot the agent's tmux pane scrollback to
317
- # `<agentDir>/crash-reports/<ISO8601>-<reason>.txt` immediately
318
- # before a watchdog-triggered restart. Gives RCA tooling the live
319
- # screen state at the moment of the kill.
320
- #
321
- # Mirror of `src/agents/tmux.ts#captureAgentPane`. Same socket
322
- # convention (`switchroom-<agent>`), same target session
323
- # (`<agent>`), same output dir, same header. Keep the two paths in
324
- # sync — RCA tooling reads from one stream regardless of which
325
- # crash path produced the file.
326
- #
327
- # Best-effort: every step is `|| true`-ish so a missing socket /
328
- # tmux / write failure NEVER blocks the restart. Operator-initiated
329
- # restarts (`switchroom agent restart <agent>`) do NOT call this —
330
- # only watchdog-triggered restart paths do, since clean restarts
331
- # aren't crashes.
332
- #
333
- # Retention: 20 newest .txt files; size cap: 10MB per file
334
- # (post-header bytes; tmux history-limit is 100k lines so worst-case
335
- # ANSI-heavy panes can spike beyond that).
336
- capture_pane_before_restart() {
337
- local agent="$1"
338
- local reason="$2"
339
- local agent_dir="${HOME}/.switchroom/agents/${agent}"
340
- local socket="switchroom-${agent}"
341
- local out_dir="${agent_dir}/crash-reports"
342
- local ts
343
- ts="$(date -u +%Y-%m-%dT%H-%M-%SZ)"
344
- local out="${out_dir}/${ts}-${reason}.txt"
345
- mkdir -p "$out_dir" 2>/dev/null || true
346
- {
347
- printf '# agent: %s\n# reason: %s\n# captured-at: %s\n# tmux-socket: %s\n\n' \
348
- "$agent" "$reason" "$ts" "$socket"
349
- timeout 5 tmux -L "$socket" capture-pane -p -S - -t "$agent" 2>&1 \
350
- | head -c 10485760 \
351
- || echo "[capture-pane failed: $?]"
352
- } > "$out" 2>/dev/null || true
353
- # Retention: keep newest 20 .txt files in the dir.
354
- ls -1t "$out_dir"/*.txt 2>/dev/null | tail -n +21 | xargs -r rm -f 2>/dev/null || true
355
- }
356
-
357
- # ─── Restart rate cap ──────────────────────────────────────────────────
358
- #
359
- # Belt-and-suspenders for runaway restart loops (#550 follow-up). Even
360
- # with the in-flight detector + progress-fingerprint defences above,
361
- # there are pathological combinations (e.g. a stuck marker file the
362
- # sweep can't clear, a bridge that ESTAB-flaps once a minute) where
363
- # the watchdog could chew through Claude quota by restarting the same
364
- # agent N times an hour — every restart loads model context fresh.
365
- #
366
- # Rule: a single agent cannot be restarted by THIS watchdog more than
367
- # `MAX_RESTARTS_PER_WINDOW` times within `RESTART_RATE_WINDOW_SECS`.
368
- # When the cap trips, the restart is logged-and-skipped with a clear
369
- # `restart-rate-capped` reason so an operator can see the throttle
370
- # fired in `journalctl -t switchroom-watchdog | grep rate-capped`.
371
- #
372
- # The cap covers ALL three restart paths (bridge-disconnect, turn-hang,
373
- # journal-silence) plus the service-inactive heal — anything that
374
- # would cost a fresh `claude` startup. systemd's own
375
- # StartLimitBurst/IntervalSec is not enough on its own because each
376
- # `switchroom agent restart` resets that counter.
377
- #
378
- # State file: `${WATCHDOG_STATE_DIR}/${agent}.restarts` — newline-
379
- # separated epoch timestamps, trimmed to the window on every check.
380
- # tmpfs → cleared on logout, which is what we want (don't carry a
381
- # stale 30-min window across a reboot).
382
- : "${MAX_RESTARTS_PER_WINDOW:=5}"
383
- : "${RESTART_RATE_WINDOW_SECS:=1800}"
384
-
385
- # Returns 0 (allow) when the agent is under the cap. Returns 1 (block)
386
- # and emits a `[skip]` log line when the cap would be exceeded. Pure
387
- # read — does NOT record the restart; call restart_rate_record on the
388
- # allowed path.
389
- restart_rate_check() {
390
- local agent="$1"
391
- local reason_tag="$2"
392
- local rate_file="${WATCHDOG_STATE_DIR}/${agent}.restarts"
393
- [[ -f "$rate_file" ]] || return 0
394
- local now cutoff count=0
395
- now=$(now_epoch)
396
- cutoff=$(( now - RESTART_RATE_WINDOW_SECS ))
397
- while IFS= read -r ts; do
398
- [[ "$ts" =~ ^[0-9]+$ ]] || continue
399
- (( ts >= cutoff )) && count=$(( count + 1 ))
400
- done < "$rate_file"
401
- if (( count >= MAX_RESTARTS_PER_WINDOW )); then
402
- wd_log skip "agent=${agent} reason=${reason_tag} decision=restart-rate-capped recent=${count} max=${MAX_RESTARTS_PER_WINDOW} window=${RESTART_RATE_WINDOW_SECS}s (operator intervention required — investigate before clearing ${rate_file})"
403
- return 1
404
- fi
405
- return 0
406
- }
407
-
408
- # Append a restart timestamp and trim to the window. Best-effort I/O.
409
- restart_rate_record() {
410
- local agent="$1"
411
- local rate_file="${WATCHDOG_STATE_DIR}/${agent}.restarts"
412
- local now cutoff
413
- now=$(now_epoch)
414
- cutoff=$(( now - RESTART_RATE_WINDOW_SECS ))
415
- local tmp="${rate_file}.tmp-$$"
416
- {
417
- if [[ -f "$rate_file" ]]; then
418
- while IFS= read -r ts; do
419
- [[ "$ts" =~ ^[0-9]+$ ]] || continue
420
- (( ts >= cutoff )) && echo "$ts"
421
- done < "$rate_file"
422
- fi
423
- echo "$now"
424
- } > "$tmp" 2>/dev/null && mv -f "$tmp" "$rate_file" 2>/dev/null || rm -f "$tmp" 2>/dev/null || true
425
- }
426
-
427
- # Discover active gateway units. systemd's list-units output includes only
428
- # currently-loaded units; we filter to the switchroom-*-gateway.service
429
- # pattern and strip the prefix/suffix to get the agent name.
430
- mapfile -t gateway_services < <(
431
- systemctl --user list-units --type=service --state=active --no-legend --plain 2>/dev/null \
432
- | awk '{print $1}' \
433
- | grep -E '^switchroom-.+-gateway\.service$' || true
434
- )
435
-
436
- if [[ ${#gateway_services[@]} -eq 0 ]]; then
437
- # No active gateways — nothing to watch. Exit cleanly so the timer
438
- # keeps firing; transient absences (deploy windows) shouldn't error.
439
- exit 0
440
- fi
441
-
442
- for gateway_svc in "${gateway_services[@]}"; do
443
- # Extract agent name: switchroom-<agent>-gateway.service → <agent>
444
- agent="${gateway_svc#switchroom-}"
445
- agent="${agent%-gateway.service}"
446
- agent_svc="switchroom-${agent}.service"
447
-
448
- # Resolve the gateway's WorkingDirectory to locate its telegram state
449
- # dir. The gateway's gateway.log lives under WorkingDirectory/gateway.log
450
- # (the unit generator in src/agents/systemd.ts sets WorkingDirectory to
451
- # the agent's telegram/ subdir; see generateGatewayUnit).
452
- gateway_state_dir="$(
453
- systemctl --user show "$gateway_svc" -p WorkingDirectory --value 2>/dev/null
454
- )"
455
- if [[ -z "$gateway_state_dir" ]]; then
456
- wd_log error "agent=${agent} gateway has no WorkingDirectory; skipping"
457
- continue
458
- fi
459
- gateway_log="${gateway_state_dir}/gateway.log"
460
- # Sidecar file where we remember when the disconnected state started,
461
- # so we can detect SUSTAINED disconnection across ticks. Lives in the
462
- # same per-agent state dir so it's self-cleaning when an agent is
463
- # removed.
464
- disconnect_marker="${gateway_state_dir}/.watchdog-disconnect-since"
465
-
466
- if [[ ! -f "$gateway_log" ]]; then
467
- # Log file missing — gateway probably hasn't written a full turn yet.
468
- # Skip this tick; we'll try again in 60s.
469
- continue
470
- fi
471
-
472
- # If the agent service itself is inactive but the gateway is up,
473
- # treat that as a stale-bridge scenario too and restart it.
474
- #
475
- # Why: the agent service has `Restart=on-failure` in its unit (not
476
- # `Restart=always`) so a clean 0-exit of start.sh leaves it inactive.
477
- # That happens when Claude Code exits normally mid-session for any
478
- # reason (including external kill that start.sh handles gracefully).
479
- # Without this heal path the watchdog's earlier skip-if-inactive
480
- # guard left agents dead indefinitely.
481
- #
482
- # Production incident: 2026-04-22 ~03:44 AEST clerk's start.sh
483
- # exited with status=0/SUCCESS and the service went inactive. The
484
- # gateway stayed up; bridge was disconnected; systemd did nothing.
485
- if ! systemctl --user is-active --quiet "$agent_svc" 2>/dev/null; then
486
- # Also skip if the service is marked failed (start-limit-hit etc.)
487
- # — that needs operator intervention, not a restart loop.
488
- state="$(systemctl --user show "$agent_svc" -p ActiveState --value 2>/dev/null)"
489
- if [[ "$state" == "failed" ]]; then
490
- wd_log skip "agent=${agent} reason=service-failed decision=needs-operator-reset state=${state} $(agent_progress_snapshot "$agent") (unit in failed state; needs operator reset-failed)"
491
- continue
492
- fi
493
- if ! restart_rate_check "$agent" "service-inactive"; then
494
- continue
495
- fi
496
- wd_log restart "agent=${agent} reason=service-inactive state=${state} action=start $(agent_progress_snapshot "$agent") (agent service is inactive)"
497
- restart_rate_record "$agent"
498
- systemctl --user start "$agent_svc" || {
499
- wd_log error "agent=${agent} systemctl start failed"
500
- }
501
- continue
502
- fi
503
-
504
- # Uptime grace: freshly-started agents haven't had time to register
505
- # their bridge yet. systemctl emits ActiveEnterTimestamp in a format
506
- # like "Tue 2026-04-21 20:23:38 AEST"; ActiveEnterTimestampMonotonic
507
- # is easier to parse (microseconds since boot) but comparing to
508
- # wall-clock uptime is cross-platform-icky. We use the wall-clock
509
- # field and parse it with `date -d`, which systemd's format supports.
510
- active_enter_ts="$(
511
- systemctl --user show "$agent_svc" -p ActiveEnterTimestamp --value 2>/dev/null
512
- )"
513
- if [[ -n "$active_enter_ts" ]]; then
514
- # `date -d ""` fails; guard the empty case.
515
- active_enter_epoch="$(date -d "$active_enter_ts" +%s 2>/dev/null || echo 0)"
516
- if [[ "$active_enter_epoch" -gt 0 ]]; then
517
- uptime_secs=$(( $(now_epoch) - active_enter_epoch ))
518
- if [[ "$uptime_secs" -lt "$UPTIME_GRACE_SECS" ]]; then
519
- # Agent just started — give it time to come up. Clear any
520
- # stale disconnect marker from a previous cycle too, so the
521
- # grace window really is a clean slate.
522
- rm -f "$disconnect_marker" 2>/dev/null || true
523
- continue
524
- fi
525
- fi
526
- fi
527
-
528
- # Check the IPC socket for an actual ESTAB connection from the
529
- # agent's bridge. This is authoritative — if there's a live unix
530
- # socket, the bridge is connected right now. If not, it isn't.
531
- #
532
- # Why not just grep the gateway log: log grep used to be the check,
533
- # but it had a subtle bug. After a gateway restart, the log persists
534
- # across the restart (the gateway's `tee $LOG_PATH` appends). The
535
- # last "bridge registered" event might be from BEFORE the restart,
536
- # so `tail -1` reports it as healthy even though the agent hasn't
537
- # reconnected yet. Production incident 2026-04-22 ~07:20: clerk was
538
- # stuck with 0 IPC connections but watchdog said healthy because
539
- # the pre-restart "bridge registered" was the latest in the log.
540
- #
541
- # ss -x reads kernel-level socket state so it's immune to log
542
- # staleness. Unix sockets are visible without sudo for the owner.
543
- gateway_sock="${gateway_state_dir}/gateway.sock"
544
- if [[ ! -S "$gateway_sock" ]]; then
545
- # Socket file doesn't exist — gateway hasn't fully started or is
546
- # shutting down. Skip this tick; try again in 60s.
547
- continue
548
- fi
549
-
550
- ipc_estab_count=$(
551
- ss -x 2>/dev/null \
552
- | awk -v sock="$gateway_sock" '$1 == "u_str" && $2 == "ESTAB" && index($0, sock) { n++ } END { print n+0 }'
553
- )
554
-
555
- if (( ipc_estab_count > 0 )); then
556
- bridge_healthy=true
557
- else
558
- # ESTAB == 0: socket is disconnected. Before declaring the bridge dead,
559
- # check the liveness file the bridge writes on every heartbeat tick (~5s).
560
- # A recent mtime means the bridge process is alive but temporarily
561
- # reconnecting (e.g. after a gateway restart) — restarting the agent
562
- # here would be wasteful and would kill any in-flight Claude turn.
563
- liveness_file="${gateway_state_dir}/.bridge-alive"
564
- bridge_healthy=false
565
- if [[ -f "$liveness_file" ]]; then
566
- liveness_mtime=$(stat -c %Y "$liveness_file" 2>/dev/null || echo 0)
567
- liveness_age=$(( $(now_epoch) - liveness_mtime ))
568
- if (( liveness_age < LIVENESS_GRACE_SECS )); then
569
- bridge_healthy=true
570
- wd_log skip "agent=${agent} reason=bridge-socket-flap decision=liveness-file-fresh liveness_age=${liveness_age}s threshold=${LIVENESS_GRACE_SECS}s $(agent_observation "$agent") (liveness file is fresh)"
571
- fi
572
- fi
573
- fi
574
-
575
- if [[ "$bridge_healthy" == true ]]; then
576
- # Healthy — wipe the disconnect marker so the next disconnect
577
- # starts a fresh grace window.
578
- rm -f "$disconnect_marker" 2>/dev/null || true
579
- continue
580
- fi
581
-
582
- # Disconnected. Has it been sustained long enough to act?
583
- now="$(now_epoch)"
584
- if [[ -f "$disconnect_marker" ]]; then
585
- disc_since="$(cat "$disconnect_marker" 2>/dev/null || echo "$now")"
586
- # Paranoia: if the file got corrupted (non-numeric), treat as now.
587
- if ! [[ "$disc_since" =~ ^[0-9]+$ ]]; then
588
- disc_since="$now"
589
- echo "$now" > "$disconnect_marker"
590
- fi
591
- else
592
- # First observation of disconnect on this tick. Record it and wait.
593
- echo "$now" > "$disconnect_marker"
594
- disc_since="$now"
595
- fi
596
-
597
- disc_duration=$(( now - disc_since ))
598
- if [[ "$disc_duration" -lt "$DISCONNECT_GRACE_SECS" ]]; then
599
- # Transient flap — the bridge IPC disconnects across Claude Code
600
- # turn boundaries. Don't restart yet; give it another tick or two.
601
- continue
602
- fi
603
-
604
- # Progress gate — same defence as turn-hang/journal-silence. A
605
- # bridge can flap (MCP plugin crash, transient socket close)
606
- # while a sub-agent is still doing real work. Without this gate
607
- # the bridge-disconnect path would kill any in-flight sub-agent
608
- # whenever the bridge had a bad minute. Skip the restart if any
609
- # forward-progress fingerprint is fresh and just keep the
610
- # disconnect marker around — next tick will re-evaluate.
611
- observation=$(agent_observation "$agent")
612
- if agent_has_recent_progress "$agent" "$JSONL_LIVENESS_SECS"; then
613
- wd_log skip "agent=${agent} reason=bridge-disconnect disc_duration=${disc_duration}s threshold=${DISCONNECT_GRACE_SECS}s decision=defer-progress-fresh ${observation}"
614
- continue
615
- fi
616
-
617
- wd_log detect "agent=${agent} reason=bridge-disconnect disc_duration=${disc_duration}s threshold=${DISCONNECT_GRACE_SECS}s ${observation}"
618
- if ! restart_rate_check "$agent" "bridge-disconnect"; then
619
- continue
620
- fi
621
- wd_log restart "agent=${agent} reason=bridge-disconnect disc_duration=${disc_duration}s threshold=${DISCONNECT_GRACE_SECS}s ${observation}"
622
- capture_pane_before_restart "$agent" "bridge-disconnect"
623
- restart_rate_record "$agent"
624
- # Clear the marker so post-restart we don't immediately re-trip on
625
- # the still-old tail. The uptime grace will cover the startup window
626
- # anyway, but removing the marker keeps state clean.
627
- rm -f "$disconnect_marker" 2>/dev/null || true
628
- # Stamp WHY before killing so the next agent greeting card can show
629
- # "Restarted watchdog: bridge disconnected for ${disc_duration}s".
630
- # The gateway's own SIGTERM handler writes `clean-shutdown.json` on
631
- # shutdown too — but its marker carries no `reason`, so the greeting
632
- # omits the row.
633
- stamp_restart_reason \
634
- "${gateway_state_dir}/clean-shutdown.json" \
635
- "watchdog: bridge disconnected for ${disc_duration}s"
636
- # Route through `switchroom agent restart` (not raw systemctl) for
637
- # parity with the turn-hang and journal-silence paths: the CLI's
638
- # in-flight guard is one more belt-and-suspenders check, and config
639
- # reconciliation runs on every lifecycle transition per the project
640
- # contract. Falls back to systemctl if the CLI isn't on PATH.
641
- switchroom_cli=""
642
- for candidate in "${HOME}/.bun/bin/switchroom" "${HOME}/.local/bin/switchroom"; do
643
- if [[ -x "$candidate" ]]; then
644
- switchroom_cli="$candidate"
645
- break
646
- fi
647
- done
648
- if [[ -z "$switchroom_cli" ]] && command -v switchroom >/dev/null 2>&1; then
649
- switchroom_cli="$(command -v switchroom)"
650
- fi
651
- if [[ -n "$switchroom_cli" ]]; then
652
- "$switchroom_cli" agent restart "$agent" || {
653
- wd_log error "agent=${agent} switchroom agent restart failed; falling back to systemctl --user restart"
654
- systemctl --user restart "$agent_svc" || true
655
- }
656
- else
657
- wd_log error "agent=${agent} switchroom CLI not on PATH; using systemctl restart fallback"
658
- systemctl --user restart "$agent_svc" || true
659
- fi
660
- done
# ─── Auth refresh tick ───────────────────────────────────────────────────────
#
# Wire `switchroom auth refresh-tick` into every watchdog cycle (issue #429
# Phase 1). The command is idempotent and cheap when tokens are healthy, so
# it's safe to run once per watchdog tick (≈60s).
#
# Two independently-tunable knobs (both default to 600, but for different
# reasons — coincidence, not coupling):
#
# AUTH_REFRESH_INTERVAL_SECS — how often the watchdog runs the CLI at all.
#   Gated by a state-file timestamp; the CLI is skipped entirely until this
#   many seconds have passed since the last run. Default 600s (10 min).
#
# AUTH_REFRESH_THRESHOLD_MS — how close to expiry a token must be before
#   the CLI actually contacts the OAuth endpoint to refresh it. Passed as
#   --threshold-ms. Default 600000 ms (10 min). Operators who want earlier
#   proactive refreshes (e.g. 1800000 ms = 30 min) can raise this without
#   touching the run cadence, and vice-versa.
#
# Disabled by setting WATCHDOG_REFRESH_AUTH=0 (default on).
: "${WATCHDOG_REFRESH_AUTH:=1}"
: "${AUTH_REFRESH_INTERVAL_SECS:=600}"
: "${AUTH_REFRESH_THRESHOLD_MS:=600000}"

if [[ "${WATCHDOG_REFRESH_AUTH}" == "1" ]]; then
  auth_refresh_marker="${WATCHDOG_STATE_DIR}/.auth-refresh-last"
  last_refresh=0
  if [[ -f "$auth_refresh_marker" ]]; then
    last_refresh="$(cat "$auth_refresh_marker" 2>/dev/null || echo 0)"
    # Guard against a corrupted (non-numeric) marker file.
    [[ "$last_refresh" =~ ^[0-9]+$ ]] || last_refresh=0
  fi
  now_for_auth="$(now_epoch)"
  auth_age=$(( now_for_auth - last_refresh ))
  if [[ "$auth_age" -ge "$AUTH_REFRESH_INTERVAL_SECS" ]]; then
    # Resolve the switchroom CLI (same pattern as restart paths above).
    switchroom_cli_auth=""
    for candidate in "${HOME}/.bun/bin/switchroom" "${HOME}/.local/bin/switchroom"; do
      if [[ -x "$candidate" ]]; then
        switchroom_cli_auth="$candidate"
        break
      fi
    done
    if [[ -z "$switchroom_cli_auth" ]] && command -v switchroom >/dev/null 2>&1; then
      switchroom_cli_auth="$(command -v switchroom)"
    fi
    if [[ -n "$switchroom_cli_auth" ]]; then
      wd_log detect "auth-refresh age=${auth_age}s threshold=${AUTH_REFRESH_INTERVAL_SECS}s decision=run-refresh-tick"
      if "$switchroom_cli_auth" auth refresh-tick --threshold-ms "${AUTH_REFRESH_THRESHOLD_MS}" >/dev/null 2>&1; then
        # Only advance the marker on success so a failed tick retries
        # on the next interval check rather than waiting a full period.
        echo "$now_for_auth" > "$auth_refresh_marker"
        wd_log skip "auth-refresh decision=tick-complete threshold_ms=${AUTH_REFRESH_THRESHOLD_MS}"
      else
        wd_log error "auth-refresh switchroom auth refresh-tick exited non-zero (partial failures are logged by the CLI; state file not updated)"
      fi
    else
      wd_log error "auth-refresh switchroom CLI not on PATH; skipping refresh tick"
    fi
  fi
fi
# ─── Journal-silence check ───────────────────────────────────────────────────
#
# Independent of the bridge-disconnect check above. For each active
# switchroom-<agent>.service unit (NOT the gateway), verify that it has
# emitted at least one journal entry within JOURNAL_SILENCE_SECS. If an
# agent has been silent longer than that AND uptime has cleared
# UPTIME_GRACE_SECS, record a silence_since marker in the watchdog state
# dir. Once the marker is older than JOURNAL_SILENCE_HARD_SECS, restart
# via `switchroom agent restart <agent>` (the contracted reconcile+restart
# path; NOT raw systemctl restart, which would bypass switchroom's
# config reconciliation).
#
# Why `switchroom agent restart` rather than `systemctl --user restart`:
# the project contract is that all lifecycle transitions go through the
# switchroom CLI so that config reconciliation always runs. Raw systemctl
# calls skip that step and can leave units with stale unit files.

mapfile -t agent_services < <(
  systemctl --user list-units --type=service --state=active --no-legend --plain 2>/dev/null \
    | awk '{print $1}' \
    | grep -E '^switchroom-.+\.service$' \
    | grep -v -E '^switchroom-(gateway|vault-broker|foreman)\.service$' \
    | grep -v -E '^switchroom-.+-gateway\.service$' \
    | grep -v -E '^switchroom-.+-cron-[0-9]+\.service$' || true
)

for agent_svc in "${agent_services[@]}"; do
  # Extract agent name: switchroom-<agent>.service → <agent>
  agent="${agent_svc#switchroom-}"
  agent="${agent%.service}"

  silence_marker="${WATCHDOG_STATE_DIR}/${agent}.silence_since"

  # Uptime grace: same logic as the bridge check. Fresh agents haven't
  # had time to settle into a normal logging cadence.
  active_enter_ts="$(
    systemctl --user show "$agent_svc" -p ActiveEnterTimestamp --value 2>/dev/null
  )"
  if [[ -n "$active_enter_ts" ]]; then
    active_enter_epoch="$(date -d "$active_enter_ts" +%s 2>/dev/null || echo 0)"
    if [[ "$active_enter_epoch" -gt 0 ]]; then
      uptime_secs=$(( $(now_epoch) - active_enter_epoch ))
      if [[ "$uptime_secs" -lt "$UPTIME_GRACE_SECS" ]]; then
        # Clear stale silence marker on fresh start so the grace window
        # is a clean slate.
        rm -f "$silence_marker" 2>/dev/null || true
        continue
      fi
    fi
  fi

  # Issue #412: turn-active marker hang detector. The gateway writes
  # `<agentDir>/telegram/turn-active.json` at turn-start, bumps its
  # mtime on every tool_use, and removes it on turn_complete. If the
  # file is older than TURN_HANG_SECS, the agent is wedged mid-turn —
  # distinguishable from healthy idle because healthy idle leaves no
  # marker file at all. This closes the gap left when JOURNAL_SILENCE_SECS
  # was raised to 4000s (PR #410) to kill chat-cadence false positives.
  agent_state_dir="${HOME}/.switchroom/agents/${agent}/telegram"
  turn_active_file="${agent_state_dir}/turn-active.json"
  if [[ -f "$turn_active_file" ]]; then
    turn_mtime=$(stat -c %Y "$turn_active_file" 2>/dev/null || echo 0)
    if [[ "$turn_mtime" -gt 0 ]]; then
      turn_age=$(( $(now_epoch) - turn_mtime ))
      if [[ "$turn_age" -ge "$TURN_HANG_SECS" ]]; then
        # Progress gate — sub-agent activity does NOT bump the
        # parent's turn-active marker, so a stale marker plus fresh
        # JSONL writes means a sub-agent (or the main turn) is doing
        # real work and a restart would kill it mid-flight. This was
        # the dominant false-positive path observed in the journal
        # 2026-05-02 (finn/klanker restarted while sub-agents had
        # `last activity: 0s ago` per the in-flight detector).
        observation=$(agent_observation "$agent")
        if agent_has_recent_progress "$agent" "$JSONL_LIVENESS_SECS"; then
          wd_log skip "agent=${agent} reason=turn-hang turn_age=${turn_age}s threshold=${TURN_HANG_SECS}s decision=defer-progress-fresh ${observation}"
          continue
        fi
        wd_log detect "agent=${agent} reason=turn-hang turn_age=${turn_age}s threshold=${TURN_HANG_SECS}s ${observation} (no progress fingerprints within ${JSONL_LIVENESS_SECS}s — wedged mid-turn)"
        if ! restart_rate_check "$agent" "turn-hang"; then
          continue
        fi
        # Stamp the reason BEFORE the restart so the next greeting
        # card renders "Restarted watchdog: …".
        stamp_restart_reason \
          "${agent_state_dir}/clean-shutdown.json" \
          "watchdog: turn-active marker stale ${turn_age}s with no JSONL activity"
        wd_log restart "agent=${agent} reason=turn-hang turn_age=${turn_age}s threshold=${TURN_HANG_SECS}s ${observation}"
        capture_pane_before_restart "$agent" "turn-hang"
        restart_rate_record "$agent"
        # Resolve the switchroom CLI (same belt-and-suspenders as below)
        switchroom_cli=""
        for candidate in "${HOME}/.bun/bin/switchroom" "${HOME}/.local/bin/switchroom"; do
          if [[ -x "$candidate" ]]; then
            switchroom_cli="$candidate"
            break
          fi
        done
        if [[ -z "$switchroom_cli" ]] && command -v switchroom >/dev/null 2>&1; then
          switchroom_cli="$(command -v switchroom)"
        fi
        if [[ -n "$switchroom_cli" ]]; then
          "$switchroom_cli" agent restart "$agent" || {
            wd_log error "agent=${agent} switchroom agent restart failed; falling back to systemctl --user restart"
            systemctl --user restart "$agent_svc" || true
          }
        else
          wd_log error "agent=${agent} switchroom CLI not on PATH; using systemctl restart fallback"
          systemctl --user restart "$agent_svc" || true
        fi
        # Restarted — skip remaining checks for this agent this tick.
        continue
      fi
    fi
  fi

  # Read the timestamp of the most recent journal entry from this unit.
  # --output=short-unix gives "EPOCH.USEC MESSAGE" format; we grab the
  # leading integer epoch seconds.
  latest_journal_line="$(
    journalctl --user -u "$agent_svc" -n 1 --output=short-unix --no-pager 2>/dev/null || true
  )"
  latest_journal_epoch=0
  if [[ -n "$latest_journal_line" ]]; then
    # short-unix format: "1745632800.123456 hostname unit[pid]: message"
    # Extract the leading epoch via parameter expansion (first field,
    # integer part before the dot) — no subshell/fork per agent.
    candidate="${latest_journal_line%%[[:space:]]*}"
    candidate="${candidate%%.*}"
    if [[ "$candidate" =~ ^[0-9]+$ ]]; then
      latest_journal_epoch="$candidate"
    fi
  fi

  now="$(now_epoch)"
  if [[ "$latest_journal_epoch" -eq 0 ]]; then
    # No journal entries at all — possibly a very new unit that hasn't
    # logged yet. Treat conservatively: skip this tick (uptime grace
    # should have caught a genuine fresh start above, so this branch
    # mostly hits units that truly haven't logged due to a bug — still
    # give them one tick of benefit of the doubt).
    continue
  fi

  journal_age=$(( now - latest_journal_epoch ))

  if [[ "$journal_age" -lt "$JOURNAL_SILENCE_SECS" ]]; then
    # Journal is fresh — clear any stale silence marker and move on.
    rm -f "$silence_marker" 2>/dev/null || true
    continue
  fi

  # Recent-activity gate: only suspect a hang if the agent had log activity
  # within RECENT_ACTIVITY_WINDOW_SECS. A genuinely idle agent (e.g. a
  # personal agent that hasn't received a message in hours/days) has its
  # latest journal entry far in the past — restarting it would just churn
  # state for no reason. A hung agent, by contrast, was active before
  # freezing, so its most recent entry is recent (within the window).
  #
  # Implementation: if `journal_age >= RECENT_ACTIVITY_WINDOW_SECS`, the
  # latest entry is older than the window, so by definition there's no
  # activity inside it. Treat as idle — clear any stale marker and skip.
  if [[ "$journal_age" -ge "$RECENT_ACTIVITY_WINDOW_SECS" ]]; then
    rm -f "$silence_marker" 2>/dev/null || true
    continue
  fi

  # Journal has been silent for >= JOURNAL_SILENCE_SECS but the agent had
  # activity within RECENT_ACTIVITY_WINDOW_SECS. Record the first
  # observation so we can require sustained silence.
  if [[ -f "$silence_marker" ]]; then
    silence_since="$(cat "$silence_marker" 2>/dev/null || echo "$now")"
    if ! [[ "$silence_since" =~ ^[0-9]+$ ]]; then
      silence_since="$now"
      echo "$now" > "$silence_marker"
    fi
  else
    echo "$now" > "$silence_marker"
    silence_since="$now"
    wd_log detect "agent=${agent} reason=journal-silence journal_age=${journal_age}s threshold=${JOURNAL_SILENCE_SECS}s decision=record-silence-marker $(agent_observation "$agent") (will restart after ${JOURNAL_SILENCE_HARD_SECS}s of sustained silence)"
    continue
  fi

  silence_duration=$(( now - silence_since ))
  if [[ "$silence_duration" -lt "$JOURNAL_SILENCE_HARD_SECS" ]]; then
    # Silence not yet sustained long enough to act.
    continue
  fi

  # Progress gate — same defence as the turn-hang path. A silent
  # agent journal can co-exist with a busy sub-agent (the parent's
  # stdout goes quiet while the sub-agent runs). If JSONL or tasks
  # writes are happening, real work is in progress; don't restart.
  observation=$(agent_observation "$agent")
  if agent_has_recent_progress "$agent" "$JSONL_LIVENESS_SECS"; then
    wd_log skip "agent=${agent} reason=journal-silence journal_age=${journal_age}s silence_duration=${silence_duration}s threshold=${JOURNAL_SILENCE_HARD_SECS}s decision=defer-progress-fresh ${observation}"
    rm -f "$silence_marker" 2>/dev/null || true
    continue
  fi

  # The agent has been journal-silent for >= JOURNAL_SILENCE_HARD_SECS
  # AND has cleared the uptime grace AND has no progress fingerprints.
  # This matches the production hang pattern (issue #116). Restart
  # via the switchroom CLI.
  wd_log detect "agent=${agent} reason=journal-silence journal_age=${journal_age}s silence_duration=${silence_duration}s threshold=${JOURNAL_SILENCE_HARD_SECS}s ${observation} (no progress fingerprints — wedged)"
  if ! restart_rate_check "$agent" "journal-silence"; then
    continue
  fi
  agent_state_dir="${HOME}/.switchroom/agents/${agent}/telegram"
  stamp_restart_reason \
    "${agent_state_dir}/clean-shutdown.json" \
    "watchdog: journal silent for ${journal_age}s with no progress activity"
  wd_log restart "agent=${agent} reason=journal-silence journal_age=${journal_age}s silence_duration=${silence_duration}s threshold=${JOURNAL_SILENCE_HARD_SECS}s ${observation}"
  capture_pane_before_restart "$agent" "journal-silence"
  restart_rate_record "$agent"
  rm -f "$silence_marker" 2>/dev/null || true

  # Use `switchroom agent restart` (not raw systemctl) — the project
  # contract is that all agent lifecycle transitions go through the CLI
  # so config reconciliation always runs.
  #
  # Belt-and-suspenders CLI resolution (issue #406): the systemd .service
  # unit pins Environment=PATH=~/.bun/bin:..., but if a hand-installed
  # legacy unit is still on disk the PATH may be empty. Probe the two
  # known install locations directly before falling back to PATH lookup,
  # so a silent PATH gap can't silently downgrade us to the systemctl
  # fallback (which bypasses reconcile).
  switchroom_cli=""
  for candidate in "${HOME}/.bun/bin/switchroom" "${HOME}/.local/bin/switchroom"; do
    if [[ -x "$candidate" ]]; then
      switchroom_cli="$candidate"
      break
    fi
  done
  if [[ -z "$switchroom_cli" ]] && command -v switchroom >/dev/null 2>&1; then
    switchroom_cli="$(command -v switchroom)"
  fi

  if [[ -n "$switchroom_cli" ]]; then
    "$switchroom_cli" agent restart "$agent" || {
      wd_log error "agent=${agent} switchroom agent restart failed; falling back to systemctl --user restart"
      systemctl --user restart "$agent_svc" || true
    }
  else
    # Fallback: if the switchroom CLI isn't on PATH (unusual), use systemctl
    # directly and log the degraded path.
    wd_log error "agent=${agent} switchroom CLI not on PATH; using systemctl restart fallback"
    systemctl --user restart "$agent_svc" || true
  fi
done