claude-slack-channel-bots 0.7.2 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "claude-slack-channel-bots",
3
- "version": "0.7.2",
3
+ "version": "0.8.0",
4
4
  "description": "Multi-session Slack-to-Claude bridge — run multiple Claude Code bots across Slack channels via Socket Mode",
5
5
  "type": "module",
6
6
  "bin": {
@@ -31,7 +31,7 @@
31
31
  "@modelcontextprotocol/sdk": "^1.0.0",
32
32
  "@slack/socket-mode": "^2.0.0",
33
33
  "@slack/web-api": "^7.0.0",
34
- "agent-director": "^0.7.0",
34
+ "agent-director": "^0.7.8",
35
35
  "semver": "^7.6.0"
36
36
  },
37
37
  "devDependencies": {
@@ -11,6 +11,10 @@
11
11
  * - ErrSystemInstallNotFound (Client.create / resolveSystemBinary — no `agent-director` on PATH or at ~/.agent-director)
12
12
  * - ErrSystemInstallTooOld (Client.create / resolveSystemBinary — system binary older than required minimum)
13
13
  * - ErrSystemInstallUnreachable (Client.create / resolveSystemBinary — system binary present but not executable or fails --version)
14
+ * - ErrSystemInstallDisappeared (any verb / binary gone after valid construction — b.xht)
15
+ * - ErrTmuxNotAvailable (spawn / tmux binary not found or not executable)
16
+ * - ErrCwdNotFound (spawn / route cwd does not exist on disk)
17
+ * - ErrCwdNotADirectory (spawn / route cwd path exists but is not a directory)
14
18
  * - ErrInstanceIdCollision (spawn / SR-1.4 idempotency)
15
19
  * - ErrSpawnNotFound (get / status / decide on missing row)
16
20
  * - ErrNoSessionId (resume / SR-1.3 fallthrough)
@@ -41,6 +45,10 @@ export {
41
45
  ErrSystemInstallNotFound,
42
46
  ErrSystemInstallTooOld,
43
47
  ErrSystemInstallUnreachable,
48
+ ErrSystemInstallDisappeared,
49
+ ErrTmuxNotAvailable,
50
+ ErrCwdNotFound,
51
+ ErrCwdNotADirectory,
44
52
  ErrCallTimeout,
45
53
  ErrInstanceIdCollision,
46
54
  ErrSpawnNotFound,
@@ -8,6 +8,8 @@
8
8
  * SPDX-License-Identifier: MIT
9
9
  */
10
10
 
11
+ import { setOutageFlag, clearOutageFlag } from './outage-state.ts'
12
+
11
13
  // ---------------------------------------------------------------------------
12
14
  // Types
13
15
  // ---------------------------------------------------------------------------
@@ -15,7 +17,7 @@
15
17
  export interface HealthCheckDeps {
16
18
  isSessionAlive(channelId: string): Promise<boolean>
17
19
  isRestartPendingOrActive(channelId: string): boolean
18
- hasReachedMaxFailures(channelId: string): boolean
20
+ statRoute(cwd: string): Promise<boolean>
19
21
  scheduleRestart(channelId: string, cwd: string): void
20
22
  isShuttingDown(): boolean
21
23
  getRoutes(): Record<string, string>
@@ -27,6 +29,8 @@ export interface HealthCheckDeps {
27
29
 
28
30
  let deps: HealthCheckDeps | null = null
29
31
  let intervalId: ReturnType<typeof setInterval> | null = null
32
+ let tickInFlight = false
33
+ let skippedTicks = 0
30
34
 
31
35
  // ---------------------------------------------------------------------------
32
36
  // initHealthCheck
@@ -46,21 +50,45 @@ export function startHealthCheck(intervalSeconds: number): void {
46
50
  intervalId = setInterval(async () => {
47
51
  if (!deps) return
48
52
  if (deps.isShuttingDown()) return
49
-
50
- const routes = deps.getRoutes()
51
-
52
- for (const [channelId, cwd] of Object.entries(routes)) {
53
- try {
54
- if (deps.isRestartPendingOrActive(channelId)) continue
55
- if (deps.hasReachedMaxFailures(channelId)) continue
56
-
57
- const alive = await deps.isSessionAlive(channelId)
58
- if (!alive) {
59
- deps.scheduleRestart(channelId, cwd)
53
+ if (tickInFlight) {
54
+ skippedTicks++
55
+ // Fire exactly once when the streak crosses 5 (4 → 5 transition). At the
56
+ // 120 s interval, five consecutive skips represent ~10 minutes of
57
+ // tick-budget exhaustion — long enough to indicate genuine health-check
58
+ // wedging (e.g., a persistently hung statRoute or isSessionAliveAdapter)
59
+ // rather than transient slowness. Further skips within the same streak
60
+ // are silent; the next successfully-started tick body resets the counter
61
+ // and re-arms the warning for a future streak.
62
+ if (skippedTicks === 5) {
63
+ console.error('[slack] health-check: tick body in flight; skipped 5 consecutive ticks — investigate budget exhaustion')
64
+ }
65
+ return
66
+ }
67
+ tickInFlight = true
68
+ skippedTicks = 0
69
+ try {
70
+ const routes = deps.getRoutes()
71
+
72
+ for (const [channelId, cwd] of Object.entries(routes)) {
73
+ try {
74
+ if (deps.isRestartPendingOrActive(channelId)) continue
75
+
76
+ if (await deps.statRoute(cwd)) {
77
+ clearOutageFlag(channelId, 'cwd-unreachable')
78
+ } else {
79
+ setOutageFlag(channelId, 'cwd-unreachable', cwd)
80
+ }
81
+
82
+ const alive = await deps.isSessionAlive(channelId)
83
+ if (!alive) {
84
+ deps.scheduleRestart(channelId, cwd)
85
+ }
86
+ } catch (err) {
87
+ console.error(`[slack] health-check: error checking channel=${channelId}:`, err)
60
88
  }
61
- } catch (err) {
62
- console.error(`[slack] health-check: error checking channel=${channelId}:`, err)
63
89
  }
90
+ } finally {
91
+ tickInFlight = false
64
92
  }
65
93
  }, intervalSeconds * 1000)
66
94
  }
@@ -86,4 +114,6 @@ export function _resetHealthCheckState(): void {
86
114
  intervalId = null
87
115
  }
88
116
  deps = null
117
+ tickInFlight = false
118
+ skippedTicks = 0
89
119
  }
@@ -0,0 +1,287 @@
1
+ /**
2
+ * outage-state.ts — Channel-scoped outage-flag state machine + Slack emit surface.
3
+ *
4
+ * Tracks three orthogonal outage classes per routed Slack channel and emits
5
+ * onset / all-clear messages via the injected `postToChannel` hook. The module
6
+ * is intentionally free of Date / timestamp logic — operators scroll back to
7
+ * the onset message for timing context.
8
+ *
9
+ * Public API surface (all exported):
10
+ * - initOutageState(deps) — install production dependencies
11
+ * - getOutageFlags(channelId) — read live flag set
12
+ * - setOutageFlag(channelId, cls, detail?) — raise flag + emit onset message
13
+ * - clearOutageFlag(channelId, cls) — lower flag; emits all-clear when set empties
14
+ * - resetAllToHealthy(channelIds) — silent bulk wipe (boot-time reset)
15
+ * - withOutageDetection(ch, cwd, fn) — AD verb wrapper; raises/clears flags on error/success
16
+ * - withSpawnDetection(ch, cwd, fn) — like withOutageDetection + clears cwd-unreachable on success
17
+ * - _resetOutageState() — test-only state reset
18
+ *
19
+ * Template exports (used by tests):
20
+ * - ONSET_TEMPLATES
21
+ * - ALL_CLEAR_TEMPLATE
22
+ *
23
+ * SPDX-License-Identifier: MIT
24
+ */
25
+
26
+ import type { Client } from 'agent-director'
27
+ import {
28
+ ErrSystemInstallDisappeared,
29
+ ErrTmuxNotAvailable,
30
+ ErrCwdNotFound,
31
+ ErrCwdNotADirectory,
32
+ } from './agent-director-errors.ts'
33
+
34
+ // ---------------------------------------------------------------------------
35
+ // Data types
36
+ // ---------------------------------------------------------------------------
37
+
38
+ /** Union of all outage classifications. No 'healthy' member — absence == healthy. */
39
+ export type OutageClass = 'ad-unreachable' | 'cwd-unreachable' | 'tmux-unavailable'
40
+
41
+ /**
42
+ * Detail record for a single outage class in a bad stretch.
43
+ * No `enteredAtIso` — timestamps are intentionally absent from the all-clear template.
44
+ */
45
+ export interface ClassRecord {
46
+ detail?: string
47
+ }
48
+
49
+ /** Per-channel state entry. */
50
+ interface ChannelEntry {
51
+ /** Currently active outage flags. */
52
+ flags: Set<OutageClass>
53
+ /** Class → detail record for the current bad stretch. Reset to empty Map on all-clear. */
54
+ badStretchClasses: Map<OutageClass, ClassRecord>
55
+ }
56
+
57
+ /** Dependencies injected via `initOutageState` — wires the module to production Slack + AD. */
58
+ export interface OutageStateDeps {
59
+ /** Fire-and-forget Slack post; errors MUST be handled internally by the caller. */
60
+ postToChannel(channelId: string, text: string): void
61
+ /** Return the singleton AD Client. Same semantics as getClient() in agent-director-client.ts. */
62
+ getClient(): Client
63
+ }
64
+
65
+ // ---------------------------------------------------------------------------
66
+ // Module-scoped state
67
+ // ---------------------------------------------------------------------------
68
+
69
+ let deps: OutageStateDeps | undefined
70
+ const entries = new Map<string, ChannelEntry>()
71
+
72
+ // ---------------------------------------------------------------------------
73
+ // Helpers
74
+ // ---------------------------------------------------------------------------
75
+
76
+ /** Lazy-create a ChannelEntry for `channelId` on first access. */
77
+ function entryFor(channelId: string): ChannelEntry {
78
+ let entry = entries.get(channelId)
79
+ if (!entry) {
80
+ entry = { flags: new Set(), badStretchClasses: new Map() }
81
+ entries.set(channelId, entry)
82
+ }
83
+ return entry
84
+ }
85
+
86
+ // ---------------------------------------------------------------------------
87
+ // Slack message templates
88
+ // ---------------------------------------------------------------------------
89
+
90
+ /** Stable iteration order for the all-clear template. */
91
+ const STABLE_CLASS_ORDER: OutageClass[] = [
92
+ 'ad-unreachable',
93
+ 'cwd-unreachable',
94
+ 'tmux-unavailable',
95
+ ]
96
+
97
+ /**
98
+ * ONSET_TEMPLATES — one template function per outage class.
99
+ * The optional `detail` parameter carries class-specific context
100
+ * (binary path for ad-unreachable; cwd path for cwd-unreachable).
101
+ */
102
+ export const ONSET_TEMPLATES: Record<OutageClass, (detail?: string) => string> = {
103
+ 'ad-unreachable': (binaryPath?: string) =>
104
+ `:rotating_light: *agent-director unreachable* — affects every routed channel.\nBinary: \`${binaryPath ?? '<unknown>'}\`\nRemediation: reinstall agent-director.`,
105
+
106
+ 'tmux-unavailable': (_detail?: string) =>
107
+ `:rotating_light: *tmux unavailable* — affects every routed channel.\nRemediation: install or repair tmux.`,
108
+
109
+ 'cwd-unreachable': (cwd?: string) =>
110
+ `:rotating_light: *Route cwd unreachable* — \`${cwd ?? '<unknown>'}\`\nRemediation: restore the directory or remove this route from \`config.json\`.`,
111
+ }
112
+
113
+ /**
114
+ * ALL_CLEAR_TEMPLATE — renders a Slack all-clear message from the bad-stretch
115
+ * history snapshot. Entries are emitted in the stable class order regardless
116
+ * of the order flags were raised. No timestamps.
117
+ */
118
+ export function ALL_CLEAR_TEMPLATE(resolved: Map<OutageClass, ClassRecord>): string {
119
+ const parts: string[] = []
120
+ for (const cls of STABLE_CLASS_ORDER) {
121
+ const rec = resolved.get(cls)
122
+ if (rec === undefined) continue
123
+ const detailSuffix = rec.detail !== undefined ? ` (\`${rec.detail}\`)` : ''
124
+ parts.push(`\`${cls}\`${detailSuffix}`)
125
+ }
126
+ return `:white_check_mark: *All clear.* Resolved: ${parts.join(', ')}.`
127
+ }
128
+
129
+ // ---------------------------------------------------------------------------
130
+ // Public API — init + accessors
131
+ // ---------------------------------------------------------------------------
132
+
133
+ /**
134
+ * initOutageState — installs production dependencies. Called once from
135
+ * `src/server.ts:main()` after `routingConfig` is loaded, before the
136
+ * Socket Mode connect block.
137
+ */
138
+ export function initOutageState(d: OutageStateDeps): void {
139
+ deps = d
140
+ }
141
+
142
+ /**
143
+ * getOutageFlags — returns the live read-only flag set for `channelId`.
144
+ * Returns an empty `ReadonlySet` sentinel when no entry exists yet.
145
+ */
146
+ export function getOutageFlags(channelId: string): ReadonlySet<OutageClass> {
147
+ return entries.get(channelId)?.flags ?? (new Set<OutageClass>() as ReadonlySet<OutageClass>)
148
+ }
149
+
150
+ // ---------------------------------------------------------------------------
151
+ // Public API — mutators
152
+ // ---------------------------------------------------------------------------
153
+
154
+ /**
155
+ * setOutageFlag — raises `cls` for `channelId` and emits an onset Slack
156
+ * message. Same-flag re-raise is a silent no-op (dedupe).
157
+ *
158
+ * State mutates BEFORE the emit so a synchronous throw in `postToChannel`
159
+ * cannot cause double-emission on the next observation.
160
+ */
161
+ export function setOutageFlag(channelId: string, cls: OutageClass, detail?: string): void {
162
+ if (!deps) return
163
+ const entry = entryFor(channelId)
164
+ if (entry.flags.has(cls)) return // same-flag dedupe
165
+ // Mutate state BEFORE emit (SR-V-2.x state-before-emit contract).
166
+ entry.flags.add(cls)
167
+ entry.badStretchClasses.set(cls, { detail })
168
+ deps.postToChannel(channelId, ONSET_TEMPLATES[cls](detail))
169
+ }
170
+
171
+ /**
172
+ * clearOutageFlag — lowers `cls` for `channelId`. Emits the all-clear Slack
173
+ * message ONLY when the clear leaves the flag set empty and there is a
174
+ * non-empty bad-stretch history (i.e., at least one onset was recorded).
175
+ * Intermediate clears (flag set still non-empty after removal) are silent.
176
+ *
177
+ * State mutates BEFORE the emit (same contract as setOutageFlag).
178
+ */
179
+ export function clearOutageFlag(channelId: string, cls: OutageClass): void {
180
+ if (!deps) return
181
+ const entry = entries.get(channelId)
182
+ if (!entry) return
183
+ if (!entry.flags.has(cls)) return // same-state dedupe
184
+ // Mutate state BEFORE emit.
185
+ entry.flags.delete(cls)
186
+ if (entry.flags.size === 0 && entry.badStretchClasses.size > 0) {
187
+ // Snapshot history and reset BEFORE the postToChannel call.
188
+ const snapshot = new Map(entry.badStretchClasses)
189
+ entry.badStretchClasses = new Map()
190
+ deps.postToChannel(channelId, ALL_CLEAR_TEMPLATE(snapshot))
191
+ }
192
+ }
193
+
194
+ /**
195
+ * resetAllToHealthy — silently wipes every channel's flag set and bad-stretch
196
+ * history to a clean slate. No `postToChannel` calls. Used at boot time
197
+ * after `socket.start()` as a defensive boundary for pre-auth observations
198
+ * (boot-time call is added by Epic 2).
199
+ */
200
+ export function resetAllToHealthy(channelIds: string[]): void {
201
+ for (const channelId of channelIds) {
202
+ entries.set(channelId, { flags: new Set(), badStretchClasses: new Map() })
203
+ }
204
+ }
205
+
206
+ // ---------------------------------------------------------------------------
207
+ // Public API — AD verb wrappers
208
+ // ---------------------------------------------------------------------------
209
+
210
+ /**
211
+ * withOutageDetection — centralized wrapper for AD verb calls that should
212
+ * participate in outage detection.
213
+ *
214
+ * On error:
215
+ * - ErrSystemInstallDisappeared → raises 'ad-unreachable' (detail = binaryPath)
216
+ * - ErrTmuxNotAvailable → raises 'tmux-unavailable'
217
+ * - ErrCwdNotFound / ErrCwdNotADirectory → raises 'cwd-unreachable' (detail = routeCwd)
218
+ * UNLESS routeCwd is undefined, in which case logs loudly and rethrows
219
+ * WITHOUT raising the flag (defensive carve-out for verb-class drift).
220
+ * - Other errors → no flag change; rethrow unchanged.
221
+ *
222
+ * On success: clears 'ad-unreachable' and 'tmux-unavailable', returns result.
223
+ *
224
+ * The original error is always rethrown so callers can handle it normally.
225
+ */
226
+ export async function withOutageDetection<T>(
227
+ channelId: string,
228
+ routeCwd: string | undefined,
229
+ fn: (client: Client) => Promise<T>,
230
+ ): Promise<T> {
231
+ if (!deps) {
232
+ throw new Error(
233
+ 'outage-state: withOutageDetection called before initOutageState — caller-site bug',
234
+ )
235
+ }
236
+ try {
237
+ const result = await fn(deps.getClient())
238
+ clearOutageFlag(channelId, 'ad-unreachable')
239
+ clearOutageFlag(channelId, 'tmux-unavailable')
240
+ return result
241
+ } catch (err) {
242
+ if (err instanceof ErrSystemInstallDisappeared) {
243
+ setOutageFlag(channelId, 'ad-unreachable', err.binaryPath)
244
+ } else if (err instanceof ErrTmuxNotAvailable) {
245
+ setOutageFlag(channelId, 'tmux-unavailable')
246
+ } else if (err instanceof ErrCwdNotFound || err instanceof ErrCwdNotADirectory) {
247
+ if (routeCwd !== undefined) {
248
+ setOutageFlag(channelId, 'cwd-unreachable', routeCwd)
249
+ } else {
250
+ console.error(
251
+ `[slack] outage-state: withOutageDetection: cwd error on channel=${channelId} but routeCwd is undefined — verb-class drift; rethrowing without raising flag`,
252
+ err,
253
+ )
254
+ }
255
+ }
256
+ throw err
257
+ }
258
+ }
259
+
260
+ /**
261
+ * withSpawnDetection — like `withOutageDetection` but also clears
262
+ * 'cwd-unreachable' on success. Spawn and resume verbs are the only calls
263
+ * that actually exercise the route's cwd, so cwd health is only confirmed
264
+ * by a successful spawn/resume.
265
+ */
266
+ export async function withSpawnDetection<T>(
267
+ channelId: string,
268
+ routeCwd: string | undefined,
269
+ fn: (client: Client) => Promise<T>,
270
+ ): Promise<T> {
271
+ const result = await withOutageDetection(channelId, routeCwd, fn)
272
+ clearOutageFlag(channelId, 'cwd-unreachable')
273
+ return result
274
+ }
275
+
276
+ // ---------------------------------------------------------------------------
277
+ // Test-only
278
+ // ---------------------------------------------------------------------------
279
+
280
+ /**
281
+ * _resetOutageState — clears all module-scoped state. For tests only.
282
+ * Production code must not call this.
283
+ */
284
+ export function _resetOutageState(): void {
285
+ deps = undefined
286
+ entries.clear()
287
+ }
@@ -16,11 +16,11 @@
16
16
  * SPDX-License-Identifier: MIT
17
17
  */
18
18
 
19
- import { AgentDirectorError, ErrAlreadyDecided } from 'agent-director'
20
- import type { Client } from 'agent-director'
19
+ import { AgentDirectorError, ErrAlreadyDecided, ErrSystemInstallDisappeared, ErrTmuxNotAvailable } from 'agent-director'
21
20
  import type { WebClient } from '@slack/web-api'
22
21
 
23
22
  import { decideWithToken } from './agent-director-client.ts'
23
+ import { withOutageDetection } from './outage-state.ts'
24
24
  import { parsePermissionActionId, type PermissionDecision } from './permission-action-id.ts'
25
25
  import { getLivePermission, markHandled } from './permission-poller.ts'
26
26
  import { emitTrail as defaultEmitTrail } from './permission-trail.ts'
@@ -32,8 +32,6 @@ import type {
32
32
  } from './permission-trail.ts'
33
33
 
34
34
  export interface ClickDeps {
35
- /** Returns an AD Client whose `decide` method this handler will invoke. */
36
- getClient: () => Pick<Client, 'decide'>
37
35
  web: Pick<WebClient, 'chat'>
38
36
  log?: (...args: unknown[]) => void
39
37
  /**
@@ -141,6 +139,21 @@ export async function handlePermissionClick(
141
139
  live_pending: earlyEntry !== undefined,
142
140
  })
143
141
 
142
+ const ctxChannel = context.channel ?? earlyEntry?.channelId
143
+ if (!ctxChannel) {
144
+ logDeps(
145
+ deps,
146
+ `[slack] permission-click-handler: cannot resolve channelId ` +
147
+ `(context.channel=${context.channel ?? 'undefined'}, ` +
148
+ `earlyEntry.channelId=${earlyEntry?.channelId ?? 'undefined'}, ` +
149
+ `claude_instance_id=${claudeInstanceId}, ` +
150
+ `action_id=${actionId}, ` +
151
+ `request_token=${requestToken}) ` +
152
+ `— logging and bypassing this click. This is a CSCB bug — investigate.`,
153
+ )
154
+ return true
155
+ }
156
+
144
157
  // SR-4.1, SR-4.2, SR-7.2: AD is the source of truth. Always call decide
145
158
  // with the decoded request_token — including for stale clicks whose
146
159
  // composite key is no longer in the pending map. This is the click's ONLY
@@ -152,14 +165,29 @@ export async function handlePermissionClick(
152
165
  decision,
153
166
  }
154
167
  try {
155
- await decideWithToken(deps.getClient(), {
156
- claude_instance_id: claudeInstanceId,
157
- decision,
158
- request_token: requestToken,
159
- })
168
+ await withOutageDetection(ctxChannel, undefined, (client) =>
169
+ decideWithToken(client, {
170
+ claude_instance_id: claudeInstanceId,
171
+ decision,
172
+ request_token: requestToken,
173
+ }),
174
+ )
160
175
  // SR-V-2.7 call-side success emission.
161
176
  emit({ ...decideEnvelope, result_class: 'ok' satisfies AdDecideResponseClass })
162
177
  } catch (err) {
178
+ if (err instanceof ErrSystemInstallDisappeared || err instanceof ErrTmuxNotAvailable) {
179
+ // SR-V-2.7 ad/tmux carve-out: the wrapper already raised the outage
180
+ // flag (one Slack onset alert via the state machine). Forensic
181
+ // requirement: the trail JSONL must still carry the typed error name
182
+ // and message so post-incident debugging can correlate the click
183
+ // attempt with the AD-down cause — the loud Slack alert tells the
184
+ // operator something is broken; the trail entry tells the engineer
185
+ // what was actually thrown.
186
+ const result_class: AdDecideResponseClass = err.errName as AdDecideResponseClass
187
+ const raw_error_message = err.message
188
+ emit({ ...decideEnvelope, result_class, raw_error_message })
189
+ return true
190
+ }
163
191
  // SR-V-2.7 call-side failure emission. Classify against the same AD
164
192
  // error names the existing branches discriminate on so the trail's
165
193
  // result_class stays consistent with src/agent-director-errors.ts. The
@@ -35,6 +35,8 @@
35
35
  import {
36
36
  AgentDirectorError,
37
37
  ErrSpawnNotFound,
38
+ ErrSystemInstallDisappeared,
39
+ ErrTmuxNotAvailable,
38
40
  } from 'agent-director'
39
41
  import type {
40
42
  GetResult as ADGetResult,
@@ -50,6 +52,7 @@ import type {
50
52
  GetPermissionParams,
51
53
  GetPermissionResult,
52
54
  } from './agent-director-client.ts'
55
+ import { withOutageDetection } from './outage-state.ts'
53
56
  import { encodePermissionActionId } from './permission-action-id.ts'
54
57
  import { emitTrail as defaultEmitTrail } from './permission-trail.ts'
55
58
  import type {
@@ -303,7 +306,12 @@ function classifySlackError(err: unknown): string {
303
306
  async function runTick(deps: PollerDeps): Promise<void> {
304
307
  if (tickInFlight) {
305
308
  skippedTicks++
306
- if (skippedTicks >= 5) {
309
+ // Fire exactly once when the streak crosses 5 (4 → 5 transition). Further
310
+ // skips within the same streak are silent; the next successfully-started
311
+ // tick body resets skippedTicks to 0 and re-arms the warning for a future
312
+ // streak. Mirrors src/health-check.ts:55-62 — see that block for the
313
+ // budget-exhaustion rationale.
314
+ if (skippedTicks === 5) {
307
315
  logViaDeps(deps, `[slack] permission-poller: skipped ${skippedTicks} consecutive ticks — tick budget exceeded`)
308
316
  }
309
317
  return
@@ -324,13 +332,23 @@ async function runTick(deps: PollerDeps): Promise<void> {
324
332
  const seenComposite = new Set<string>()
325
333
  const nonConformingInstanceIds = new Set<string>()
326
334
  for (const row of rows) {
335
+ const rowChannelId = row.labels['channel']
336
+ if (!rowChannelId) {
337
+ logViaDeps(deps, `[slack] permission-poller: spawn ${row.claude_instance_id} has no channel label — skipping`)
338
+ continue
339
+ }
340
+
327
341
  let got: GetResultWithPermissionRequests
328
342
  try {
329
- got = (await client.get({
330
- claude_instance_id: row.claude_instance_id,
331
- })) as unknown as GetResultWithPermissionRequests
343
+ got = (await withOutageDetection(rowChannelId, undefined, () =>
344
+ client.get({ claude_instance_id: row.claude_instance_id })
345
+ )) as unknown as GetResultWithPermissionRequests
332
346
  } catch (err) {
333
347
  if (err instanceof ErrSpawnNotFound) continue
348
+ if (err instanceof ErrSystemInstallDisappeared || err instanceof ErrTmuxNotAvailable) {
349
+ // Outage flag raised; skip per-event log.
350
+ continue
351
+ }
334
352
  const e = err instanceof AgentDirectorError ? err : null
335
353
  logViaDeps(deps, `[slack] permission-poller: get failed for ${row.claude_instance_id}: ${e?.errName ?? String(err)}`)
336
354
  continue
@@ -368,8 +386,14 @@ async function runTick(deps: PollerDeps): Promise<void> {
368
386
  for (const entry of closedEntries) {
369
387
  let info: GetPermissionResult
370
388
  try {
371
- info = await getPermission(client, { request_token: entry.requestToken })
389
+ info = await withOutageDetection(entry.channelId, undefined, () =>
390
+ getPermission(client, { request_token: entry.requestToken })
391
+ )
372
392
  } catch (err) {
393
+ if (err instanceof ErrSystemInstallDisappeared || err instanceof ErrTmuxNotAvailable) {
394
+ // Outage flag raised; skip per-event log.
395
+ continue
396
+ }
373
397
  if (isErrPermissionRequestNotFound(err)) {
374
398
  logViaDeps(deps, `[slack] permission-poller: get-permission not-found for ${entry.claudeInstanceId} token=${entry.requestToken} — generic deny + drop`)
375
399
  emitRowDecision(deps, 'not_found_generic_deny', entry.claudeInstanceId, entry.requestToken)
package/src/restart.ts CHANGED
@@ -22,17 +22,10 @@ export interface RestartDeps {
22
22
  isShuttingDown(): boolean
23
23
  }
24
24
 
25
- // ---------------------------------------------------------------------------
26
- // Constants
27
- // ---------------------------------------------------------------------------
28
-
29
- export const MAX_CONSECUTIVE_FAILURES = 3
30
-
31
25
  // ---------------------------------------------------------------------------
32
26
  // Module-scoped state
33
27
  // ---------------------------------------------------------------------------
34
28
 
35
- const failureCounters = new Map<string, number>()
36
29
  const pendingRestartTimers = new Map<string, ReturnType<typeof setTimeout>>()
37
30
  const activeLaunches = new Set<string>()
38
31
  let deps: RestartDeps | null = null
@@ -61,14 +54,6 @@ export function scheduleRestart(channelId: string, cwd: string, sessionId?: stri
61
54
  return
62
55
  }
63
56
 
64
- const failures = failureCounters.get(channelId) ?? 0
65
- if (failures >= MAX_CONSECUTIVE_FAILURES) {
66
- console.error(
67
- `[slack] Max consecutive failures (${MAX_CONSECUTIVE_FAILURES}) reached — giving up on channel=${channelId} cwd="${cwd}"`,
68
- )
69
- return
70
- }
71
-
72
57
  // Cancel any existing timer for this channel
73
58
  const existing = pendingRestartTimers.get(channelId)
74
59
  if (existing !== undefined) {
@@ -130,11 +115,7 @@ export function scheduleRestart(channelId: string, cwd: string, sessionId?: stri
130
115
  }
131
116
 
132
117
  if (!ok) {
133
- const count = (failureCounters.get(channelId) ?? 0) + 1
134
- failureCounters.set(channelId, count)
135
- console.error(
136
- `[slack] Session relaunch failed for channel=${channelId} (failure ${count}/${MAX_CONSECUTIVE_FAILURES})`,
137
- )
118
+ console.error(`[slack] Session relaunch failed for channel=${channelId}`)
138
119
  }
139
120
  } finally {
140
121
  activeLaunches.delete(channelId)
@@ -144,14 +125,6 @@ export function scheduleRestart(channelId: string, cwd: string, sessionId?: stri
144
125
  pendingRestartTimers.set(channelId, timer)
145
126
  }
146
127
 
147
- // ---------------------------------------------------------------------------
148
- // resetFailureCounter
149
- // ---------------------------------------------------------------------------
150
-
151
- export function resetFailureCounter(channelId: string): void {
152
- failureCounters.set(channelId, 0)
153
- }
154
-
155
128
  // ---------------------------------------------------------------------------
156
129
  // cancelAllRestartTimers
157
130
  // ---------------------------------------------------------------------------
@@ -165,17 +138,13 @@ export function cancelAllRestartTimers(): void {
165
138
  }
166
139
 
167
140
  // ---------------------------------------------------------------------------
168
- // isRestartPendingOrActive / hasReachedMaxFailures — query functions
141
+ // isRestartPendingOrActive — query function
169
142
  // ---------------------------------------------------------------------------
170
143
 
171
144
  export function isRestartPendingOrActive(channelId: string): boolean {
172
145
  return pendingRestartTimers.has(channelId) || activeLaunches.has(channelId)
173
146
  }
174
147
 
175
- export function hasReachedMaxFailures(channelId: string): boolean {
176
- return (failureCounters.get(channelId) ?? 0) >= MAX_CONSECUTIVE_FAILURES
177
- }
178
-
179
148
  // ---------------------------------------------------------------------------
180
149
  // _resetRestartState — exported for test cleanup
181
150
  // ---------------------------------------------------------------------------
@@ -184,7 +153,6 @@ export function _resetRestartState(): void {
184
153
  for (const timer of pendingRestartTimers.values()) {
185
154
  clearTimeout(timer)
186
155
  }
187
- failureCounters.clear()
188
156
  pendingRestartTimers.clear()
189
157
  activeLaunches.clear()
190
158
  deps = null
package/src/server.ts CHANGED
@@ -27,6 +27,7 @@ import {
27
27
  chmodSync,
28
28
  existsSync,
29
29
  renameSync,
30
+ promises as fsPromises,
30
31
  } from 'fs'
31
32
 
32
33
  import {
@@ -54,6 +55,7 @@ import {
54
55
  } from './session-manager.ts'
55
56
  import { cleanSession, getCozempicAvailable } from './cozempic.ts'
56
57
  import { ErrSpawnNotFound } from 'agent-director'
58
+ import { ErrSystemInstallDisappeared, ErrTmuxNotAvailable } from './agent-director-errors.ts'
57
59
  import { getClient, closeClient } from './agent-director-client.ts'
58
60
  import {
59
61
  emitBlockActionReceived,
@@ -64,10 +66,8 @@ import { startPermissionPoller, stopPermissionPoller } from './permission-poller
64
66
  import {
65
67
  initRestart,
66
68
  scheduleRestart,
67
- resetFailureCounter,
68
69
  cancelAllRestartTimers,
69
70
  isRestartPendingOrActive,
70
- hasReachedMaxFailures,
71
71
  } from './restart.ts'
72
72
  import { initHealthCheck, startHealthCheck, stopHealthCheck } from './health-check.ts'
73
73
  import { loadTokens, isDryRun } from './tokens.ts'
@@ -99,6 +99,7 @@ import {
99
99
  } from './registry.ts'
100
100
  import { runAgentDirectorStartupGate } from './agent-director-startup.ts'
101
101
  import { installSlackChannelBotTemplate } from './agent-director-template.ts'
102
+ import { initOutageState, setOutageFlag, clearOutageFlag, resetAllToHealthy, withOutageDetection } from './outage-state.ts'
102
103
 
103
104
  // Re-export constants so they stay in one place (lib.ts)
104
105
  export { MAX_PENDING, MAX_PAIRING_REPLIES, PAIRING_EXPIRY_MS } from './lib.ts'
@@ -521,9 +522,6 @@ async function handleInitialized(
521
522
  console.error(`[slack] Session replaced existing connection for CWD "${normalizedCwd}"`)
522
523
  }
523
524
  console.error(`[slack] Session connected: channel=${matchedChannelId} cwd="${normalizedCwd}"`)
524
-
525
- // Reset failure counter — session reconnected successfully
526
- resetFailureCounter(matchedChannelId)
527
525
  }
528
526
 
529
527
  // ---------------------------------------------------------------------------
@@ -756,7 +754,7 @@ socket.on('interactive', async (evt) => {
756
754
 
757
755
  const handled = await handlePermissionClick(
758
756
  actionId,
759
- { getClient, web },
757
+ { web },
760
758
  { channel: channelId, messageTs, user: userId },
761
759
  )
762
760
  if (handled) {
@@ -838,6 +836,97 @@ async function shutdown(signal: string): Promise<void> {
838
836
  process.on('SIGTERM', () => { shutdown('SIGTERM').catch(() => process.exit(1)) })
839
837
  process.on('SIGINT', () => { shutdown('SIGINT').catch(() => process.exit(1)) })
840
838
 
839
+ // ---------------------------------------------------------------------------
840
+ // _buildIsSessionAliveAdapter
841
+ // ---------------------------------------------------------------------------
842
+
843
/**
 * _buildIsSessionAliveAdapter — factory for the tick-path liveness probe.
 * main() wires the returned function in as `isSessionAliveAdapter`; it is
 * exported so tests can drive the probe branches directly instead of reaching
 * into a closure private to main().
 *
 * The probe resolves to `true` only when the channel has a route and the
 * agent-director `status` call reports a state contained in
 * AGENT_DIRECTOR_LIVE_STATES. Every other outcome resolves to `false`:
 *   - no routing config, or no route for the channel (no status call is made)
 *   - ErrSpawnNotFound            → both outage flags are cleared
 *   - ErrSystemInstallDisappeared → 'ad-unreachable' is set with err.binaryPath
 *   - ErrTmuxNotAvailable         → 'tmux-unavailable' is set
 *   - any other error             → logged; outage flags are left untouched
 *
 * @param getRoutingConfig - Invoked on every probe to read the current routing config.
 * @returns Async probe mapping a channel id to alive (`true`) / dead (`false`).
 * @internal
 */
export function _buildIsSessionAliveAdapter(
  getRoutingConfig: () => RoutingConfig | null | undefined,
): (channelId: string) => Promise<boolean> {
  // A status call that got an answer from agent-director (a real state or
  // "no such spawn") shows neither outage condition holds for this channel.
  const clearOutageFlags = (channelId: string): void => {
    clearOutageFlag(channelId, 'ad-unreachable')
    clearOutageFlag(channelId, 'tmux-unavailable')
  }

  return async (channelId: string) => {
    const route = getRoutingConfig()?.routes[channelId]
    if (!route) return false

    const claude_instance_id = instanceIdFor(channelId, route.normalizedName)
    try {
      const status = await getClient().status({ claude_instance_id })
      clearOutageFlags(channelId)
      return AGENT_DIRECTOR_LIVE_STATES.has(status.state)
    } catch (err) {
      if (err instanceof ErrSpawnNotFound) {
        clearOutageFlags(channelId)
      } else if (err instanceof ErrSystemInstallDisappeared) {
        setOutageFlag(channelId, 'ad-unreachable', err.binaryPath)
      } else if (err instanceof ErrTmuxNotAvailable) {
        setOutageFlag(channelId, 'tmux-unavailable')
      } else {
        console.error(`[slack] isSessionAlive: status error for channel=${channelId}:`, err)
      }
      // Every failure mode reports the session as not alive.
      return false
    }
  }
}
882
+
883
+ // ---------------------------------------------------------------------------
884
+ // _buildStatRouteImpl
885
+ // ---------------------------------------------------------------------------
886
+
887
/**
 * _buildStatRouteImpl — factory for the health-check tick's `statRoute`
 * dependency. main() passes `_buildStatRouteImpl()` (no deps) to
 * initHealthCheck; tests inject `stat` / `setTimeout` / `clearTimeout` so the
 * timeout budget and the stat-error swallowing run against the real factory.
 *
 * The returned function resolves to:
 *   - `stat(cwd).isDirectory()` when stat settles before the timeout
 *   - `false` when stat throws or rejects (the error is logged, never rethrown)
 *   - `false` when the timeout fires first (also logged)
 * The timeout timer is always cleared once the race settles.
 *
 * @param deps - Optional overrides; each falls back to `fsPromises.stat`,
 *   global `setTimeout`, and global `clearTimeout` respectively.
 * @returns Async predicate: does `cwd` stat as a directory within the budget?
 * @internal
 */
export function _buildStatRouteImpl(deps?: {
  stat?: (path: string) => Promise<{ isDirectory(): boolean }>
  setTimeout?: (fn: () => void, ms: number) => ReturnType<typeof setTimeout>
  clearTimeout?: (handle: ReturnType<typeof setTimeout> | undefined) => void
}): (cwd: string) => Promise<boolean> {
  const STAT_TIMEOUT_MS = 5_000
  const statFn = deps?.stat ?? ((path: string) => fsPromises.stat(path) as Promise<{ isDirectory(): boolean }>)
  const startTimer = deps?.setTimeout ?? ((fn: () => void, ms: number) => setTimeout(fn, ms))
  const cancelTimer = deps?.clearTimeout ?? ((handle: ReturnType<typeof setTimeout> | undefined) => clearTimeout(handle))

  // Stat the path; any failure is logged and reported as "not a directory".
  const probeDirectory = async (cwd: string): Promise<boolean> => {
    try {
      return (await statFn(cwd)).isDirectory()
    } catch (err) {
      console.error(`[slack] health-check: statRoute(${cwd}) failed:`, err)
      return false
    }
  }

  return async (cwd: string) => {
    // Kick off the stat first, then arm the timer it races against.
    const pendingStat = probeDirectory(cwd)
    let timer: ReturnType<typeof setTimeout> | undefined
    const expiry = new Promise<boolean>((resolve) => {
      timer = startTimer(() => {
        console.error(`[slack] health-check: statRoute(${cwd}) timed out after ${STAT_TIMEOUT_MS}ms — treating as unreachable`)
        resolve(false)
      }, STAT_TIMEOUT_MS)
    })
    try {
      return await Promise.race([pendingStat, expiry])
    } finally {
      cancelTimer(timer)
    }
  }
}
929
+
841
930
  // ---------------------------------------------------------------------------
842
931
  // Main
843
932
  //
@@ -910,6 +999,15 @@ export async function main(): Promise<void> {
910
999
  }
911
1000
  }
912
1001
 
1002
+ initOutageState({
1003
+ postToChannel: (channelId, text) => {
1004
+ web.chat.postMessage({ channel: channelId, text }).catch((err) => {
1005
+ console.error(`[slack] outage-state: postMessage failed for channel=${channelId}:`, err)
1006
+ })
1007
+ },
1008
+ getClient,
1009
+ })
1010
+
913
1011
  if (isDryRun()) {
914
1012
  console.error('[slack] Running in dry-run mode — Slack disabled')
915
1013
  botUserId = 'U000DRY'
@@ -926,6 +1024,10 @@ export async function main(): Promise<void> {
926
1024
  await socket.start()
927
1025
  console.error('[slack] Socket Mode connected')
928
1026
 
1027
+ if (routingConfig) {
1028
+ resetAllToHealthy(Object.keys(routingConfig.routes))
1029
+ }
1030
+
929
1031
  // SR-2.1 permission poller — single-threaded interval loop monitors AD
930
1032
  // state for spawns in check_permission and posts Block Kit prompts.
931
1033
  if (routingConfig) {
@@ -1127,18 +1229,7 @@ export async function main(): Promise<void> {
1127
1229
  // SR-11 Event 6a. Any AGENT_DIRECTOR_LIVE_STATES value → alive; terminal
1128
1230
  // states (ended, missing) and ErrSpawnNotFound → dead. Other errors fall
1129
1231
  // back to "dead" defensively — health-check will retry.
1130
- const isSessionAliveAdapter = async (channelId: string): Promise<boolean> => {
1131
- if (!routingConfig?.routes[channelId]) return false
1132
- const claude_instance_id = instanceIdFor(channelId, routingConfig.routes[channelId]?.normalizedName)
1133
- try {
1134
- const r = await getClient().status({ claude_instance_id })
1135
- return AGENT_DIRECTOR_LIVE_STATES.has(r.state)
1136
- } catch (err) {
1137
- if (err instanceof ErrSpawnNotFound) return false
1138
- console.error(`[slack] isSessionAlive: status error for channel=${channelId}:`, err)
1139
- return false
1140
- }
1141
- }
1232
+ const isSessionAliveAdapter = _buildIsSessionAliveAdapter(() => routingConfig)
1142
1233
 
1143
1234
  // Initialize restart module with library-backed adapters
1144
1235
  initRestart({
@@ -1153,9 +1244,12 @@ export async function main(): Promise<void> {
1153
1244
  killSession: async (channelId) => {
1154
1245
  try {
1155
1246
  const normalizedName = routingConfig?.routes[channelId]?.normalizedName
1156
- await getClient().kill({ claude_instance_id: instanceIdFor(channelId, normalizedName) })
1247
+ await withOutageDetection(channelId, undefined, (client) =>
1248
+ client.kill({ claude_instance_id: instanceIdFor(channelId, normalizedName) })
1249
+ )
1157
1250
  } catch (err) {
1158
1251
  if (err instanceof ErrSpawnNotFound) return
1252
+ if (err instanceof ErrSystemInstallDisappeared || err instanceof ErrTmuxNotAvailable) return
1159
1253
  console.error(`[slack] killSession (restart adapter): error for channel=${channelId}:`, err)
1160
1254
  }
1161
1255
  },
@@ -1226,7 +1320,7 @@ export async function main(): Promise<void> {
1226
1320
  initHealthCheck({
1227
1321
  isSessionAlive: isSessionAliveAdapter,
1228
1322
  isRestartPendingOrActive,
1229
- hasReachedMaxFailures,
1323
+ statRoute: _buildStatRouteImpl(),
1230
1324
  scheduleRestart,
1231
1325
  isShuttingDown: () => shuttingDown,
1232
1326
  getRoutes: () => {
@@ -38,6 +38,13 @@ import type { WebClient } from '@slack/web-api'
38
38
  import { checkCozempicAvailable } from './cozempic.ts'
39
39
  import { type RoutingConfig, MCP_SERVER_NAME, normalizeChannelName } from './config.ts'
40
40
  import { getClient } from './agent-director-client.ts'
41
+ import { withOutageDetection, withSpawnDetection } from './outage-state.ts'
42
+ import {
43
+ ErrSystemInstallDisappeared,
44
+ ErrTmuxNotAvailable,
45
+ ErrCwdNotFound,
46
+ ErrCwdNotADirectory,
47
+ } from './agent-director-errors.ts'
41
48
  import { recordStartupError } from './startup-errors.ts'
42
49
  import { isDryRun } from './tokens.ts'
43
50
 
@@ -184,12 +191,13 @@ export async function reconnectMcp(
184
191
  const claude_instance_id = instanceIdFor(channelId, routingConfig?.routes[channelId]?.normalizedName)
185
192
  console.error(`[slack] reconnecting MCP server "${MCP_SERVER_NAME}": channel=${channelId}`)
186
193
  try {
187
- await getClient().sendKeys({
194
+ await withOutageDetection(channelId, undefined, (client) => client.sendKeys({
188
195
  claude_instance_id,
189
196
  text: `/mcp reconnect ${MCP_SERVER_NAME}`,
190
- })
197
+ }))
191
198
  return true
192
199
  } catch (err) {
200
+ if (err instanceof ErrSystemInstallDisappeared || err instanceof ErrTmuxNotAvailable) return false
193
201
  const e = err instanceof AgentDirectorError ? err : new AgentDirectorError('send-keys', 'UnknownError', String(err))
194
202
  console.error(`[slack] reconnectMcp: send-keys failed for channel=${channelId}: ${e.errName}`)
195
203
  postSpawnFailureToChannel(channelId, e, web)
@@ -263,20 +271,23 @@ export async function approveDevChannelsDialog(
263
271
  ): Promise<void> {
264
272
  void web
265
273
  const claude_instance_id = instanceIdFor(channelId, normalizedName)
266
- const client = getClient()
267
274
  const deadline = Date.now() + _dialogPollTimeoutMs
268
275
 
269
276
  let approved = false
270
277
  while (Date.now() < deadline) {
271
278
  try {
272
- const { pane } = await client.readPane({ claude_instance_id, n_lines: 40, allow_pending: true })
279
+ const { pane } = await withOutageDetection(channelId, undefined, (client) => client.readPane({ claude_instance_id, n_lines: 40, allow_pending: true }))
273
280
  if (pane.includes(DEV_CHANNELS_DIALOG_NEEDLE)) {
274
- await client.sendKeys({ claude_instance_id, text: '', allow_pending: true })
281
+ await withOutageDetection(channelId, undefined, (client) => client.sendKeys({ claude_instance_id, text: '', allow_pending: true }))
275
282
  approved = true
276
283
  break
277
284
  }
278
285
  } catch (err) {
279
- console.error(`[slack] approveDevChannelsDialog: readPane error channel=${channelId}: ${String(err)}`)
286
+ if (err instanceof ErrSystemInstallDisappeared || err instanceof ErrTmuxNotAvailable) {
287
+ // Outage flag raised; continue polling
288
+ } else {
289
+ console.error(`[slack] approveDevChannelsDialog: readPane error channel=${channelId}: ${String(err)}`)
290
+ }
280
291
  }
281
292
  await new Promise((r) => setTimeout(r, _dialogPollIntervalMs))
282
293
  }
@@ -292,7 +303,7 @@ export async function approveDevChannelsDialog(
292
303
  while (Date.now() < deadline && misses < DIALOG_GONE_CONFIRMS_REQUIRED) {
293
304
  await new Promise((r) => setTimeout(r, _dialogPollIntervalMs))
294
305
  try {
295
- const { pane } = await client.readPane({ claude_instance_id, n_lines: 40, allow_pending: true })
306
+ const { pane } = await withOutageDetection(channelId, undefined, (client) => client.readPane({ claude_instance_id, n_lines: 40, allow_pending: true }))
296
307
  misses = pane.includes(DEV_CHANNELS_DIALOG_NEEDLE) ? 0 : misses + 1
297
308
  } catch {
298
309
  /* tolerate transient readPane failure */
@@ -362,20 +373,23 @@ export async function approveTrustFolderDialog(
362
373
  ): Promise<void> {
363
374
  void web
364
375
  const claude_instance_id = instanceIdFor(channelId, normalizedName)
365
- const client = getClient()
366
376
  const deadline = Date.now() + _trustDialogPollTimeoutMs
367
377
 
368
378
  let approved = false
369
379
  while (Date.now() < deadline) {
370
380
  try {
371
- const { pane } = await client.readPane({ claude_instance_id, n_lines: 40, allow_pending: true })
381
+ const { pane } = await withOutageDetection(channelId, undefined, (client) => client.readPane({ claude_instance_id, n_lines: 40, allow_pending: true }))
372
382
  if (pane.includes(TRUST_DIALOG_NEEDLE)) {
373
- await client.sendKeys({ claude_instance_id, text: '', allow_pending: true })
383
+ await withOutageDetection(channelId, undefined, (client) => client.sendKeys({ claude_instance_id, text: '', allow_pending: true }))
374
384
  approved = true
375
385
  break
376
386
  }
377
387
  } catch (err) {
378
- console.error(`[slack] approveTrustFolderDialog: readPane error channel=${channelId}: ${String(err)}`)
388
+ if (err instanceof ErrSystemInstallDisappeared || err instanceof ErrTmuxNotAvailable) {
389
+ // Outage flag raised; continue polling
390
+ } else {
391
+ console.error(`[slack] approveTrustFolderDialog: readPane error channel=${channelId}: ${String(err)}`)
392
+ }
379
393
  }
380
394
  await new Promise((r) => setTimeout(r, _trustDialogPollIntervalMs))
381
395
  }
@@ -389,7 +403,7 @@ export async function approveTrustFolderDialog(
389
403
  while (Date.now() < deadline && misses < DIALOG_GONE_CONFIRMS_REQUIRED) {
390
404
  await new Promise((r) => setTimeout(r, _trustDialogPollIntervalMs))
391
405
  try {
392
- const { pane } = await client.readPane({ claude_instance_id, n_lines: 40, allow_pending: true })
406
+ const { pane } = await withOutageDetection(channelId, undefined, (client) => client.readPane({ claude_instance_id, n_lines: 40, allow_pending: true }))
393
407
  misses = pane.includes(TRUST_DIALOG_NEEDLE) ? 0 : misses + 1
394
408
  } catch {
395
409
  /* tolerate transient readPane failure */
@@ -439,13 +453,16 @@ export async function waitForWaitingAndReconnect(
439
453
  while (Date.now() < deadline) {
440
454
  let state: string
441
455
  try {
442
- const r = await getClient().status({ claude_instance_id })
456
+ const r = await withOutageDetection(channelId, undefined, (client) => client.status({ claude_instance_id }))
443
457
  state = r.state
444
458
  } catch (err) {
445
459
  if (err instanceof ErrSpawnNotFound) {
446
460
  console.error(`[slack] waitForWaitingAndReconnect: spawn not found for channel=${channelId} — aborting poll`)
447
461
  return true
448
462
  }
463
+ if (err instanceof ErrSystemInstallDisappeared || err instanceof ErrTmuxNotAvailable) {
464
+ return false
465
+ }
449
466
  const e = err instanceof AgentDirectorError ? err : new AgentDirectorError('status', 'UnknownError', String(err))
450
467
  console.error(`[slack] waitForWaitingAndReconnect: status error for channel=${channelId}: ${e.errName}`)
451
468
  postSpawnFailureToChannel(channelId, e, web)
@@ -507,7 +524,7 @@ function buildSpawnParams(
507
524
  /** Best-effort kill — never throws. */
508
525
  async function tryKill(channelId: string, normalizedName: string | undefined): Promise<void> {
509
526
  try {
510
- await getClient().kill({ claude_instance_id: instanceIdFor(channelId, normalizedName) })
527
+ await withOutageDetection(channelId, undefined, (client) => client.kill({ claude_instance_id: instanceIdFor(channelId, normalizedName) }))
511
528
  } catch {
512
529
  /* ignore */
513
530
  }
@@ -521,9 +538,12 @@ async function tryDelete(
521
538
  isStartup: boolean,
522
539
  ): Promise<boolean> {
523
540
  try {
524
- await getClient().delete({ claude_instance_id: [instanceIdFor(channelId, normalizedName)] })
541
+ await withOutageDetection(channelId, undefined, (client) => client.delete({ claude_instance_id: [instanceIdFor(channelId, normalizedName)] }))
525
542
  return true
526
543
  } catch (err) {
544
+ if (err instanceof ErrSystemInstallDisappeared || err instanceof ErrTmuxNotAvailable) {
545
+ return false
546
+ }
527
547
  const e = err instanceof AgentDirectorError ? err : new AgentDirectorError('delete', 'UnknownError', String(err))
528
548
  console.error(`[slack] tryDelete: failed for channel=${channelId}: ${e.errName}`)
529
549
  if (isStartup) recordStartupError('spawn-failed', `delete failed for channel=${channelId}: ${e.errName}`, e)
@@ -560,43 +580,58 @@ export async function spawnForRoute(
560
580
 
561
581
  const params = buildSpawnParams(channelId, route, routingConfig)
562
582
  const normalizedName = routingConfig.routes[channelId]?.normalizedName
563
- const client = getClient()
564
583
 
565
584
  // Attempt fresh spawn ---
566
585
  try {
567
- const r = await client.spawn(params)
586
+ const r = await withSpawnDetection(channelId, route.cwd, (client) => client.spawn(params))
568
587
  console.error(`[slack] spawnForRoute: spawned channel=${channelId} instanceId=${r.claude_instance_id}`)
569
588
  await approveTrustFolderDialog(channelId, web, isStartup, normalizedName)
570
589
  await approveDevChannelsDialog(channelId, web, isStartup, normalizedName)
571
590
  return { channelId, action: 'spawned' }
572
591
  } catch (err) {
573
- if (!(err instanceof ErrInstanceIdCollision)) {
592
+ if (err instanceof ErrInstanceIdCollision) {
593
+ // Collision → fall through to get-then-act
594
+ console.error(`[slack] spawnForRoute: ErrInstanceIdCollision for channel=${channelId} — fetching current state`)
595
+ } else if (
596
+ err instanceof ErrSystemInstallDisappeared ||
597
+ err instanceof ErrTmuxNotAvailable ||
598
+ err instanceof ErrCwdNotFound ||
599
+ err instanceof ErrCwdNotADirectory
600
+ ) {
601
+ return { channelId, action: 'failed' }
602
+ } else {
574
603
  const e = err instanceof AgentDirectorError ? err : new AgentDirectorError('spawn', 'UnknownError', String(err))
575
604
  console.error(`[slack] spawnForRoute: spawn failed for channel=${channelId}: ${e.errName}`)
576
605
  if (isStartup) recordStartupError('spawn-failed', `spawn failed for channel=${channelId}: ${e.errName}`, e)
577
606
  postSpawnFailureToChannel(channelId, e, web, isStartup)
578
607
  return { channelId, action: 'failed' }
579
608
  }
580
- // Collision → fall through to get-then-act
581
- console.error(`[slack] spawnForRoute: ErrInstanceIdCollision for channel=${channelId} — fetching current state`)
582
609
  }
583
610
 
584
611
  // Collision-handling: get-then-act ---
585
612
  let state: string
586
613
  try {
587
- const r = await client.get({ claude_instance_id: instanceIdFor(channelId, normalizedName) })
614
+ const r = await withOutageDetection(channelId, undefined, (client) => client.get({ claude_instance_id: instanceIdFor(channelId, normalizedName) }))
588
615
  state = r.state
589
616
  } catch (err) {
590
617
  if (err instanceof ErrSpawnNotFound) {
591
618
  // Race: row deleted between spawn-collision and get. Retry spawn once.
592
619
  console.error(`[slack] spawnForRoute: ErrSpawnNotFound after collision for channel=${channelId} — retrying spawn (single retry)`)
593
620
  try {
594
- const r = await client.spawn(params)
621
+ const r = await withSpawnDetection(channelId, route.cwd, (client) => client.spawn(params))
595
622
  console.error(`[slack] spawnForRoute: retry-spawn succeeded for channel=${channelId} instanceId=${r.claude_instance_id}`)
596
623
  await approveTrustFolderDialog(channelId, web, isStartup, normalizedName)
597
624
  await approveDevChannelsDialog(channelId, web, isStartup, normalizedName)
598
625
  return { channelId, action: 'spawned' }
599
626
  } catch (err2) {
627
+ if (
628
+ err2 instanceof ErrSystemInstallDisappeared ||
629
+ err2 instanceof ErrTmuxNotAvailable ||
630
+ err2 instanceof ErrCwdNotFound ||
631
+ err2 instanceof ErrCwdNotADirectory
632
+ ) {
633
+ return { channelId, action: 'failed' }
634
+ }
600
635
  const e = err2 instanceof AgentDirectorError ? err2 : new AgentDirectorError('spawn', 'UnknownError', String(err2))
601
636
  console.error(`[slack] spawnForRoute: retry-spawn also failed for channel=${channelId}: ${e.errName}`)
602
637
  if (isStartup) recordStartupError('spawn-failed', `retry-spawn failed for channel=${channelId}: ${e.errName}`, e)
@@ -604,6 +639,9 @@ export async function spawnForRoute(
604
639
  return { channelId, action: 'failed' }
605
640
  }
606
641
  }
642
+ if (err instanceof ErrSystemInstallDisappeared || err instanceof ErrTmuxNotAvailable) {
643
+ return { channelId, action: 'failed' }
644
+ }
607
645
  const e = err instanceof AgentDirectorError ? err : new AgentDirectorError('get', 'UnknownError', String(err))
608
646
  console.error(`[slack] spawnForRoute: get failed for channel=${channelId}: ${e.errName}`)
609
647
  postSpawnFailureToChannel(channelId, e, web, isStartup)
@@ -618,12 +656,20 @@ export async function spawnForRoute(
618
656
  await tryKill(channelId, normalizedName)
619
657
  if (!(await tryDelete(channelId, normalizedName, web, isStartup))) return { channelId, action: 'failed' }
620
658
  try {
621
- await client.spawn(params)
659
+ await withSpawnDetection(channelId, route.cwd, (client) => client.spawn(params))
622
660
  console.error(`[slack] spawnForRoute: fresh-spawned (after kill+delete) for channel=${channelId}`)
623
661
  await approveTrustFolderDialog(channelId, web, isStartup, normalizedName)
624
662
  await approveDevChannelsDialog(channelId, web, isStartup, normalizedName)
625
663
  return { channelId, action: 'spawned' }
626
664
  } catch (err) {
665
+ if (
666
+ err instanceof ErrSystemInstallDisappeared ||
667
+ err instanceof ErrTmuxNotAvailable ||
668
+ err instanceof ErrCwdNotFound ||
669
+ err instanceof ErrCwdNotADirectory
670
+ ) {
671
+ return { channelId, action: 'failed' }
672
+ }
627
673
  const e = err instanceof AgentDirectorError ? err : new AgentDirectorError('spawn', 'UnknownError', String(err))
628
674
  console.error(`[slack] spawnForRoute: fresh spawn after delete failed for channel=${channelId}: ${e.errName}`)
629
675
  if (isStartup) recordStartupError('spawn-failed', `fresh spawn after delete failed for channel=${channelId}: ${e.errName}`, e)
@@ -635,7 +681,7 @@ export async function spawnForRoute(
635
681
  // resume_enabled: attempt resume
636
682
  console.error(`[slack] spawnForRoute: attempting resume for channel=${channelId}`)
637
683
  try {
638
- await client.resume({ claude_instance_id: instanceIdFor(channelId, normalizedName) })
684
+ await withSpawnDetection(channelId, route.cwd, (client) => client.resume({ claude_instance_id: instanceIdFor(channelId, normalizedName) }))
639
685
  console.error(`[slack] spawnForRoute: resumed channel=${channelId}`)
640
686
  return { channelId, action: 'resumed' }
641
687
  } catch (err) {
@@ -643,12 +689,20 @@ export async function spawnForRoute(
643
689
  console.error(`[slack] spawnForRoute: ${err.errName} on resume for channel=${channelId} — delete+fresh`)
644
690
  if (!(await tryDelete(channelId, normalizedName, web, isStartup))) return { channelId, action: 'failed' }
645
691
  try {
646
- await client.spawn(params)
692
+ await withSpawnDetection(channelId, route.cwd, (client) => client.spawn(params))
647
693
  console.error(`[slack] spawnForRoute: fresh-spawned (after delete) for channel=${channelId}`)
648
694
  await approveTrustFolderDialog(channelId, web, isStartup, normalizedName)
649
695
  await approveDevChannelsDialog(channelId, web, isStartup, normalizedName)
650
696
  return { channelId, action: 'spawned' }
651
697
  } catch (err2) {
698
+ if (
699
+ err2 instanceof ErrSystemInstallDisappeared ||
700
+ err2 instanceof ErrTmuxNotAvailable ||
701
+ err2 instanceof ErrCwdNotFound ||
702
+ err2 instanceof ErrCwdNotADirectory
703
+ ) {
704
+ return { channelId, action: 'failed' }
705
+ }
652
706
  const e = err2 instanceof AgentDirectorError ? err2 : new AgentDirectorError('spawn', 'UnknownError', String(err2))
653
707
  console.error(`[slack] spawnForRoute: fresh spawn after delete failed for channel=${channelId}: ${e.errName}`)
654
708
  if (isStartup) recordStartupError('spawn-failed', `fresh spawn after delete failed for channel=${channelId}: ${e.errName}`, e)
@@ -662,17 +716,33 @@ export async function spawnForRoute(
662
716
  await tryKill(channelId, normalizedName)
663
717
  if (!(await tryDelete(channelId, normalizedName, web, isStartup))) return { channelId, action: 'failed' }
664
718
  try {
665
- await client.spawn(params)
719
+ await withSpawnDetection(channelId, route.cwd, (client) => client.spawn(params))
666
720
  await approveTrustFolderDialog(channelId, web, isStartup, normalizedName)
667
721
  await approveDevChannelsDialog(channelId, web, isStartup, normalizedName)
668
722
  return { channelId, action: 'spawned' }
669
723
  } catch (err2) {
724
+ if (
725
+ err2 instanceof ErrSystemInstallDisappeared ||
726
+ err2 instanceof ErrTmuxNotAvailable ||
727
+ err2 instanceof ErrCwdNotFound ||
728
+ err2 instanceof ErrCwdNotADirectory
729
+ ) {
730
+ return { channelId, action: 'failed' }
731
+ }
670
732
  const e = err2 instanceof AgentDirectorError ? err2 : new AgentDirectorError('spawn', 'UnknownError', String(err2))
671
733
  if (isStartup) recordStartupError('spawn-failed', `fresh spawn failed for channel=${channelId}: ${e.errName}`, e)
672
734
  postSpawnFailureToChannel(channelId, e, web, isStartup)
673
735
  return { channelId, action: 'failed' }
674
736
  }
675
737
  }
738
+ if (
739
+ err instanceof ErrSystemInstallDisappeared ||
740
+ err instanceof ErrTmuxNotAvailable ||
741
+ err instanceof ErrCwdNotFound ||
742
+ err instanceof ErrCwdNotADirectory
743
+ ) {
744
+ return { channelId, action: 'failed' }
745
+ }
676
746
  const e = err instanceof AgentDirectorError ? err : new AgentDirectorError('resume', 'UnknownError', String(err))
677
747
  console.error(`[slack] spawnForRoute: resume failed for channel=${channelId}: ${e.errName}`)
678
748
  postSpawnFailureToChannel(channelId, e, web, isStartup)
@@ -985,9 +1055,13 @@ export async function reconcileInstanceIds(
985
1055
  `[slack] reconcileInstanceIds: deleting stale row channel=${o.channelId} instanceId=${o.oldInstanceId}`,
986
1056
  )
987
1057
  try {
988
- await client.delete({ claude_instance_id: [o.oldInstanceId] })
1058
+ await withOutageDetection(o.channelId, undefined, (client) => client.delete({ claude_instance_id: [o.oldInstanceId] }))
989
1059
  deleted++
990
1060
  } catch (err) {
1061
+ if (err instanceof ErrSystemInstallDisappeared || err instanceof ErrTmuxNotAvailable) {
1062
+ failed++
1063
+ continue
1064
+ }
991
1065
  const e = err instanceof AgentDirectorError ? err : new AgentDirectorError('delete', 'UnknownError', String(err))
992
1066
  console.error(
993
1067
  `[slack] reconcileInstanceIds: delete failed for channel=${o.channelId} instanceId=${o.oldInstanceId}: ${e.errName}`,