claude-slack-channel-bots 0.7.2 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/src/agent-director-errors.ts +8 -0
- package/src/health-check.ts +44 -14
- package/src/outage-state.ts +287 -0
- package/src/permission-click-handler.ts +37 -9
- package/src/permission-poller.ts +29 -5
- package/src/restart.ts +2 -34
- package/src/server.ts +114 -20
- package/src/session-manager.ts +101 -27
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "claude-slack-channel-bots",
|
|
3
|
-
"version": "0.7.2",
|
|
3
|
+
"version": "0.8.0",
|
|
4
4
|
"description": "Multi-session Slack-to-Claude bridge — run multiple Claude Code bots across Slack channels via Socket Mode",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -31,7 +31,7 @@
|
|
|
31
31
|
"@modelcontextprotocol/sdk": "^1.0.0",
|
|
32
32
|
"@slack/socket-mode": "^2.0.0",
|
|
33
33
|
"@slack/web-api": "^7.0.0",
|
|
34
|
-
"agent-director": "^0.7.
|
|
34
|
+
"agent-director": "^0.7.8",
|
|
35
35
|
"semver": "^7.6.0"
|
|
36
36
|
},
|
|
37
37
|
"devDependencies": {
|
package/src/agent-director-errors.ts
CHANGED
|
@@ -11,6 +11,10 @@
|
|
|
11
11
|
* - ErrSystemInstallNotFound (Client.create / resolveSystemBinary — no `agent-director` on PATH or at ~/.agent-director)
|
|
12
12
|
* - ErrSystemInstallTooOld (Client.create / resolveSystemBinary — system binary older than required minimum)
|
|
13
13
|
* - ErrSystemInstallUnreachable (Client.create / resolveSystemBinary — system binary present but not executable or fails --version)
|
|
14
|
+
* - ErrSystemInstallDisappeared (any verb / binary gone after valid construction — b.xht)
|
|
15
|
+
* - ErrTmuxNotAvailable (spawn / tmux binary not found or not executable)
|
|
16
|
+
* - ErrCwdNotFound (spawn / route cwd does not exist on disk)
|
|
17
|
+
* - ErrCwdNotADirectory (spawn / route cwd path exists but is not a directory)
|
|
14
18
|
* - ErrInstanceIdCollision (spawn / SR-1.4 idempotency)
|
|
15
19
|
* - ErrSpawnNotFound (get / status / decide on missing row)
|
|
16
20
|
* - ErrNoSessionId (resume / SR-1.3 fallthrough)
|
|
@@ -41,6 +45,10 @@ export {
|
|
|
41
45
|
ErrSystemInstallNotFound,
|
|
42
46
|
ErrSystemInstallTooOld,
|
|
43
47
|
ErrSystemInstallUnreachable,
|
|
48
|
+
ErrSystemInstallDisappeared,
|
|
49
|
+
ErrTmuxNotAvailable,
|
|
50
|
+
ErrCwdNotFound,
|
|
51
|
+
ErrCwdNotADirectory,
|
|
44
52
|
ErrCallTimeout,
|
|
45
53
|
ErrInstanceIdCollision,
|
|
46
54
|
ErrSpawnNotFound,
|
package/src/health-check.ts
CHANGED
|
@@ -8,6 +8,8 @@
|
|
|
8
8
|
* SPDX-License-Identifier: MIT
|
|
9
9
|
*/
|
|
10
10
|
|
|
11
|
+
import { setOutageFlag, clearOutageFlag } from './outage-state.ts'
|
|
12
|
+
|
|
11
13
|
// ---------------------------------------------------------------------------
|
|
12
14
|
// Types
|
|
13
15
|
// ---------------------------------------------------------------------------
|
|
@@ -15,7 +17,7 @@
|
|
|
15
17
|
export interface HealthCheckDeps {
|
|
16
18
|
isSessionAlive(channelId: string): Promise<boolean>
|
|
17
19
|
isRestartPendingOrActive(channelId: string): boolean
|
|
18
|
-
|
|
20
|
+
statRoute(cwd: string): Promise<boolean>
|
|
19
21
|
scheduleRestart(channelId: string, cwd: string): void
|
|
20
22
|
isShuttingDown(): boolean
|
|
21
23
|
getRoutes(): Record<string, string>
|
|
@@ -27,6 +29,8 @@ export interface HealthCheckDeps {
|
|
|
27
29
|
|
|
28
30
|
let deps: HealthCheckDeps | null = null
|
|
29
31
|
let intervalId: ReturnType<typeof setInterval> | null = null
|
|
32
|
+
let tickInFlight = false
|
|
33
|
+
let skippedTicks = 0
|
|
30
34
|
|
|
31
35
|
// ---------------------------------------------------------------------------
|
|
32
36
|
// initHealthCheck
|
|
@@ -46,21 +50,45 @@ export function startHealthCheck(intervalSeconds: number): void {
|
|
|
46
50
|
intervalId = setInterval(async () => {
|
|
47
51
|
if (!deps) return
|
|
48
52
|
if (deps.isShuttingDown()) return
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
53
|
+
if (tickInFlight) {
|
|
54
|
+
skippedTicks++
|
|
55
|
+
// Fire exactly once when the streak crosses 5 (4 → 5 transition). At the
|
|
56
|
+
// 120 s interval, five consecutive skips represents ~10 minutes of
|
|
57
|
+
// tick-budget exhaustion — long enough to indicate genuine health-check
|
|
58
|
+
// wedging (e.g., a persistently hung statRoute or isSessionAliveAdapter)
|
|
59
|
+
// rather than transient slowness. Further skips within the same streak
|
|
60
|
+
// are silent; the next successfully-started tick body resets the counter
|
|
61
|
+
// and re-arms the warning for a future streak.
|
|
62
|
+
if (skippedTicks === 5) {
|
|
63
|
+
console.error('[slack] health-check: tick body in flight; skipped 5 consecutive ticks — investigate budget exhaustion')
|
|
64
|
+
}
|
|
65
|
+
return
|
|
66
|
+
}
|
|
67
|
+
tickInFlight = true
|
|
68
|
+
skippedTicks = 0
|
|
69
|
+
try {
|
|
70
|
+
const routes = deps.getRoutes()
|
|
71
|
+
|
|
72
|
+
for (const [channelId, cwd] of Object.entries(routes)) {
|
|
73
|
+
try {
|
|
74
|
+
if (deps.isRestartPendingOrActive(channelId)) continue
|
|
75
|
+
|
|
76
|
+
if (await deps.statRoute(cwd)) {
|
|
77
|
+
clearOutageFlag(channelId, 'cwd-unreachable')
|
|
78
|
+
} else {
|
|
79
|
+
setOutageFlag(channelId, 'cwd-unreachable', cwd)
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
const alive = await deps.isSessionAlive(channelId)
|
|
83
|
+
if (!alive) {
|
|
84
|
+
deps.scheduleRestart(channelId, cwd)
|
|
85
|
+
}
|
|
86
|
+
} catch (err) {
|
|
87
|
+
console.error(`[slack] health-check: error checking channel=${channelId}:`, err)
|
|
60
88
|
}
|
|
61
|
-
} catch (err) {
|
|
62
|
-
console.error(`[slack] health-check: error checking channel=${channelId}:`, err)
|
|
63
89
|
}
|
|
90
|
+
} finally {
|
|
91
|
+
tickInFlight = false
|
|
64
92
|
}
|
|
65
93
|
}, intervalSeconds * 1000)
|
|
66
94
|
}
|
|
@@ -86,4 +114,6 @@ export function _resetHealthCheckState(): void {
|
|
|
86
114
|
intervalId = null
|
|
87
115
|
}
|
|
88
116
|
deps = null
|
|
117
|
+
tickInFlight = false
|
|
118
|
+
skippedTicks = 0
|
|
89
119
|
}
|
package/src/outage-state.ts
ADDED
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* outage-state.ts — Channel-scoped outage-flag state machine + Slack emit surface.
|
|
3
|
+
*
|
|
4
|
+
* Tracks three orthogonal outage classes per routed Slack channel and emits
|
|
5
|
+
* onset / all-clear messages via the injected `postToChannel` hook. The module
|
|
6
|
+
* is intentionally free of Date / timestamp logic — operators scroll back to
|
|
7
|
+
* the onset message for timing context.
|
|
8
|
+
*
|
|
9
|
+
* Public API surface (all exported):
|
|
10
|
+
* - initOutageState(deps) — install production dependencies
|
|
11
|
+
* - getOutageFlags(channelId) — read live flag set
|
|
12
|
+
* - setOutageFlag(channelId, cls, detail?) — raise flag + emit onset message
|
|
13
|
+
* - clearOutageFlag(channelId, cls) — lower flag; emits all-clear when set empties
|
|
14
|
+
* - resetAllToHealthy(channelIds) — silent bulk wipe (boot-time reset)
|
|
15
|
+
* - withOutageDetection(ch, cwd, fn) — AD verb wrapper; raises/clears flags on error/success
|
|
16
|
+
* - withSpawnDetection(ch, cwd, fn) — like withOutageDetection + clears cwd-unreachable on success
|
|
17
|
+
* - _resetOutageState() — test-only state reset
|
|
18
|
+
*
|
|
19
|
+
* Template exports (used by tests):
|
|
20
|
+
* - ONSET_TEMPLATES
|
|
21
|
+
* - ALL_CLEAR_TEMPLATE
|
|
22
|
+
*
|
|
23
|
+
* SPDX-License-Identifier: MIT
|
|
24
|
+
*/
|
|
25
|
+
|
|
26
|
+
import type { Client } from 'agent-director'
|
|
27
|
+
import {
|
|
28
|
+
ErrSystemInstallDisappeared,
|
|
29
|
+
ErrTmuxNotAvailable,
|
|
30
|
+
ErrCwdNotFound,
|
|
31
|
+
ErrCwdNotADirectory,
|
|
32
|
+
} from './agent-director-errors.ts'
|
|
33
|
+
|
|
34
|
+
// ---------------------------------------------------------------------------
|
|
35
|
+
// Data types
|
|
36
|
+
// ---------------------------------------------------------------------------
|
|
37
|
+
|
|
38
|
+
/** Union of all outage classifications. No 'healthy' member — absence == healthy. */
|
|
39
|
+
export type OutageClass = 'ad-unreachable' | 'cwd-unreachable' | 'tmux-unavailable'
|
|
40
|
+
|
|
41
|
+
/**
|
|
42
|
+
* Detail record for a single outage class in a bad stretch.
|
|
43
|
+
* No `enteredAtIso` — timestamps are intentionally absent from the all-clear template.
|
|
44
|
+
*/
|
|
45
|
+
export interface ClassRecord {
|
|
46
|
+
detail?: string
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
/** Per-channel state entry. */
|
|
50
|
+
interface ChannelEntry {
|
|
51
|
+
/** Currently active outage flags. */
|
|
52
|
+
flags: Set<OutageClass>
|
|
53
|
+
/** Class → detail record for the current bad stretch. Reset to empty Map on all-clear. */
|
|
54
|
+
badStretchClasses: Map<OutageClass, ClassRecord>
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
/** Dependencies injected via `initOutageState` — wires the module to production Slack + AD. */
|
|
58
|
+
export interface OutageStateDeps {
|
|
59
|
+
/** Fire-and-forget Slack post; errors MUST be handled internally by the caller. */
|
|
60
|
+
postToChannel(channelId: string, text: string): void
|
|
61
|
+
/** Return the singleton AD Client. Same semantics as getClient() in agent-director-client.ts. */
|
|
62
|
+
getClient(): Client
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
// ---------------------------------------------------------------------------
|
|
66
|
+
// Module-scoped state
|
|
67
|
+
// ---------------------------------------------------------------------------
|
|
68
|
+
|
|
69
|
+
let deps: OutageStateDeps | undefined
|
|
70
|
+
const entries = new Map<string, ChannelEntry>()
|
|
71
|
+
|
|
72
|
+
// ---------------------------------------------------------------------------
|
|
73
|
+
// Helpers
|
|
74
|
+
// ---------------------------------------------------------------------------
|
|
75
|
+
|
|
76
|
+
/** Lazy-create a ChannelEntry for `channelId` on first access. */
|
|
77
|
+
function entryFor(channelId: string): ChannelEntry {
|
|
78
|
+
let entry = entries.get(channelId)
|
|
79
|
+
if (!entry) {
|
|
80
|
+
entry = { flags: new Set(), badStretchClasses: new Map() }
|
|
81
|
+
entries.set(channelId, entry)
|
|
82
|
+
}
|
|
83
|
+
return entry
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
// ---------------------------------------------------------------------------
|
|
87
|
+
// Slack message templates
|
|
88
|
+
// ---------------------------------------------------------------------------
|
|
89
|
+
|
|
90
|
+
/** Stable iteration order for the all-clear template. */
|
|
91
|
+
const STABLE_CLASS_ORDER: OutageClass[] = [
|
|
92
|
+
'ad-unreachable',
|
|
93
|
+
'cwd-unreachable',
|
|
94
|
+
'tmux-unavailable',
|
|
95
|
+
]
|
|
96
|
+
|
|
97
|
+
/**
|
|
98
|
+
* ONSET_TEMPLATES — one template function per outage class.
|
|
99
|
+
* The optional `detail` parameter carries class-specific context
|
|
100
|
+
* (binary path for ad-unreachable; cwd path for cwd-unreachable).
|
|
101
|
+
*/
|
|
102
|
+
export const ONSET_TEMPLATES: Record<OutageClass, (detail?: string) => string> = {
|
|
103
|
+
'ad-unreachable': (binaryPath?: string) =>
|
|
104
|
+
`:rotating_light: *agent-director unreachable* — affects every routed channel.\nBinary: \`${binaryPath ?? '<unknown>'}\`\nRemediation: reinstall agent-director.`,
|
|
105
|
+
|
|
106
|
+
'tmux-unavailable': (_detail?: string) =>
|
|
107
|
+
`:rotating_light: *tmux unavailable* — affects every routed channel.\nRemediation: install or repair tmux.`,
|
|
108
|
+
|
|
109
|
+
'cwd-unreachable': (cwd?: string) =>
|
|
110
|
+
`:rotating_light: *Route cwd unreachable* — \`${cwd ?? '<unknown>'}\`\nRemediation: restore the directory or remove this route from \`config.json\`.`,
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
/**
|
|
114
|
+
* ALL_CLEAR_TEMPLATE — renders a Slack all-clear message from the bad-stretch
|
|
115
|
+
* history snapshot. Entries are emitted in the stable class order regardless
|
|
116
|
+
* of the order flags were raised. No timestamps.
|
|
117
|
+
*/
|
|
118
|
+
export function ALL_CLEAR_TEMPLATE(resolved: Map<OutageClass, ClassRecord>): string {
|
|
119
|
+
const parts: string[] = []
|
|
120
|
+
for (const cls of STABLE_CLASS_ORDER) {
|
|
121
|
+
const rec = resolved.get(cls)
|
|
122
|
+
if (rec === undefined) continue
|
|
123
|
+
const detailSuffix = rec.detail !== undefined ? ` (\`${rec.detail}\`)` : ''
|
|
124
|
+
parts.push(`\`${cls}\`${detailSuffix}`)
|
|
125
|
+
}
|
|
126
|
+
return `:white_check_mark: *All clear.* Resolved: ${parts.join(', ')}.`
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
// ---------------------------------------------------------------------------
|
|
130
|
+
// Public API — init + accessors
|
|
131
|
+
// ---------------------------------------------------------------------------
|
|
132
|
+
|
|
133
|
+
/**
|
|
134
|
+
* initOutageState — installs production dependencies. Called once from
|
|
135
|
+
* `src/server.ts:main()` after `routingConfig` is loaded, before the
|
|
136
|
+
* Socket Mode connect block.
|
|
137
|
+
*/
|
|
138
|
+
export function initOutageState(d: OutageStateDeps): void {
|
|
139
|
+
deps = d
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
/**
|
|
143
|
+
* getOutageFlags — returns the live read-only flag set for `channelId`.
|
|
144
|
+
* Returns an empty `ReadonlySet` sentinel when no entry exists yet.
|
|
145
|
+
*/
|
|
146
|
+
export function getOutageFlags(channelId: string): ReadonlySet<OutageClass> {
|
|
147
|
+
return entries.get(channelId)?.flags ?? (new Set<OutageClass>() as ReadonlySet<OutageClass>)
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
// ---------------------------------------------------------------------------
|
|
151
|
+
// Public API — mutators
|
|
152
|
+
// ---------------------------------------------------------------------------
|
|
153
|
+
|
|
154
|
+
/**
|
|
155
|
+
* setOutageFlag — raises `cls` for `channelId` and emits an onset Slack
|
|
156
|
+
* message. Same-flag re-raise is a silent no-op (dedupe).
|
|
157
|
+
*
|
|
158
|
+
* State mutates BEFORE the emit so a synchronous throw in `postToChannel`
|
|
159
|
+
* cannot cause double-emission on the next observation.
|
|
160
|
+
*/
|
|
161
|
+
export function setOutageFlag(channelId: string, cls: OutageClass, detail?: string): void {
|
|
162
|
+
if (!deps) return
|
|
163
|
+
const entry = entryFor(channelId)
|
|
164
|
+
if (entry.flags.has(cls)) return // same-flag dedupe
|
|
165
|
+
// Mutate state BEFORE emit (SR-V-2.x state-before-emit contract).
|
|
166
|
+
entry.flags.add(cls)
|
|
167
|
+
entry.badStretchClasses.set(cls, { detail })
|
|
168
|
+
deps.postToChannel(channelId, ONSET_TEMPLATES[cls](detail))
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
/**
|
|
172
|
+
* clearOutageFlag — lowers `cls` for `channelId`. Emits the all-clear Slack
|
|
173
|
+
* message ONLY when the clear leaves the flag set empty and there is a
|
|
174
|
+
* non-empty bad-stretch history (i.e., at least one onset was recorded).
|
|
175
|
+
* Intermediate clears (flag set still non-empty after removal) are silent.
|
|
176
|
+
*
|
|
177
|
+
* State mutates BEFORE the emit (same contract as setOutageFlag).
|
|
178
|
+
*/
|
|
179
|
+
export function clearOutageFlag(channelId: string, cls: OutageClass): void {
|
|
180
|
+
if (!deps) return
|
|
181
|
+
const entry = entries.get(channelId)
|
|
182
|
+
if (!entry) return
|
|
183
|
+
if (!entry.flags.has(cls)) return // same-state dedupe
|
|
184
|
+
// Mutate state BEFORE emit.
|
|
185
|
+
entry.flags.delete(cls)
|
|
186
|
+
if (entry.flags.size === 0 && entry.badStretchClasses.size > 0) {
|
|
187
|
+
// Snapshot history and reset BEFORE the postToChannel call.
|
|
188
|
+
const snapshot = new Map(entry.badStretchClasses)
|
|
189
|
+
entry.badStretchClasses = new Map()
|
|
190
|
+
deps.postToChannel(channelId, ALL_CLEAR_TEMPLATE(snapshot))
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
/**
|
|
195
|
+
* resetAllToHealthy — silently wipes every channel's flag set and bad-stretch
|
|
196
|
+
* history to a clean slate. No `postToChannel` calls. Used at boot time
|
|
197
|
+
* after `socket.start()` as a defensive boundary for pre-auth observations
|
|
198
|
+
* (boot-time call is added by Epic 2).
|
|
199
|
+
*/
|
|
200
|
+
export function resetAllToHealthy(channelIds: string[]): void {
|
|
201
|
+
for (const channelId of channelIds) {
|
|
202
|
+
entries.set(channelId, { flags: new Set(), badStretchClasses: new Map() })
|
|
203
|
+
}
|
|
204
|
+
}
|
|
205
|
+
|
|
206
|
+
// ---------------------------------------------------------------------------
|
|
207
|
+
// Public API — AD verb wrappers
|
|
208
|
+
// ---------------------------------------------------------------------------
|
|
209
|
+
|
|
210
|
+
/**
|
|
211
|
+
* withOutageDetection — centralized wrapper for AD verb calls that should
|
|
212
|
+
* participate in outage detection.
|
|
213
|
+
*
|
|
214
|
+
* On error:
|
|
215
|
+
* - ErrSystemInstallDisappeared → raises 'ad-unreachable' (detail = binaryPath)
|
|
216
|
+
* - ErrTmuxNotAvailable → raises 'tmux-unavailable'
|
|
217
|
+
* - ErrCwdNotFound / ErrCwdNotADirectory → raises 'cwd-unreachable' (detail = routeCwd)
|
|
218
|
+
* UNLESS routeCwd is undefined, in which case logs loudly and rethrows
|
|
219
|
+
* WITHOUT raising the flag (defensive carve-out for verb-class drift).
|
|
220
|
+
* - Other errors → no flag change; rethrow unchanged.
|
|
221
|
+
*
|
|
222
|
+
* On success: clears 'ad-unreachable' and 'tmux-unavailable', returns result.
|
|
223
|
+
*
|
|
224
|
+
* The original error is always rethrown so callers can handle it normally.
|
|
225
|
+
*/
|
|
226
|
+
export async function withOutageDetection<T>(
|
|
227
|
+
channelId: string,
|
|
228
|
+
routeCwd: string | undefined,
|
|
229
|
+
fn: (client: Client) => Promise<T>,
|
|
230
|
+
): Promise<T> {
|
|
231
|
+
if (!deps) {
|
|
232
|
+
throw new Error(
|
|
233
|
+
'outage-state: withOutageDetection called before initOutageState — caller-site bug',
|
|
234
|
+
)
|
|
235
|
+
}
|
|
236
|
+
try {
|
|
237
|
+
const result = await fn(deps.getClient())
|
|
238
|
+
clearOutageFlag(channelId, 'ad-unreachable')
|
|
239
|
+
clearOutageFlag(channelId, 'tmux-unavailable')
|
|
240
|
+
return result
|
|
241
|
+
} catch (err) {
|
|
242
|
+
if (err instanceof ErrSystemInstallDisappeared) {
|
|
243
|
+
setOutageFlag(channelId, 'ad-unreachable', err.binaryPath)
|
|
244
|
+
} else if (err instanceof ErrTmuxNotAvailable) {
|
|
245
|
+
setOutageFlag(channelId, 'tmux-unavailable')
|
|
246
|
+
} else if (err instanceof ErrCwdNotFound || err instanceof ErrCwdNotADirectory) {
|
|
247
|
+
if (routeCwd !== undefined) {
|
|
248
|
+
setOutageFlag(channelId, 'cwd-unreachable', routeCwd)
|
|
249
|
+
} else {
|
|
250
|
+
console.error(
|
|
251
|
+
`[slack] outage-state: withOutageDetection: cwd error on channel=${channelId} but routeCwd is undefined — verb-class drift; rethrowing without raising flag`,
|
|
252
|
+
err,
|
|
253
|
+
)
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
throw err
|
|
257
|
+
}
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
/**
|
|
261
|
+
* withSpawnDetection — like `withOutageDetection` but also clears
|
|
262
|
+
* 'cwd-unreachable' on success. Spawn and resume verbs are the only calls
|
|
263
|
+
* that actually exercise the route's cwd, so cwd health is only confirmed
|
|
264
|
+
* by a successful spawn/resume.
|
|
265
|
+
*/
|
|
266
|
+
export async function withSpawnDetection<T>(
|
|
267
|
+
channelId: string,
|
|
268
|
+
routeCwd: string | undefined,
|
|
269
|
+
fn: (client: Client) => Promise<T>,
|
|
270
|
+
): Promise<T> {
|
|
271
|
+
const result = await withOutageDetection(channelId, routeCwd, fn)
|
|
272
|
+
clearOutageFlag(channelId, 'cwd-unreachable')
|
|
273
|
+
return result
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
// ---------------------------------------------------------------------------
|
|
277
|
+
// Test-only
|
|
278
|
+
// ---------------------------------------------------------------------------
|
|
279
|
+
|
|
280
|
+
/**
|
|
281
|
+
* _resetOutageState — clears all module-scoped state. For tests only.
|
|
282
|
+
* Production code must not call this.
|
|
283
|
+
*/
|
|
284
|
+
export function _resetOutageState(): void {
|
|
285
|
+
deps = undefined
|
|
286
|
+
entries.clear()
|
|
287
|
+
}
|
package/src/permission-click-handler.ts
CHANGED
|
@@ -16,11 +16,11 @@
|
|
|
16
16
|
* SPDX-License-Identifier: MIT
|
|
17
17
|
*/
|
|
18
18
|
|
|
19
|
-
import { AgentDirectorError, ErrAlreadyDecided } from 'agent-director'
|
|
20
|
-
import type { Client } from 'agent-director'
|
|
19
|
+
import { AgentDirectorError, ErrAlreadyDecided, ErrSystemInstallDisappeared, ErrTmuxNotAvailable } from 'agent-director'
|
|
21
20
|
import type { WebClient } from '@slack/web-api'
|
|
22
21
|
|
|
23
22
|
import { decideWithToken } from './agent-director-client.ts'
|
|
23
|
+
import { withOutageDetection } from './outage-state.ts'
|
|
24
24
|
import { parsePermissionActionId, type PermissionDecision } from './permission-action-id.ts'
|
|
25
25
|
import { getLivePermission, markHandled } from './permission-poller.ts'
|
|
26
26
|
import { emitTrail as defaultEmitTrail } from './permission-trail.ts'
|
|
@@ -32,8 +32,6 @@ import type {
|
|
|
32
32
|
} from './permission-trail.ts'
|
|
33
33
|
|
|
34
34
|
export interface ClickDeps {
|
|
35
|
-
/** Returns an AD Client whose `decide` method this handler will invoke. */
|
|
36
|
-
getClient: () => Pick<Client, 'decide'>
|
|
37
35
|
web: Pick<WebClient, 'chat'>
|
|
38
36
|
log?: (...args: unknown[]) => void
|
|
39
37
|
/**
|
|
@@ -141,6 +139,21 @@ export async function handlePermissionClick(
|
|
|
141
139
|
live_pending: earlyEntry !== undefined,
|
|
142
140
|
})
|
|
143
141
|
|
|
142
|
+
const ctxChannel = context.channel ?? earlyEntry?.channelId
|
|
143
|
+
if (!ctxChannel) {
|
|
144
|
+
logDeps(
|
|
145
|
+
deps,
|
|
146
|
+
`[slack] permission-click-handler: cannot resolve channelId ` +
|
|
147
|
+
`(context.channel=${context.channel ?? 'undefined'}, ` +
|
|
148
|
+
`earlyEntry.channelId=${earlyEntry?.channelId ?? 'undefined'}, ` +
|
|
149
|
+
`claude_instance_id=${claudeInstanceId}, ` +
|
|
150
|
+
`action_id=${actionId}, ` +
|
|
151
|
+
`request_token=${requestToken}) ` +
|
|
152
|
+
`— logging and bypassing this click. This is a CSCB bug — investigate.`,
|
|
153
|
+
)
|
|
154
|
+
return true
|
|
155
|
+
}
|
|
156
|
+
|
|
144
157
|
// SR-4.1, SR-4.2, SR-7.2: AD is the source of truth. Always call decide
|
|
145
158
|
// with the decoded request_token — including for stale clicks whose
|
|
146
159
|
// composite key is no longer in the pending map. This is the click's ONLY
|
|
@@ -152,14 +165,29 @@ export async function handlePermissionClick(
|
|
|
152
165
|
decision,
|
|
153
166
|
}
|
|
154
167
|
try {
|
|
155
|
-
await
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
168
|
+
await withOutageDetection(ctxChannel, undefined, (client) =>
|
|
169
|
+
decideWithToken(client, {
|
|
170
|
+
claude_instance_id: claudeInstanceId,
|
|
171
|
+
decision,
|
|
172
|
+
request_token: requestToken,
|
|
173
|
+
}),
|
|
174
|
+
)
|
|
160
175
|
// SR-V-2.7 call-side success emission.
|
|
161
176
|
emit({ ...decideEnvelope, result_class: 'ok' satisfies AdDecideResponseClass })
|
|
162
177
|
} catch (err) {
|
|
178
|
+
if (err instanceof ErrSystemInstallDisappeared || err instanceof ErrTmuxNotAvailable) {
|
|
179
|
+
// SR-V-2.7 ad/tmux carve-out: the wrapper already raised the outage
|
|
180
|
+
// flag (one Slack onset alert via the state machine). Forensic
|
|
181
|
+
// requirement: the trail JSONL must still carry the typed error name
|
|
182
|
+
// and message so post-incident debugging can correlate the click
|
|
183
|
+
// attempt with the AD-down cause — the loud Slack alert tells the
|
|
184
|
+
// operator something is broken; the trail entry tells the engineer
|
|
185
|
+
// what was actually thrown.
|
|
186
|
+
const result_class: AdDecideResponseClass = err.errName as AdDecideResponseClass
|
|
187
|
+
const raw_error_message = err.message
|
|
188
|
+
emit({ ...decideEnvelope, result_class, raw_error_message })
|
|
189
|
+
return true
|
|
190
|
+
}
|
|
163
191
|
// SR-V-2.7 call-side failure emission. Classify against the same AD
|
|
164
192
|
// error names the existing branches discriminate on so the trail's
|
|
165
193
|
// result_class stays consistent with src/agent-director-errors.ts. The
|
package/src/permission-poller.ts
CHANGED
|
@@ -35,6 +35,8 @@
|
|
|
35
35
|
import {
|
|
36
36
|
AgentDirectorError,
|
|
37
37
|
ErrSpawnNotFound,
|
|
38
|
+
ErrSystemInstallDisappeared,
|
|
39
|
+
ErrTmuxNotAvailable,
|
|
38
40
|
} from 'agent-director'
|
|
39
41
|
import type {
|
|
40
42
|
GetResult as ADGetResult,
|
|
@@ -50,6 +52,7 @@ import type {
|
|
|
50
52
|
GetPermissionParams,
|
|
51
53
|
GetPermissionResult,
|
|
52
54
|
} from './agent-director-client.ts'
|
|
55
|
+
import { withOutageDetection } from './outage-state.ts'
|
|
53
56
|
import { encodePermissionActionId } from './permission-action-id.ts'
|
|
54
57
|
import { emitTrail as defaultEmitTrail } from './permission-trail.ts'
|
|
55
58
|
import type {
|
|
@@ -303,7 +306,12 @@ function classifySlackError(err: unknown): string {
|
|
|
303
306
|
async function runTick(deps: PollerDeps): Promise<void> {
|
|
304
307
|
if (tickInFlight) {
|
|
305
308
|
skippedTicks++
|
|
306
|
-
|
|
309
|
+
// Fire exactly once when the streak crosses 5 (4 → 5 transition). Further
|
|
310
|
+
// skips within the same streak are silent; the next successfully-started
|
|
311
|
+
// tick body resets skippedTicks to 0 and re-arms the warning for a future
|
|
312
|
+
// streak. Mirrors src/health-check.ts:55-62 — see that block for the
|
|
313
|
+
// budget-exhaustion rationale.
|
|
314
|
+
if (skippedTicks === 5) {
|
|
307
315
|
logViaDeps(deps, `[slack] permission-poller: skipped ${skippedTicks} consecutive ticks — tick budget exceeded`)
|
|
308
316
|
}
|
|
309
317
|
return
|
|
@@ -324,13 +332,23 @@ async function runTick(deps: PollerDeps): Promise<void> {
|
|
|
324
332
|
const seenComposite = new Set<string>()
|
|
325
333
|
const nonConformingInstanceIds = new Set<string>()
|
|
326
334
|
for (const row of rows) {
|
|
335
|
+
const rowChannelId = row.labels['channel']
|
|
336
|
+
if (!rowChannelId) {
|
|
337
|
+
logViaDeps(deps, `[slack] permission-poller: spawn ${row.claude_instance_id} has no channel label — skipping`)
|
|
338
|
+
continue
|
|
339
|
+
}
|
|
340
|
+
|
|
327
341
|
let got: GetResultWithPermissionRequests
|
|
328
342
|
try {
|
|
329
|
-
got = (await
|
|
330
|
-
claude_instance_id: row.claude_instance_id
|
|
331
|
-
|
|
343
|
+
got = (await withOutageDetection(rowChannelId, undefined, () =>
|
|
344
|
+
client.get({ claude_instance_id: row.claude_instance_id })
|
|
345
|
+
)) as unknown as GetResultWithPermissionRequests
|
|
332
346
|
} catch (err) {
|
|
333
347
|
if (err instanceof ErrSpawnNotFound) continue
|
|
348
|
+
if (err instanceof ErrSystemInstallDisappeared || err instanceof ErrTmuxNotAvailable) {
|
|
349
|
+
// Outage flag raised; skip per-event log.
|
|
350
|
+
continue
|
|
351
|
+
}
|
|
334
352
|
const e = err instanceof AgentDirectorError ? err : null
|
|
335
353
|
logViaDeps(deps, `[slack] permission-poller: get failed for ${row.claude_instance_id}: ${e?.errName ?? String(err)}`)
|
|
336
354
|
continue
|
|
@@ -368,8 +386,14 @@ async function runTick(deps: PollerDeps): Promise<void> {
|
|
|
368
386
|
for (const entry of closedEntries) {
|
|
369
387
|
let info: GetPermissionResult
|
|
370
388
|
try {
|
|
371
|
-
info = await
|
|
389
|
+
info = await withOutageDetection(entry.channelId, undefined, () =>
|
|
390
|
+
getPermission(client, { request_token: entry.requestToken })
|
|
391
|
+
)
|
|
372
392
|
} catch (err) {
|
|
393
|
+
if (err instanceof ErrSystemInstallDisappeared || err instanceof ErrTmuxNotAvailable) {
|
|
394
|
+
// Outage flag raised; skip per-event log.
|
|
395
|
+
continue
|
|
396
|
+
}
|
|
373
397
|
if (isErrPermissionRequestNotFound(err)) {
|
|
374
398
|
logViaDeps(deps, `[slack] permission-poller: get-permission not-found for ${entry.claudeInstanceId} token=${entry.requestToken} — generic deny + drop`)
|
|
375
399
|
emitRowDecision(deps, 'not_found_generic_deny', entry.claudeInstanceId, entry.requestToken)
|
package/src/restart.ts
CHANGED
|
@@ -22,17 +22,10 @@ export interface RestartDeps {
|
|
|
22
22
|
isShuttingDown(): boolean
|
|
23
23
|
}
|
|
24
24
|
|
|
25
|
-
// ---------------------------------------------------------------------------
|
|
26
|
-
// Constants
|
|
27
|
-
// ---------------------------------------------------------------------------
|
|
28
|
-
|
|
29
|
-
export const MAX_CONSECUTIVE_FAILURES = 3
|
|
30
|
-
|
|
31
25
|
// ---------------------------------------------------------------------------
|
|
32
26
|
// Module-scoped state
|
|
33
27
|
// ---------------------------------------------------------------------------
|
|
34
28
|
|
|
35
|
-
const failureCounters = new Map<string, number>()
|
|
36
29
|
const pendingRestartTimers = new Map<string, ReturnType<typeof setTimeout>>()
|
|
37
30
|
const activeLaunches = new Set<string>()
|
|
38
31
|
let deps: RestartDeps | null = null
|
|
@@ -61,14 +54,6 @@ export function scheduleRestart(channelId: string, cwd: string, sessionId?: stri
|
|
|
61
54
|
return
|
|
62
55
|
}
|
|
63
56
|
|
|
64
|
-
const failures = failureCounters.get(channelId) ?? 0
|
|
65
|
-
if (failures >= MAX_CONSECUTIVE_FAILURES) {
|
|
66
|
-
console.error(
|
|
67
|
-
`[slack] Max consecutive failures (${MAX_CONSECUTIVE_FAILURES}) reached — giving up on channel=${channelId} cwd="${cwd}"`,
|
|
68
|
-
)
|
|
69
|
-
return
|
|
70
|
-
}
|
|
71
|
-
|
|
72
57
|
// Cancel any existing timer for this channel
|
|
73
58
|
const existing = pendingRestartTimers.get(channelId)
|
|
74
59
|
if (existing !== undefined) {
|
|
@@ -130,11 +115,7 @@ export function scheduleRestart(channelId: string, cwd: string, sessionId?: stri
|
|
|
130
115
|
}
|
|
131
116
|
|
|
132
117
|
if (!ok) {
|
|
133
|
-
|
|
134
|
-
failureCounters.set(channelId, count)
|
|
135
|
-
console.error(
|
|
136
|
-
`[slack] Session relaunch failed for channel=${channelId} (failure ${count}/${MAX_CONSECUTIVE_FAILURES})`,
|
|
137
|
-
)
|
|
118
|
+
console.error(`[slack] Session relaunch failed for channel=${channelId}`)
|
|
138
119
|
}
|
|
139
120
|
} finally {
|
|
140
121
|
activeLaunches.delete(channelId)
|
|
@@ -144,14 +125,6 @@ export function scheduleRestart(channelId: string, cwd: string, sessionId?: stri
|
|
|
144
125
|
pendingRestartTimers.set(channelId, timer)
|
|
145
126
|
}
|
|
146
127
|
|
|
147
|
-
// ---------------------------------------------------------------------------
|
|
148
|
-
// resetFailureCounter
|
|
149
|
-
// ---------------------------------------------------------------------------
|
|
150
|
-
|
|
151
|
-
export function resetFailureCounter(channelId: string): void {
|
|
152
|
-
failureCounters.set(channelId, 0)
|
|
153
|
-
}
|
|
154
|
-
|
|
155
128
|
// ---------------------------------------------------------------------------
|
|
156
129
|
// cancelAllRestartTimers
|
|
157
130
|
// ---------------------------------------------------------------------------
|
|
@@ -165,17 +138,13 @@ export function cancelAllRestartTimers(): void {
|
|
|
165
138
|
}
|
|
166
139
|
|
|
167
140
|
// ---------------------------------------------------------------------------
|
|
168
|
-
// isRestartPendingOrActive
|
|
141
|
+
// isRestartPendingOrActive — query function
|
|
169
142
|
// ---------------------------------------------------------------------------
|
|
170
143
|
|
|
171
144
|
export function isRestartPendingOrActive(channelId: string): boolean {
|
|
172
145
|
return pendingRestartTimers.has(channelId) || activeLaunches.has(channelId)
|
|
173
146
|
}
|
|
174
147
|
|
|
175
|
-
export function hasReachedMaxFailures(channelId: string): boolean {
|
|
176
|
-
return (failureCounters.get(channelId) ?? 0) >= MAX_CONSECUTIVE_FAILURES
|
|
177
|
-
}
|
|
178
|
-
|
|
179
148
|
// ---------------------------------------------------------------------------
|
|
180
149
|
// _resetRestartState — exported for test cleanup
|
|
181
150
|
// ---------------------------------------------------------------------------
|
|
@@ -184,7 +153,6 @@ export function _resetRestartState(): void {
|
|
|
184
153
|
for (const timer of pendingRestartTimers.values()) {
|
|
185
154
|
clearTimeout(timer)
|
|
186
155
|
}
|
|
187
|
-
failureCounters.clear()
|
|
188
156
|
pendingRestartTimers.clear()
|
|
189
157
|
activeLaunches.clear()
|
|
190
158
|
deps = null
|
package/src/server.ts
CHANGED
|
@@ -27,6 +27,7 @@ import {
|
|
|
27
27
|
chmodSync,
|
|
28
28
|
existsSync,
|
|
29
29
|
renameSync,
|
|
30
|
+
promises as fsPromises,
|
|
30
31
|
} from 'fs'
|
|
31
32
|
|
|
32
33
|
import {
|
|
@@ -54,6 +55,7 @@ import {
|
|
|
54
55
|
} from './session-manager.ts'
|
|
55
56
|
import { cleanSession, getCozempicAvailable } from './cozempic.ts'
|
|
56
57
|
import { ErrSpawnNotFound } from 'agent-director'
|
|
58
|
+
import { ErrSystemInstallDisappeared, ErrTmuxNotAvailable } from './agent-director-errors.ts'
|
|
57
59
|
import { getClient, closeClient } from './agent-director-client.ts'
|
|
58
60
|
import {
|
|
59
61
|
emitBlockActionReceived,
|
|
@@ -64,10 +66,8 @@ import { startPermissionPoller, stopPermissionPoller } from './permission-poller
|
|
|
64
66
|
import {
|
|
65
67
|
initRestart,
|
|
66
68
|
scheduleRestart,
|
|
67
|
-
resetFailureCounter,
|
|
68
69
|
cancelAllRestartTimers,
|
|
69
70
|
isRestartPendingOrActive,
|
|
70
|
-
hasReachedMaxFailures,
|
|
71
71
|
} from './restart.ts'
|
|
72
72
|
import { initHealthCheck, startHealthCheck, stopHealthCheck } from './health-check.ts'
|
|
73
73
|
import { loadTokens, isDryRun } from './tokens.ts'
|
|
@@ -99,6 +99,7 @@ import {
|
|
|
99
99
|
} from './registry.ts'
|
|
100
100
|
import { runAgentDirectorStartupGate } from './agent-director-startup.ts'
|
|
101
101
|
import { installSlackChannelBotTemplate } from './agent-director-template.ts'
|
|
102
|
+
import { initOutageState, setOutageFlag, clearOutageFlag, resetAllToHealthy, withOutageDetection } from './outage-state.ts'
|
|
102
103
|
|
|
103
104
|
// Re-export constants so they stay in one place (lib.ts)
|
|
104
105
|
export { MAX_PENDING, MAX_PAIRING_REPLIES, PAIRING_EXPIRY_MS } from './lib.ts'
|
|
@@ -521,9 +522,6 @@ async function handleInitialized(
|
|
|
521
522
|
console.error(`[slack] Session replaced existing connection for CWD "${normalizedCwd}"`)
|
|
522
523
|
}
|
|
523
524
|
console.error(`[slack] Session connected: channel=${matchedChannelId} cwd="${normalizedCwd}"`)
|
|
524
|
-
|
|
525
|
-
// Reset failure counter — session reconnected successfully
|
|
526
|
-
resetFailureCounter(matchedChannelId)
|
|
527
525
|
}
|
|
528
526
|
|
|
529
527
|
// ---------------------------------------------------------------------------
|
|
@@ -756,7 +754,7 @@ socket.on('interactive', async (evt) => {
|
|
|
756
754
|
|
|
757
755
|
const handled = await handlePermissionClick(
|
|
758
756
|
actionId,
|
|
759
|
-
{
|
|
757
|
+
{ web },
|
|
760
758
|
{ channel: channelId, messageTs, user: userId },
|
|
761
759
|
)
|
|
762
760
|
if (handled) {
|
|
@@ -838,6 +836,97 @@ async function shutdown(signal: string): Promise<void> {
|
|
|
838
836
|
process.on('SIGTERM', () => { shutdown('SIGTERM').catch(() => process.exit(1)) })
|
|
839
837
|
process.on('SIGINT', () => { shutdown('SIGINT').catch(() => process.exit(1)) })
|
|
840
838
|
|
|
839
|
+
// ---------------------------------------------------------------------------
|
|
840
|
+
// _buildIsSessionAliveAdapter
|
|
841
|
+
// ---------------------------------------------------------------------------
|
|
842
|
+
|
|
843
|
+
/**
 * _buildIsSessionAliveAdapter — builds the per-channel liveness probe.
 * main() wires the result in as `isSessionAliveAdapter`; the factory is
 * exported so tests can drive each probe branch directly instead of reaching
 * into a closure private to main().
 *
 * Probe outcome for a channel:
 *   - no routing config, or no route for the channel → `false` (status is
 *     never queried)
 *   - status call succeeds → both outage flags are cleared; the result is
 *     `true` only when the reported state is in AGENT_DIRECTOR_LIVE_STATES
 *   - ErrSpawnNotFound → both outage flags are cleared; `false`
 *   - ErrSystemInstallDisappeared → 'ad-unreachable' is raised with the
 *     error's `binaryPath`; `false`
 *   - ErrTmuxNotAvailable → 'tmux-unavailable' is raised; `false`
 *   - any other error → logged; `false`
 *
 * @param getRoutingConfig - Invoked on every probe, so the probe always sees
 *                           the config current at call time.
 * @returns Async predicate mapping a channel id to "session is alive".
 *
 * @internal
 */
export function _buildIsSessionAliveAdapter(
  getRoutingConfig: () => RoutingConfig | null | undefined,
): (channelId: string) => Promise<boolean> {
  // Lowers both outage flags for a channel in one step.
  const lowerOutageFlags = (channelId: string): void => {
    clearOutageFlag(channelId, 'ad-unreachable')
    clearOutageFlag(channelId, 'tmux-unavailable')
  }

  return async (channelId: string) => {
    const route = getRoutingConfig()?.routes[channelId]
    if (!route) {
      return false
    }

    const instanceId = instanceIdFor(channelId, route.normalizedName)
    try {
      const status = await getClient().status({ claude_instance_id: instanceId })
      // The status call returned, so neither outage condition applies.
      lowerOutageFlags(channelId)
      return AGENT_DIRECTOR_LIVE_STATES.has(status.state)
    } catch (err) {
      if (err instanceof ErrSpawnNotFound) {
        // Handled like a successful reply for outage purposes: flags are
        // cleared, but the session itself is reported dead.
        lowerOutageFlags(channelId)
      } else if (err instanceof ErrSystemInstallDisappeared) {
        setOutageFlag(channelId, 'ad-unreachable', err.binaryPath)
      } else if (err instanceof ErrTmuxNotAvailable) {
        setOutageFlag(channelId, 'tmux-unavailable')
      } else {
        console.error(`[slack] isSessionAlive: status error for channel=${channelId}:`, err)
      }
      // Every error path reports "not alive".
      return false
    }
  }
}
|
|
882
|
+
|
|
883
|
+
// ---------------------------------------------------------------------------
|
|
884
|
+
// _buildStatRouteImpl
|
|
885
|
+
// ---------------------------------------------------------------------------
|
|
886
|
+
|
|
887
|
+
/**
 * _buildStatRouteImpl — factory for the health-check tick's `statRoute`
 * dependency: an async predicate answering "is this cwd a directory that
 * can be stat'ed within the time budget?".
 *
 * Production code wires this via main()'s initHealthCheck call with no deps;
 * tests inject `stat` / `setTimeout` / `clearTimeout` to exercise the timeout
 * budget and the stat error swallowing against the REAL factory (not a
 * replica).
 *
 * Behaviour of the returned predicate:
 *   - `stat` resolves → result of `isDirectory()`
 *   - `stat` rejects → the error is logged and the result is `false`
 *   - the timer fires first → a timeout is logged and the result is `false`
 * The timer handle is always handed to `clearTimeout` before returning.
 *
 * @param deps - Optional overrides.
 *   - `stat`: replaces `fsPromises.stat`.
 *   - `setTimeout` / `clearTimeout`: replace the global timer functions.
 *   - `timeoutMs`: stat budget in milliseconds; defaults to 5000.
 * @returns Async predicate mapping a cwd path to "reachable directory".
 *
 * @internal
 */
export function _buildStatRouteImpl(deps?: {
  stat?: (path: string) => Promise<{ isDirectory(): boolean }>
  setTimeout?: (fn: () => void, ms: number) => ReturnType<typeof setTimeout>
  clearTimeout?: (handle: ReturnType<typeof setTimeout> | undefined) => void
  timeoutMs?: number
}): (cwd: string) => Promise<boolean> {
  const DEFAULT_STAT_TIMEOUT_MS = 5_000

  const stat = deps?.stat ?? ((p: string) => fsPromises.stat(p) as Promise<{ isDirectory(): boolean }>)
  const setT = deps?.setTimeout ?? ((fn: () => void, ms: number) => setTimeout(fn, ms))
  const clearT = deps?.clearTimeout ?? ((h: ReturnType<typeof setTimeout> | undefined) => clearTimeout(h))
  const statTimeoutMs = deps?.timeoutMs ?? DEFAULT_STAT_TIMEOUT_MS

  return async (cwd: string) => {
    // Wrapped so a stat rejection (or a synchronous throw from an injected
    // stat) becomes `false` instead of rejecting the race.
    const statPromise = (async () => {
      try {
        const st = await stat(cwd)
        return st.isDirectory()
      } catch (err) {
        console.error(`[slack] health-check: statRoute(${cwd}) failed:`, err)
        return false
      }
    })()

    let timeoutHandle: ReturnType<typeof setTimeout> | undefined
    const timeoutPromise = new Promise<boolean>((resolve) => {
      timeoutHandle = setT(() => {
        console.error(`[slack] health-check: statRoute(${cwd}) timed out after ${statTimeoutMs}ms — treating as unreachable`)
        resolve(false)
      }, statTimeoutMs)
    })

    try {
      // Whichever settles first decides; a stat that outlives the budget is
      // left running and its eventual result is ignored.
      return await Promise.race([statPromise, timeoutPromise])
    } finally {
      // Runs on every path so a still-pending timer is cancelled.
      clearT(timeoutHandle)
    }
  }
}
|
|
929
|
+
|
|
841
930
|
// ---------------------------------------------------------------------------
|
|
842
931
|
// Main
|
|
843
932
|
//
|
|
@@ -910,6 +999,15 @@ export async function main(): Promise<void> {
|
|
|
910
999
|
}
|
|
911
1000
|
}
|
|
912
1001
|
|
|
1002
|
+
initOutageState({
|
|
1003
|
+
postToChannel: (channelId, text) => {
|
|
1004
|
+
web.chat.postMessage({ channel: channelId, text }).catch((err) => {
|
|
1005
|
+
console.error(`[slack] outage-state: postMessage failed for channel=${channelId}:`, err)
|
|
1006
|
+
})
|
|
1007
|
+
},
|
|
1008
|
+
getClient,
|
|
1009
|
+
})
|
|
1010
|
+
|
|
913
1011
|
if (isDryRun()) {
|
|
914
1012
|
console.error('[slack] Running in dry-run mode — Slack disabled')
|
|
915
1013
|
botUserId = 'U000DRY'
|
|
@@ -926,6 +1024,10 @@ export async function main(): Promise<void> {
|
|
|
926
1024
|
await socket.start()
|
|
927
1025
|
console.error('[slack] Socket Mode connected')
|
|
928
1026
|
|
|
1027
|
+
if (routingConfig) {
|
|
1028
|
+
resetAllToHealthy(Object.keys(routingConfig.routes))
|
|
1029
|
+
}
|
|
1030
|
+
|
|
929
1031
|
// SR-2.1 permission poller — single-threaded interval loop monitors AD
|
|
930
1032
|
// state for spawns in check_permission and posts Block Kit prompts.
|
|
931
1033
|
if (routingConfig) {
|
|
@@ -1127,18 +1229,7 @@ export async function main(): Promise<void> {
|
|
|
1127
1229
|
// SR-11 Event 6a. Any AGENT_DIRECTOR_LIVE_STATES value → alive; terminal
|
|
1128
1230
|
// states (ended, missing) and ErrSpawnNotFound → dead. Other errors fall
|
|
1129
1231
|
// back to "dead" defensively — health-check will retry.
|
|
1130
|
-
const isSessionAliveAdapter =
|
|
1131
|
-
if (!routingConfig?.routes[channelId]) return false
|
|
1132
|
-
const claude_instance_id = instanceIdFor(channelId, routingConfig.routes[channelId]?.normalizedName)
|
|
1133
|
-
try {
|
|
1134
|
-
const r = await getClient().status({ claude_instance_id })
|
|
1135
|
-
return AGENT_DIRECTOR_LIVE_STATES.has(r.state)
|
|
1136
|
-
} catch (err) {
|
|
1137
|
-
if (err instanceof ErrSpawnNotFound) return false
|
|
1138
|
-
console.error(`[slack] isSessionAlive: status error for channel=${channelId}:`, err)
|
|
1139
|
-
return false
|
|
1140
|
-
}
|
|
1141
|
-
}
|
|
1232
|
+
const isSessionAliveAdapter = _buildIsSessionAliveAdapter(() => routingConfig)
|
|
1142
1233
|
|
|
1143
1234
|
// Initialize restart module with library-backed adapters
|
|
1144
1235
|
initRestart({
|
|
@@ -1153,9 +1244,12 @@ export async function main(): Promise<void> {
|
|
|
1153
1244
|
killSession: async (channelId) => {
|
|
1154
1245
|
try {
|
|
1155
1246
|
const normalizedName = routingConfig?.routes[channelId]?.normalizedName
|
|
1156
|
-
await
|
|
1247
|
+
await withOutageDetection(channelId, undefined, (client) =>
|
|
1248
|
+
client.kill({ claude_instance_id: instanceIdFor(channelId, normalizedName) })
|
|
1249
|
+
)
|
|
1157
1250
|
} catch (err) {
|
|
1158
1251
|
if (err instanceof ErrSpawnNotFound) return
|
|
1252
|
+
if (err instanceof ErrSystemInstallDisappeared || err instanceof ErrTmuxNotAvailable) return
|
|
1159
1253
|
console.error(`[slack] killSession (restart adapter): error for channel=${channelId}:`, err)
|
|
1160
1254
|
}
|
|
1161
1255
|
},
|
|
@@ -1226,7 +1320,7 @@ export async function main(): Promise<void> {
|
|
|
1226
1320
|
initHealthCheck({
|
|
1227
1321
|
isSessionAlive: isSessionAliveAdapter,
|
|
1228
1322
|
isRestartPendingOrActive,
|
|
1229
|
-
|
|
1323
|
+
statRoute: _buildStatRouteImpl(),
|
|
1230
1324
|
scheduleRestart,
|
|
1231
1325
|
isShuttingDown: () => shuttingDown,
|
|
1232
1326
|
getRoutes: () => {
|
package/src/session-manager.ts
CHANGED
|
@@ -38,6 +38,13 @@ import type { WebClient } from '@slack/web-api'
|
|
|
38
38
|
import { checkCozempicAvailable } from './cozempic.ts'
|
|
39
39
|
import { type RoutingConfig, MCP_SERVER_NAME, normalizeChannelName } from './config.ts'
|
|
40
40
|
import { getClient } from './agent-director-client.ts'
|
|
41
|
+
import { withOutageDetection, withSpawnDetection } from './outage-state.ts'
|
|
42
|
+
import {
|
|
43
|
+
ErrSystemInstallDisappeared,
|
|
44
|
+
ErrTmuxNotAvailable,
|
|
45
|
+
ErrCwdNotFound,
|
|
46
|
+
ErrCwdNotADirectory,
|
|
47
|
+
} from './agent-director-errors.ts'
|
|
41
48
|
import { recordStartupError } from './startup-errors.ts'
|
|
42
49
|
import { isDryRun } from './tokens.ts'
|
|
43
50
|
|
|
@@ -184,12 +191,13 @@ export async function reconnectMcp(
|
|
|
184
191
|
const claude_instance_id = instanceIdFor(channelId, routingConfig?.routes[channelId]?.normalizedName)
|
|
185
192
|
console.error(`[slack] reconnecting MCP server "${MCP_SERVER_NAME}": channel=${channelId}`)
|
|
186
193
|
try {
|
|
187
|
-
await
|
|
194
|
+
await withOutageDetection(channelId, undefined, (client) => client.sendKeys({
|
|
188
195
|
claude_instance_id,
|
|
189
196
|
text: `/mcp reconnect ${MCP_SERVER_NAME}`,
|
|
190
|
-
})
|
|
197
|
+
}))
|
|
191
198
|
return true
|
|
192
199
|
} catch (err) {
|
|
200
|
+
if (err instanceof ErrSystemInstallDisappeared || err instanceof ErrTmuxNotAvailable) return false
|
|
193
201
|
const e = err instanceof AgentDirectorError ? err : new AgentDirectorError('send-keys', 'UnknownError', String(err))
|
|
194
202
|
console.error(`[slack] reconnectMcp: send-keys failed for channel=${channelId}: ${e.errName}`)
|
|
195
203
|
postSpawnFailureToChannel(channelId, e, web)
|
|
@@ -263,20 +271,23 @@ export async function approveDevChannelsDialog(
|
|
|
263
271
|
): Promise<void> {
|
|
264
272
|
void web
|
|
265
273
|
const claude_instance_id = instanceIdFor(channelId, normalizedName)
|
|
266
|
-
const client = getClient()
|
|
267
274
|
const deadline = Date.now() + _dialogPollTimeoutMs
|
|
268
275
|
|
|
269
276
|
let approved = false
|
|
270
277
|
while (Date.now() < deadline) {
|
|
271
278
|
try {
|
|
272
|
-
const { pane } = await client.readPane({ claude_instance_id, n_lines: 40, allow_pending: true })
|
|
279
|
+
const { pane } = await withOutageDetection(channelId, undefined, (client) => client.readPane({ claude_instance_id, n_lines: 40, allow_pending: true }))
|
|
273
280
|
if (pane.includes(DEV_CHANNELS_DIALOG_NEEDLE)) {
|
|
274
|
-
await client.sendKeys({ claude_instance_id, text: '', allow_pending: true })
|
|
281
|
+
await withOutageDetection(channelId, undefined, (client) => client.sendKeys({ claude_instance_id, text: '', allow_pending: true }))
|
|
275
282
|
approved = true
|
|
276
283
|
break
|
|
277
284
|
}
|
|
278
285
|
} catch (err) {
|
|
279
|
-
|
|
286
|
+
if (err instanceof ErrSystemInstallDisappeared || err instanceof ErrTmuxNotAvailable) {
|
|
287
|
+
// Outage flag raised; continue polling
|
|
288
|
+
} else {
|
|
289
|
+
console.error(`[slack] approveDevChannelsDialog: readPane error channel=${channelId}: ${String(err)}`)
|
|
290
|
+
}
|
|
280
291
|
}
|
|
281
292
|
await new Promise((r) => setTimeout(r, _dialogPollIntervalMs))
|
|
282
293
|
}
|
|
@@ -292,7 +303,7 @@ export async function approveDevChannelsDialog(
|
|
|
292
303
|
while (Date.now() < deadline && misses < DIALOG_GONE_CONFIRMS_REQUIRED) {
|
|
293
304
|
await new Promise((r) => setTimeout(r, _dialogPollIntervalMs))
|
|
294
305
|
try {
|
|
295
|
-
const { pane } = await client.readPane({ claude_instance_id, n_lines: 40, allow_pending: true })
|
|
306
|
+
const { pane } = await withOutageDetection(channelId, undefined, (client) => client.readPane({ claude_instance_id, n_lines: 40, allow_pending: true }))
|
|
296
307
|
misses = pane.includes(DEV_CHANNELS_DIALOG_NEEDLE) ? 0 : misses + 1
|
|
297
308
|
} catch {
|
|
298
309
|
/* tolerate transient readPane failure */
|
|
@@ -362,20 +373,23 @@ export async function approveTrustFolderDialog(
|
|
|
362
373
|
): Promise<void> {
|
|
363
374
|
void web
|
|
364
375
|
const claude_instance_id = instanceIdFor(channelId, normalizedName)
|
|
365
|
-
const client = getClient()
|
|
366
376
|
const deadline = Date.now() + _trustDialogPollTimeoutMs
|
|
367
377
|
|
|
368
378
|
let approved = false
|
|
369
379
|
while (Date.now() < deadline) {
|
|
370
380
|
try {
|
|
371
|
-
const { pane } = await client.readPane({ claude_instance_id, n_lines: 40, allow_pending: true })
|
|
381
|
+
const { pane } = await withOutageDetection(channelId, undefined, (client) => client.readPane({ claude_instance_id, n_lines: 40, allow_pending: true }))
|
|
372
382
|
if (pane.includes(TRUST_DIALOG_NEEDLE)) {
|
|
373
|
-
await client.sendKeys({ claude_instance_id, text: '', allow_pending: true })
|
|
383
|
+
await withOutageDetection(channelId, undefined, (client) => client.sendKeys({ claude_instance_id, text: '', allow_pending: true }))
|
|
374
384
|
approved = true
|
|
375
385
|
break
|
|
376
386
|
}
|
|
377
387
|
} catch (err) {
|
|
378
|
-
|
|
388
|
+
if (err instanceof ErrSystemInstallDisappeared || err instanceof ErrTmuxNotAvailable) {
|
|
389
|
+
// Outage flag raised; continue polling
|
|
390
|
+
} else {
|
|
391
|
+
console.error(`[slack] approveTrustFolderDialog: readPane error channel=${channelId}: ${String(err)}`)
|
|
392
|
+
}
|
|
379
393
|
}
|
|
380
394
|
await new Promise((r) => setTimeout(r, _trustDialogPollIntervalMs))
|
|
381
395
|
}
|
|
@@ -389,7 +403,7 @@ export async function approveTrustFolderDialog(
|
|
|
389
403
|
while (Date.now() < deadline && misses < DIALOG_GONE_CONFIRMS_REQUIRED) {
|
|
390
404
|
await new Promise((r) => setTimeout(r, _trustDialogPollIntervalMs))
|
|
391
405
|
try {
|
|
392
|
-
const { pane } = await client.readPane({ claude_instance_id, n_lines: 40, allow_pending: true })
|
|
406
|
+
const { pane } = await withOutageDetection(channelId, undefined, (client) => client.readPane({ claude_instance_id, n_lines: 40, allow_pending: true }))
|
|
393
407
|
misses = pane.includes(TRUST_DIALOG_NEEDLE) ? 0 : misses + 1
|
|
394
408
|
} catch {
|
|
395
409
|
/* tolerate transient readPane failure */
|
|
@@ -439,13 +453,16 @@ export async function waitForWaitingAndReconnect(
|
|
|
439
453
|
while (Date.now() < deadline) {
|
|
440
454
|
let state: string
|
|
441
455
|
try {
|
|
442
|
-
const r = await
|
|
456
|
+
const r = await withOutageDetection(channelId, undefined, (client) => client.status({ claude_instance_id }))
|
|
443
457
|
state = r.state
|
|
444
458
|
} catch (err) {
|
|
445
459
|
if (err instanceof ErrSpawnNotFound) {
|
|
446
460
|
console.error(`[slack] waitForWaitingAndReconnect: spawn not found for channel=${channelId} — aborting poll`)
|
|
447
461
|
return true
|
|
448
462
|
}
|
|
463
|
+
if (err instanceof ErrSystemInstallDisappeared || err instanceof ErrTmuxNotAvailable) {
|
|
464
|
+
return false
|
|
465
|
+
}
|
|
449
466
|
const e = err instanceof AgentDirectorError ? err : new AgentDirectorError('status', 'UnknownError', String(err))
|
|
450
467
|
console.error(`[slack] waitForWaitingAndReconnect: status error for channel=${channelId}: ${e.errName}`)
|
|
451
468
|
postSpawnFailureToChannel(channelId, e, web)
|
|
@@ -507,7 +524,7 @@ function buildSpawnParams(
|
|
|
507
524
|
/** Best-effort kill — never throws. */
|
|
508
525
|
async function tryKill(channelId: string, normalizedName: string | undefined): Promise<void> {
|
|
509
526
|
try {
|
|
510
|
-
await
|
|
527
|
+
await withOutageDetection(channelId, undefined, (client) => client.kill({ claude_instance_id: instanceIdFor(channelId, normalizedName) }))
|
|
511
528
|
} catch {
|
|
512
529
|
/* ignore */
|
|
513
530
|
}
|
|
@@ -521,9 +538,12 @@ async function tryDelete(
|
|
|
521
538
|
isStartup: boolean,
|
|
522
539
|
): Promise<boolean> {
|
|
523
540
|
try {
|
|
524
|
-
await
|
|
541
|
+
await withOutageDetection(channelId, undefined, (client) => client.delete({ claude_instance_id: [instanceIdFor(channelId, normalizedName)] }))
|
|
525
542
|
return true
|
|
526
543
|
} catch (err) {
|
|
544
|
+
if (err instanceof ErrSystemInstallDisappeared || err instanceof ErrTmuxNotAvailable) {
|
|
545
|
+
return false
|
|
546
|
+
}
|
|
527
547
|
const e = err instanceof AgentDirectorError ? err : new AgentDirectorError('delete', 'UnknownError', String(err))
|
|
528
548
|
console.error(`[slack] tryDelete: failed for channel=${channelId}: ${e.errName}`)
|
|
529
549
|
if (isStartup) recordStartupError('spawn-failed', `delete failed for channel=${channelId}: ${e.errName}`, e)
|
|
@@ -560,43 +580,58 @@ export async function spawnForRoute(
|
|
|
560
580
|
|
|
561
581
|
const params = buildSpawnParams(channelId, route, routingConfig)
|
|
562
582
|
const normalizedName = routingConfig.routes[channelId]?.normalizedName
|
|
563
|
-
const client = getClient()
|
|
564
583
|
|
|
565
584
|
// Attempt fresh spawn ---
|
|
566
585
|
try {
|
|
567
|
-
const r = await client.spawn(params)
|
|
586
|
+
const r = await withSpawnDetection(channelId, route.cwd, (client) => client.spawn(params))
|
|
568
587
|
console.error(`[slack] spawnForRoute: spawned channel=${channelId} instanceId=${r.claude_instance_id}`)
|
|
569
588
|
await approveTrustFolderDialog(channelId, web, isStartup, normalizedName)
|
|
570
589
|
await approveDevChannelsDialog(channelId, web, isStartup, normalizedName)
|
|
571
590
|
return { channelId, action: 'spawned' }
|
|
572
591
|
} catch (err) {
|
|
573
|
-
if (
|
|
592
|
+
if (err instanceof ErrInstanceIdCollision) {
|
|
593
|
+
// Collision → fall through to get-then-act
|
|
594
|
+
console.error(`[slack] spawnForRoute: ErrInstanceIdCollision for channel=${channelId} — fetching current state`)
|
|
595
|
+
} else if (
|
|
596
|
+
err instanceof ErrSystemInstallDisappeared ||
|
|
597
|
+
err instanceof ErrTmuxNotAvailable ||
|
|
598
|
+
err instanceof ErrCwdNotFound ||
|
|
599
|
+
err instanceof ErrCwdNotADirectory
|
|
600
|
+
) {
|
|
601
|
+
return { channelId, action: 'failed' }
|
|
602
|
+
} else {
|
|
574
603
|
const e = err instanceof AgentDirectorError ? err : new AgentDirectorError('spawn', 'UnknownError', String(err))
|
|
575
604
|
console.error(`[slack] spawnForRoute: spawn failed for channel=${channelId}: ${e.errName}`)
|
|
576
605
|
if (isStartup) recordStartupError('spawn-failed', `spawn failed for channel=${channelId}: ${e.errName}`, e)
|
|
577
606
|
postSpawnFailureToChannel(channelId, e, web, isStartup)
|
|
578
607
|
return { channelId, action: 'failed' }
|
|
579
608
|
}
|
|
580
|
-
// Collision → fall through to get-then-act
|
|
581
|
-
console.error(`[slack] spawnForRoute: ErrInstanceIdCollision for channel=${channelId} — fetching current state`)
|
|
582
609
|
}
|
|
583
610
|
|
|
584
611
|
// Collision-handling: get-then-act ---
|
|
585
612
|
let state: string
|
|
586
613
|
try {
|
|
587
|
-
const r = await client.get({ claude_instance_id: instanceIdFor(channelId, normalizedName) })
|
|
614
|
+
const r = await withOutageDetection(channelId, undefined, (client) => client.get({ claude_instance_id: instanceIdFor(channelId, normalizedName) }))
|
|
588
615
|
state = r.state
|
|
589
616
|
} catch (err) {
|
|
590
617
|
if (err instanceof ErrSpawnNotFound) {
|
|
591
618
|
// Race: row deleted between spawn-collision and get. Retry spawn once.
|
|
592
619
|
console.error(`[slack] spawnForRoute: ErrSpawnNotFound after collision for channel=${channelId} — retrying spawn (single retry)`)
|
|
593
620
|
try {
|
|
594
|
-
const r = await client.spawn(params)
|
|
621
|
+
const r = await withSpawnDetection(channelId, route.cwd, (client) => client.spawn(params))
|
|
595
622
|
console.error(`[slack] spawnForRoute: retry-spawn succeeded for channel=${channelId} instanceId=${r.claude_instance_id}`)
|
|
596
623
|
await approveTrustFolderDialog(channelId, web, isStartup, normalizedName)
|
|
597
624
|
await approveDevChannelsDialog(channelId, web, isStartup, normalizedName)
|
|
598
625
|
return { channelId, action: 'spawned' }
|
|
599
626
|
} catch (err2) {
|
|
627
|
+
if (
|
|
628
|
+
err2 instanceof ErrSystemInstallDisappeared ||
|
|
629
|
+
err2 instanceof ErrTmuxNotAvailable ||
|
|
630
|
+
err2 instanceof ErrCwdNotFound ||
|
|
631
|
+
err2 instanceof ErrCwdNotADirectory
|
|
632
|
+
) {
|
|
633
|
+
return { channelId, action: 'failed' }
|
|
634
|
+
}
|
|
600
635
|
const e = err2 instanceof AgentDirectorError ? err2 : new AgentDirectorError('spawn', 'UnknownError', String(err2))
|
|
601
636
|
console.error(`[slack] spawnForRoute: retry-spawn also failed for channel=${channelId}: ${e.errName}`)
|
|
602
637
|
if (isStartup) recordStartupError('spawn-failed', `retry-spawn failed for channel=${channelId}: ${e.errName}`, e)
|
|
@@ -604,6 +639,9 @@ export async function spawnForRoute(
|
|
|
604
639
|
return { channelId, action: 'failed' }
|
|
605
640
|
}
|
|
606
641
|
}
|
|
642
|
+
if (err instanceof ErrSystemInstallDisappeared || err instanceof ErrTmuxNotAvailable) {
|
|
643
|
+
return { channelId, action: 'failed' }
|
|
644
|
+
}
|
|
607
645
|
const e = err instanceof AgentDirectorError ? err : new AgentDirectorError('get', 'UnknownError', String(err))
|
|
608
646
|
console.error(`[slack] spawnForRoute: get failed for channel=${channelId}: ${e.errName}`)
|
|
609
647
|
postSpawnFailureToChannel(channelId, e, web, isStartup)
|
|
@@ -618,12 +656,20 @@ export async function spawnForRoute(
|
|
|
618
656
|
await tryKill(channelId, normalizedName)
|
|
619
657
|
if (!(await tryDelete(channelId, normalizedName, web, isStartup))) return { channelId, action: 'failed' }
|
|
620
658
|
try {
|
|
621
|
-
await client.spawn(params)
|
|
659
|
+
await withSpawnDetection(channelId, route.cwd, (client) => client.spawn(params))
|
|
622
660
|
console.error(`[slack] spawnForRoute: fresh-spawned (after kill+delete) for channel=${channelId}`)
|
|
623
661
|
await approveTrustFolderDialog(channelId, web, isStartup, normalizedName)
|
|
624
662
|
await approveDevChannelsDialog(channelId, web, isStartup, normalizedName)
|
|
625
663
|
return { channelId, action: 'spawned' }
|
|
626
664
|
} catch (err) {
|
|
665
|
+
if (
|
|
666
|
+
err instanceof ErrSystemInstallDisappeared ||
|
|
667
|
+
err instanceof ErrTmuxNotAvailable ||
|
|
668
|
+
err instanceof ErrCwdNotFound ||
|
|
669
|
+
err instanceof ErrCwdNotADirectory
|
|
670
|
+
) {
|
|
671
|
+
return { channelId, action: 'failed' }
|
|
672
|
+
}
|
|
627
673
|
const e = err instanceof AgentDirectorError ? err : new AgentDirectorError('spawn', 'UnknownError', String(err))
|
|
628
674
|
console.error(`[slack] spawnForRoute: fresh spawn after delete failed for channel=${channelId}: ${e.errName}`)
|
|
629
675
|
if (isStartup) recordStartupError('spawn-failed', `fresh spawn after delete failed for channel=${channelId}: ${e.errName}`, e)
|
|
@@ -635,7 +681,7 @@ export async function spawnForRoute(
|
|
|
635
681
|
// resume_enabled: attempt resume
|
|
636
682
|
console.error(`[slack] spawnForRoute: attempting resume for channel=${channelId}`)
|
|
637
683
|
try {
|
|
638
|
-
await client.resume({ claude_instance_id: instanceIdFor(channelId, normalizedName) })
|
|
684
|
+
await withSpawnDetection(channelId, route.cwd, (client) => client.resume({ claude_instance_id: instanceIdFor(channelId, normalizedName) }))
|
|
639
685
|
console.error(`[slack] spawnForRoute: resumed channel=${channelId}`)
|
|
640
686
|
return { channelId, action: 'resumed' }
|
|
641
687
|
} catch (err) {
|
|
@@ -643,12 +689,20 @@ export async function spawnForRoute(
|
|
|
643
689
|
console.error(`[slack] spawnForRoute: ${err.errName} on resume for channel=${channelId} — delete+fresh`)
|
|
644
690
|
if (!(await tryDelete(channelId, normalizedName, web, isStartup))) return { channelId, action: 'failed' }
|
|
645
691
|
try {
|
|
646
|
-
await client.spawn(params)
|
|
692
|
+
await withSpawnDetection(channelId, route.cwd, (client) => client.spawn(params))
|
|
647
693
|
console.error(`[slack] spawnForRoute: fresh-spawned (after delete) for channel=${channelId}`)
|
|
648
694
|
await approveTrustFolderDialog(channelId, web, isStartup, normalizedName)
|
|
649
695
|
await approveDevChannelsDialog(channelId, web, isStartup, normalizedName)
|
|
650
696
|
return { channelId, action: 'spawned' }
|
|
651
697
|
} catch (err2) {
|
|
698
|
+
if (
|
|
699
|
+
err2 instanceof ErrSystemInstallDisappeared ||
|
|
700
|
+
err2 instanceof ErrTmuxNotAvailable ||
|
|
701
|
+
err2 instanceof ErrCwdNotFound ||
|
|
702
|
+
err2 instanceof ErrCwdNotADirectory
|
|
703
|
+
) {
|
|
704
|
+
return { channelId, action: 'failed' }
|
|
705
|
+
}
|
|
652
706
|
const e = err2 instanceof AgentDirectorError ? err2 : new AgentDirectorError('spawn', 'UnknownError', String(err2))
|
|
653
707
|
console.error(`[slack] spawnForRoute: fresh spawn after delete failed for channel=${channelId}: ${e.errName}`)
|
|
654
708
|
if (isStartup) recordStartupError('spawn-failed', `fresh spawn after delete failed for channel=${channelId}: ${e.errName}`, e)
|
|
@@ -662,17 +716,33 @@ export async function spawnForRoute(
|
|
|
662
716
|
await tryKill(channelId, normalizedName)
|
|
663
717
|
if (!(await tryDelete(channelId, normalizedName, web, isStartup))) return { channelId, action: 'failed' }
|
|
664
718
|
try {
|
|
665
|
-
await client.spawn(params)
|
|
719
|
+
await withSpawnDetection(channelId, route.cwd, (client) => client.spawn(params))
|
|
666
720
|
await approveTrustFolderDialog(channelId, web, isStartup, normalizedName)
|
|
667
721
|
await approveDevChannelsDialog(channelId, web, isStartup, normalizedName)
|
|
668
722
|
return { channelId, action: 'spawned' }
|
|
669
723
|
} catch (err2) {
|
|
724
|
+
if (
|
|
725
|
+
err2 instanceof ErrSystemInstallDisappeared ||
|
|
726
|
+
err2 instanceof ErrTmuxNotAvailable ||
|
|
727
|
+
err2 instanceof ErrCwdNotFound ||
|
|
728
|
+
err2 instanceof ErrCwdNotADirectory
|
|
729
|
+
) {
|
|
730
|
+
return { channelId, action: 'failed' }
|
|
731
|
+
}
|
|
670
732
|
const e = err2 instanceof AgentDirectorError ? err2 : new AgentDirectorError('spawn', 'UnknownError', String(err2))
|
|
671
733
|
if (isStartup) recordStartupError('spawn-failed', `fresh spawn failed for channel=${channelId}: ${e.errName}`, e)
|
|
672
734
|
postSpawnFailureToChannel(channelId, e, web, isStartup)
|
|
673
735
|
return { channelId, action: 'failed' }
|
|
674
736
|
}
|
|
675
737
|
}
|
|
738
|
+
if (
|
|
739
|
+
err instanceof ErrSystemInstallDisappeared ||
|
|
740
|
+
err instanceof ErrTmuxNotAvailable ||
|
|
741
|
+
err instanceof ErrCwdNotFound ||
|
|
742
|
+
err instanceof ErrCwdNotADirectory
|
|
743
|
+
) {
|
|
744
|
+
return { channelId, action: 'failed' }
|
|
745
|
+
}
|
|
676
746
|
const e = err instanceof AgentDirectorError ? err : new AgentDirectorError('resume', 'UnknownError', String(err))
|
|
677
747
|
console.error(`[slack] spawnForRoute: resume failed for channel=${channelId}: ${e.errName}`)
|
|
678
748
|
postSpawnFailureToChannel(channelId, e, web, isStartup)
|
|
@@ -985,9 +1055,13 @@ export async function reconcileInstanceIds(
|
|
|
985
1055
|
`[slack] reconcileInstanceIds: deleting stale row channel=${o.channelId} instanceId=${o.oldInstanceId}`,
|
|
986
1056
|
)
|
|
987
1057
|
try {
|
|
988
|
-
await client.delete({ claude_instance_id: [o.oldInstanceId] })
|
|
1058
|
+
await withOutageDetection(o.channelId, undefined, (client) => client.delete({ claude_instance_id: [o.oldInstanceId] }))
|
|
989
1059
|
deleted++
|
|
990
1060
|
} catch (err) {
|
|
1061
|
+
if (err instanceof ErrSystemInstallDisappeared || err instanceof ErrTmuxNotAvailable) {
|
|
1062
|
+
failed++
|
|
1063
|
+
continue
|
|
1064
|
+
}
|
|
991
1065
|
const e = err instanceof AgentDirectorError ? err : new AgentDirectorError('delete', 'UnknownError', String(err))
|
|
992
1066
|
console.error(
|
|
993
1067
|
`[slack] reconcileInstanceIds: delete failed for channel=${o.channelId} instanceId=${o.oldInstanceId}: ${e.errName}`,
|