claude-slack-channel-bots 0.7.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "claude-slack-channel-bots",
3
- "version": "0.7.1",
3
+ "version": "0.8.0",
4
4
  "description": "Multi-session Slack-to-Claude bridge — run multiple Claude Code bots across Slack channels via Socket Mode",
5
5
  "type": "module",
6
6
  "bin": {
@@ -31,7 +31,7 @@
31
31
  "@modelcontextprotocol/sdk": "^1.0.0",
32
32
  "@slack/socket-mode": "^2.0.0",
33
33
  "@slack/web-api": "^7.0.0",
34
- "agent-director": "^0.7.0",
34
+ "agent-director": "^0.7.8",
35
35
  "semver": "^7.6.0"
36
36
  },
37
37
  "devDependencies": {
@@ -11,6 +11,10 @@
11
11
  * - ErrSystemInstallNotFound (Client.create / resolveSystemBinary — no `agent-director` on PATH or at ~/.agent-director)
12
12
  * - ErrSystemInstallTooOld (Client.create / resolveSystemBinary — system binary older than required minimum)
13
13
  * - ErrSystemInstallUnreachable (Client.create / resolveSystemBinary — system binary present but not executable or fails --version)
14
+ * - ErrSystemInstallDisappeared (any verb / binary gone after valid construction — b.xht)
15
+ * - ErrTmuxNotAvailable (spawn / tmux binary not found or not executable)
16
+ * - ErrCwdNotFound (spawn / route cwd does not exist on disk)
17
+ * - ErrCwdNotADirectory (spawn / route cwd path exists but is not a directory)
14
18
  * - ErrInstanceIdCollision (spawn / SR-1.4 idempotency)
15
19
  * - ErrSpawnNotFound (get / status / decide on missing row)
16
20
  * - ErrNoSessionId (resume / SR-1.3 fallthrough)
@@ -41,6 +45,10 @@ export {
41
45
  ErrSystemInstallNotFound,
42
46
  ErrSystemInstallTooOld,
43
47
  ErrSystemInstallUnreachable,
48
+ ErrSystemInstallDisappeared,
49
+ ErrTmuxNotAvailable,
50
+ ErrCwdNotFound,
51
+ ErrCwdNotADirectory,
44
52
  ErrCallTimeout,
45
53
  ErrInstanceIdCollision,
46
54
  ErrSpawnNotFound,
@@ -8,6 +8,8 @@
8
8
  * SPDX-License-Identifier: MIT
9
9
  */
10
10
 
11
+ import { setOutageFlag, clearOutageFlag } from './outage-state.ts'
12
+
11
13
  // ---------------------------------------------------------------------------
12
14
  // Types
13
15
  // ---------------------------------------------------------------------------
@@ -15,7 +17,7 @@
15
17
  export interface HealthCheckDeps {
16
18
  isSessionAlive(channelId: string): Promise<boolean>
17
19
  isRestartPendingOrActive(channelId: string): boolean
18
- hasReachedMaxFailures(channelId: string): boolean
20
+ statRoute(cwd: string): Promise<boolean>
19
21
  scheduleRestart(channelId: string, cwd: string): void
20
22
  isShuttingDown(): boolean
21
23
  getRoutes(): Record<string, string>
@@ -27,6 +29,8 @@ export interface HealthCheckDeps {
27
29
 
28
30
  let deps: HealthCheckDeps | null = null
29
31
  let intervalId: ReturnType<typeof setInterval> | null = null
32
+ let tickInFlight = false
33
+ let skippedTicks = 0
30
34
 
31
35
  // ---------------------------------------------------------------------------
32
36
  // initHealthCheck
@@ -46,21 +50,45 @@ export function startHealthCheck(intervalSeconds: number): void {
46
50
  intervalId = setInterval(async () => {
47
51
  if (!deps) return
48
52
  if (deps.isShuttingDown()) return
49
-
50
- const routes = deps.getRoutes()
51
-
52
- for (const [channelId, cwd] of Object.entries(routes)) {
53
- try {
54
- if (deps.isRestartPendingOrActive(channelId)) continue
55
- if (deps.hasReachedMaxFailures(channelId)) continue
56
-
57
- const alive = await deps.isSessionAlive(channelId)
58
- if (!alive) {
59
- deps.scheduleRestart(channelId, cwd)
53
+ if (tickInFlight) {
54
+ skippedTicks++
55
+ // Fire exactly once when the streak crosses 5 (4 → 5 transition). At the
56
+ // 120 s interval, five consecutive skips represent ~10 minutes of
57
+ // tick-budget exhaustion — long enough to indicate genuine health-check
58
+ // wedging (e.g., a persistently hung statRoute or isSessionAliveAdapter)
59
+ // rather than transient slowness. Further skips within the same streak
60
+ // are silent; the next successfully-started tick body resets the counter
61
+ // and re-arms the warning for a future streak.
62
+ if (skippedTicks === 5) {
63
+ console.error('[slack] health-check: tick body in flight; skipped 5 consecutive ticks — investigate budget exhaustion')
64
+ }
65
+ return
66
+ }
67
+ tickInFlight = true
68
+ skippedTicks = 0
69
+ try {
70
+ const routes = deps.getRoutes()
71
+
72
+ for (const [channelId, cwd] of Object.entries(routes)) {
73
+ try {
74
+ if (deps.isRestartPendingOrActive(channelId)) continue
75
+
76
+ if (await deps.statRoute(cwd)) {
77
+ clearOutageFlag(channelId, 'cwd-unreachable')
78
+ } else {
79
+ setOutageFlag(channelId, 'cwd-unreachable', cwd)
80
+ }
81
+
82
+ const alive = await deps.isSessionAlive(channelId)
83
+ if (!alive) {
84
+ deps.scheduleRestart(channelId, cwd)
85
+ }
86
+ } catch (err) {
87
+ console.error(`[slack] health-check: error checking channel=${channelId}:`, err)
60
88
  }
61
- } catch (err) {
62
- console.error(`[slack] health-check: error checking channel=${channelId}:`, err)
63
89
  }
90
+ } finally {
91
+ tickInFlight = false
64
92
  }
65
93
  }, intervalSeconds * 1000)
66
94
  }
@@ -86,4 +114,6 @@ export function _resetHealthCheckState(): void {
86
114
  intervalId = null
87
115
  }
88
116
  deps = null
117
+ tickInFlight = false
118
+ skippedTicks = 0
89
119
  }
@@ -0,0 +1,287 @@
1
+ /**
2
+ * outage-state.ts — Channel-scoped outage-flag state machine + Slack emit surface.
3
+ *
4
+ * Tracks three orthogonal outage classes per routed Slack channel and emits
5
+ * onset / all-clear messages via the injected `postToChannel` hook. The module
6
+ * is intentionally free of Date / timestamp logic — operators scroll back to
7
+ * the onset message for timing context.
8
+ *
9
+ * Public API surface (all exported):
10
+ * - initOutageState(deps) — install production dependencies
11
+ * - getOutageFlags(channelId) — read live flag set
12
+ * - setOutageFlag(channelId, cls, detail?) — raise flag + emit onset message
13
+ * - clearOutageFlag(channelId, cls) — lower flag; emits all-clear when set empties
14
+ * - resetAllToHealthy(channelIds) — silent bulk wipe (boot-time reset)
15
+ * - withOutageDetection(ch, cwd, fn) — AD verb wrapper; raises/clears flags on error/success
16
+ * - withSpawnDetection(ch, cwd, fn) — like withOutageDetection + clears cwd-unreachable on success
17
+ * - _resetOutageState() — test-only state reset
18
+ *
19
+ * Template exports (used by tests):
20
+ * - ONSET_TEMPLATES
21
+ * - ALL_CLEAR_TEMPLATE
22
+ *
23
+ * SPDX-License-Identifier: MIT
24
+ */
25
+
26
+ import type { Client } from 'agent-director'
27
+ import {
28
+ ErrSystemInstallDisappeared,
29
+ ErrTmuxNotAvailable,
30
+ ErrCwdNotFound,
31
+ ErrCwdNotADirectory,
32
+ } from './agent-director-errors.ts'
33
+
34
+ // ---------------------------------------------------------------------------
35
+ // Data types
36
+ // ---------------------------------------------------------------------------
37
+
38
+ /** Union of all outage classifications. No 'healthy' member — absence == healthy. */
39
+ export type OutageClass = 'ad-unreachable' | 'cwd-unreachable' | 'tmux-unavailable'
40
+
41
+ /**
42
+ * Detail record for a single outage class in a bad stretch.
43
+ * No `enteredAtIso` — timestamps are intentionally absent from the all-clear template.
44
+ */
45
+ export interface ClassRecord {
46
+ detail?: string
47
+ }
48
+
49
+ /** Per-channel state entry. */
50
+ interface ChannelEntry {
51
+ /** Currently active outage flags. */
52
+ flags: Set<OutageClass>
53
+ /** Class → detail record for the current bad stretch. Reset to empty Map on all-clear. */
54
+ badStretchClasses: Map<OutageClass, ClassRecord>
55
+ }
56
+
57
+ /** Dependencies injected via `initOutageState` — wires the module to production Slack + AD. */
58
+ export interface OutageStateDeps {
59
+ /** Fire-and-forget Slack post; errors MUST be handled internally by the caller. */
60
+ postToChannel(channelId: string, text: string): void
61
+ /** Return the singleton AD Client. Same semantics as getClient() in agent-director-client.ts. */
62
+ getClient(): Client
63
+ }
64
+
65
+ // ---------------------------------------------------------------------------
66
+ // Module-scoped state
67
+ // ---------------------------------------------------------------------------
68
+
69
+ let deps: OutageStateDeps | undefined
70
+ const entries = new Map<string, ChannelEntry>()
71
+
72
+ // ---------------------------------------------------------------------------
73
+ // Helpers
74
+ // ---------------------------------------------------------------------------
75
+
76
+ /** Lazy-create a ChannelEntry for `channelId` on first access. */
77
+ function entryFor(channelId: string): ChannelEntry {
78
+ let entry = entries.get(channelId)
79
+ if (!entry) {
80
+ entry = { flags: new Set(), badStretchClasses: new Map() }
81
+ entries.set(channelId, entry)
82
+ }
83
+ return entry
84
+ }
85
+
86
+ // ---------------------------------------------------------------------------
87
+ // Slack message templates
88
+ // ---------------------------------------------------------------------------
89
+
90
+ /** Stable iteration order for the all-clear template. */
91
+ const STABLE_CLASS_ORDER: OutageClass[] = [
92
+ 'ad-unreachable',
93
+ 'cwd-unreachable',
94
+ 'tmux-unavailable',
95
+ ]
96
+
97
+ /**
98
+ * ONSET_TEMPLATES — one template function per outage class.
99
+ * The optional `detail` parameter carries class-specific context
100
+ * (binary path for ad-unreachable; cwd path for cwd-unreachable).
101
+ */
102
+ export const ONSET_TEMPLATES: Record<OutageClass, (detail?: string) => string> = {
103
+ 'ad-unreachable': (binaryPath?: string) =>
104
+ `:rotating_light: *agent-director unreachable* — affects every routed channel.\nBinary: \`${binaryPath ?? '<unknown>'}\`\nRemediation: reinstall agent-director.`,
105
+
106
+ 'tmux-unavailable': (_detail?: string) =>
107
+ `:rotating_light: *tmux unavailable* — affects every routed channel.\nRemediation: install or repair tmux.`,
108
+
109
+ 'cwd-unreachable': (cwd?: string) =>
110
+ `:rotating_light: *Route cwd unreachable* — \`${cwd ?? '<unknown>'}\`\nRemediation: restore the directory or remove this route from \`config.json\`.`,
111
+ }
112
+
113
+ /**
114
+ * ALL_CLEAR_TEMPLATE — renders a Slack all-clear message from the bad-stretch
115
+ * history snapshot. Entries are emitted in the stable class order regardless
116
+ * of the order flags were raised. No timestamps.
117
+ */
118
+ export function ALL_CLEAR_TEMPLATE(resolved: Map<OutageClass, ClassRecord>): string {
119
+ const parts: string[] = []
120
+ for (const cls of STABLE_CLASS_ORDER) {
121
+ const rec = resolved.get(cls)
122
+ if (rec === undefined) continue
123
+ const detailSuffix = rec.detail !== undefined ? ` (\`${rec.detail}\`)` : ''
124
+ parts.push(`\`${cls}\`${detailSuffix}`)
125
+ }
126
+ return `:white_check_mark: *All clear.* Resolved: ${parts.join(', ')}.`
127
+ }
128
+
129
+ // ---------------------------------------------------------------------------
130
+ // Public API — init + accessors
131
+ // ---------------------------------------------------------------------------
132
+
133
+ /**
134
+ * initOutageState — installs production dependencies. Called once from
135
+ * `src/server.ts:main()` after `routingConfig` is loaded, before the
136
+ * Socket Mode connect block.
137
+ */
138
+ export function initOutageState(d: OutageStateDeps): void {
139
+ deps = d
140
+ }
141
+
142
+ /**
143
+ * getOutageFlags — returns the live read-only flag set for `channelId`.
144
+ * Returns an empty `ReadonlySet` sentinel when no entry exists yet.
145
+ */
146
+ export function getOutageFlags(channelId: string): ReadonlySet<OutageClass> {
147
+ return entries.get(channelId)?.flags ?? (new Set<OutageClass>() as ReadonlySet<OutageClass>)
148
+ }
149
+
150
+ // ---------------------------------------------------------------------------
151
+ // Public API — mutators
152
+ // ---------------------------------------------------------------------------
153
+
154
+ /**
155
+ * setOutageFlag — raises `cls` for `channelId` and emits an onset Slack
156
+ * message. Same-flag re-raise is a silent no-op (dedupe).
157
+ *
158
+ * State mutates BEFORE the emit so a synchronous throw in `postToChannel`
159
+ * cannot cause double-emission on the next observation.
160
+ */
161
+ export function setOutageFlag(channelId: string, cls: OutageClass, detail?: string): void {
162
+ if (!deps) return
163
+ const entry = entryFor(channelId)
164
+ if (entry.flags.has(cls)) return // same-flag dedupe
165
+ // Mutate state BEFORE emit (SR-V-2.x state-before-emit contract).
166
+ entry.flags.add(cls)
167
+ entry.badStretchClasses.set(cls, { detail })
168
+ deps.postToChannel(channelId, ONSET_TEMPLATES[cls](detail))
169
+ }
170
+
171
+ /**
172
+ * clearOutageFlag — lowers `cls` for `channelId`. Emits the all-clear Slack
173
+ * message ONLY when the clear leaves the flag set empty and there is a
174
+ * non-empty bad-stretch history (i.e., at least one onset was recorded).
175
+ * Intermediate clears (flag set still non-empty after removal) are silent.
176
+ *
177
+ * State mutates BEFORE the emit (same contract as setOutageFlag).
178
+ */
179
+ export function clearOutageFlag(channelId: string, cls: OutageClass): void {
180
+ if (!deps) return
181
+ const entry = entries.get(channelId)
182
+ if (!entry) return
183
+ if (!entry.flags.has(cls)) return // same-state dedupe
184
+ // Mutate state BEFORE emit.
185
+ entry.flags.delete(cls)
186
+ if (entry.flags.size === 0 && entry.badStretchClasses.size > 0) {
187
+ // Snapshot history and reset BEFORE the postToChannel call.
188
+ const snapshot = new Map(entry.badStretchClasses)
189
+ entry.badStretchClasses = new Map()
190
+ deps.postToChannel(channelId, ALL_CLEAR_TEMPLATE(snapshot))
191
+ }
192
+ }
193
+
194
+ /**
195
+ * resetAllToHealthy — silently wipes each listed channel's flag set and bad-stretch
196
+ * history to a clean slate. No `postToChannel` calls. Used at boot time
197
+ * after `socket.start()` as a defensive boundary for pre-auth observations
198
+ * (boot-time call is added by Epic 2).
199
+ */
200
+ export function resetAllToHealthy(channelIds: string[]): void {
201
+ for (const channelId of channelIds) {
202
+ entries.set(channelId, { flags: new Set(), badStretchClasses: new Map() })
203
+ }
204
+ }
205
+
206
+ // ---------------------------------------------------------------------------
207
+ // Public API — AD verb wrappers
208
+ // ---------------------------------------------------------------------------
209
+
210
+ /**
211
+ * withOutageDetection — centralized wrapper for AD verb calls that should
212
+ * participate in outage detection.
213
+ *
214
+ * On error:
215
+ * - ErrSystemInstallDisappeared → raises 'ad-unreachable' (detail = binaryPath)
216
+ * - ErrTmuxNotAvailable → raises 'tmux-unavailable'
217
+ * - ErrCwdNotFound / ErrCwdNotADirectory → raises 'cwd-unreachable' (detail = routeCwd)
218
+ * UNLESS routeCwd is undefined, in which case logs loudly and rethrows
219
+ * WITHOUT raising the flag (defensive carve-out for verb-class drift).
220
+ * - Other errors → no flag change; rethrow unchanged.
221
+ *
222
+ * On success: clears 'ad-unreachable' and 'tmux-unavailable', returns result.
223
+ *
224
+ * The original error is always rethrown so callers can handle it normally.
225
+ */
226
+ export async function withOutageDetection<T>(
227
+ channelId: string,
228
+ routeCwd: string | undefined,
229
+ fn: (client: Client) => Promise<T>,
230
+ ): Promise<T> {
231
+ if (!deps) {
232
+ throw new Error(
233
+ 'outage-state: withOutageDetection called before initOutageState — caller-site bug',
234
+ )
235
+ }
236
+ try {
237
+ const result = await fn(deps.getClient())
238
+ clearOutageFlag(channelId, 'ad-unreachable')
239
+ clearOutageFlag(channelId, 'tmux-unavailable')
240
+ return result
241
+ } catch (err) {
242
+ if (err instanceof ErrSystemInstallDisappeared) {
243
+ setOutageFlag(channelId, 'ad-unreachable', err.binaryPath)
244
+ } else if (err instanceof ErrTmuxNotAvailable) {
245
+ setOutageFlag(channelId, 'tmux-unavailable')
246
+ } else if (err instanceof ErrCwdNotFound || err instanceof ErrCwdNotADirectory) {
247
+ if (routeCwd !== undefined) {
248
+ setOutageFlag(channelId, 'cwd-unreachable', routeCwd)
249
+ } else {
250
+ console.error(
251
+ `[slack] outage-state: withOutageDetection: cwd error on channel=${channelId} but routeCwd is undefined — verb-class drift; rethrowing without raising flag`,
252
+ err,
253
+ )
254
+ }
255
+ }
256
+ throw err
257
+ }
258
+ }
259
+
260
+ /**
261
+ * withSpawnDetection — like `withOutageDetection` but also clears
262
+ * 'cwd-unreachable' on success. Spawn and resume verbs are the only calls
263
+ * that actually exercise the route's cwd, so cwd health is only confirmed
264
+ * by a successful spawn/resume.
265
+ */
266
+ export async function withSpawnDetection<T>(
267
+ channelId: string,
268
+ routeCwd: string | undefined,
269
+ fn: (client: Client) => Promise<T>,
270
+ ): Promise<T> {
271
+ const result = await withOutageDetection(channelId, routeCwd, fn)
272
+ clearOutageFlag(channelId, 'cwd-unreachable')
273
+ return result
274
+ }
275
+
276
+ // ---------------------------------------------------------------------------
277
+ // Test-only
278
+ // ---------------------------------------------------------------------------
279
+
280
+ /**
281
+ * _resetOutageState — clears all module-scoped state. For tests only.
282
+ * Production code must not call this.
283
+ */
284
+ export function _resetOutageState(): void {
285
+ deps = undefined
286
+ entries.clear()
287
+ }