@swarmclawai/swarmclaw 1.5.64 → 1.5.66

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -399,6 +399,25 @@ Operational docs: https://swarmclaw.ai/docs/observability
399
399
 
400
400
  ## Releases
401
401
 
402
+ ### v1.5.66 Highlights
403
+
404
+ Fixes a runaway-token-burn bug in the orchestrator-wake and heartbeat loops. The root cause was hidden in the success/failure classification: a session run can resolve its promise successfully while still carrying an `error` on the result (e.g. a provider 429 swallowed into persisted output), and the wake trackers only incremented their failure counters on a rejected promise. So the backoff never engaged, the auto-disable-after-N-failures gate never tripped, and the wake kept firing at its configured interval indefinitely — with every firing spending tokens on a full prompt against a provider that was already cooling down.
405
+
406
+ - **`classifyWakeOutcome` (`src/lib/server/runtime/heartbeat-service.ts`)** — new pure helper, extracted for unit testing, that maps a resolved run result into `null` (success) or a short failure reason. A run counts as a failure when `result.error` is a non-empty string, *or* when `result.text` is empty/whitespace-only. Both the orchestrator-wake and heartbeat outcome handlers now feed through this helper, so silent-failure runs tick the failure counter and the exponential backoff (10s → 5min) kicks in normally.
407
+ - **Auto-disable gate now trips for provider 429 / silent-wake loops.** The existing `MAX_CONSECUTIVE_FAILURES = 10` threshold was already in place but unreachable for the most common failure mode (429 errors that still persisted a run). After the fix, ten consecutive dud wakes auto-disable the orchestrator/heartbeat for that agent/session and post an explicit notification instead of grinding indefinitely.
408
+ - **Regression coverage.** `heartbeat-service.test.ts` now has 5 targeted cases on `classifyWakeOutcome` — the 429 regression, empty-output detection, non-string error fields, whitespace-only errors, and the happy path. `test:runtime` now runs 104 cases.
409
+
410
+ ### v1.5.65 Highlights
411
+
412
+ Follow-up hardening on the v1.5.64 work after live-testing the chat-header flows, the MCP connection pool, and the MCP Registry browser. Six concrete bugs fixed in the clear/undo, MCP pool eviction, and registry-browser code paths.
413
+
414
+ - **`clearChatMessages` now resets `opencodeWebSessionId` too.** The snapshot/undo pair already captured and restored it, but `clear` itself left the stale identifier in place — so a fresh opencode-web turn would resume the conversation the user intended to drop. Paired with a matching default in `storage-normalization.ts` so older session records load with `opencodeWebSessionId: null` instead of `undefined`. Regression covered by `clear-route.test.ts`.
415
+ - **Undo toast no longer writes to the wrong chat.** If the user navigated away after clicking Clear, clicking Undo in the toast would inject restored messages into whatever chat was currently open. `chat-area.tsx` now gates the `setMessages` calls on `selectActiveSessionId === targetSessionId`; same guard added to the compact-complete path.
416
+ - **Background MCP status probes no longer evict the connection pool.** Visiting `/mcp-servers` auto-called `POST /api/mcp-servers/:id/test` for every server, which force-disconnected pooled clients that running agents were using mid-turn. Eviction is now gated behind `?reset=1`, which only the explicit **Re-test** button sends. Regression added to `src/app/api/mcp-servers/route.test.ts`.
417
+ - **SwarmDock MCP Registry browser actually works now.** The upstream `swarmdock-api.onrender.com` endpoint emits no CORS headers, so the in-browser `RegistryBrowser` component always failed with `Failed to fetch`. Added `GET /api/mcp-registry` and `GET /api/mcp-registry/:slug` as server-side proxies and rewired the component to call them. Verified in Chrome: 20 servers load, selecting one prefills the New MCP Server sheet with its recommended install command.
418
+ - **`mcp-registry` CLI group.** New commands `swarmclaw mcp-registry search` and `swarmclaw mcp-registry get <slug>` so CLI workflows can pull from the same proxy.
419
+ - **Prior release's MCP tool-evict-on-transport-failure fix** (cherry-picked from a user's local branch): connection-class errors from downstream MCP tools now evict the pool entry for the originating server, so the next turn reconnects fresh instead of retrying through a half-broken transport.
420
+
402
421
  ### v1.5.64 Highlights
403
422
 
404
423
  Two themes this release. First, **context-window management reaches the chat UI**: a live token-usage meter in every chat header, a one-click LLM-backed compaction that keeps the session alive without nuking history, and a redesigned clear flow with a 30-second undo that restores both transcripts and CLI resume IDs. Second, **MCP token spend is now controllable**: per-server `alwaysExpose` policy, per-agent eager-tool overrides, an in-session `mcp_tool_search` promoter, a long-lived connection pool, a token-cost endpoint per server, and a built-in browser for the public SwarmDock MCP registry.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@swarmclawai/swarmclaw",
3
- "version": "1.5.64",
3
+ "version": "1.5.66",
4
4
  "description": "Build and run autonomous AI agents with OpenClaw, Hermes, multiple model providers, orchestration, delegation, memory, skills, schedules, and chat connectors.",
5
5
  "main": "electron-dist/main.js",
6
6
  "license": "MIT",
@@ -0,0 +1,31 @@
1
+ import { NextResponse } from 'next/server'
2
+
3
+ const REGISTRY_API = 'https://swarmdock-api.onrender.com/api/v1/mcp/servers'
4
+
5
+ export async function GET(_req: Request, { params }: { params: Promise<{ slug: string }> }) {
6
+ const { slug } = await params
7
+ if (!slug.trim()) {
8
+ return NextResponse.json({ error: 'slug is required' }, { status: 400 })
9
+ }
10
+ try {
11
+ const upstream = await fetch(`${REGISTRY_API}/${encodeURIComponent(slug)}`, {
12
+ headers: { accept: 'application/json' },
13
+ })
14
+ if (upstream.status === 404) {
15
+ return NextResponse.json({ error: 'Registry server not found' }, { status: 404 })
16
+ }
17
+ if (!upstream.ok) {
18
+ return NextResponse.json(
19
+ { error: `Server detail returned ${upstream.status}` },
20
+ { status: 502 },
21
+ )
22
+ }
23
+ const data = await upstream.json()
24
+ return NextResponse.json(data)
25
+ } catch (err: unknown) {
26
+ return NextResponse.json(
27
+ { error: err instanceof Error ? err.message : 'Registry unreachable' },
28
+ { status: 502 },
29
+ )
30
+ }
31
+ }
@@ -0,0 +1,36 @@
1
+ import { NextResponse } from 'next/server'
2
+
3
+ // Server-side proxy for the public SwarmDock MCP Registry. The upstream API
4
+ // does not emit CORS headers, so the RegistryBrowser component in the browser
5
+ // cannot fetch it directly. This route forwards the search request and its
6
+ // JSON response untouched.
7
+
8
+ const REGISTRY_API = 'https://swarmdock-api.onrender.com/api/v1/mcp/servers'
9
+
10
+ export async function GET(req: Request) {
11
+ const url = new URL(req.url)
12
+ const q = url.searchParams.get('q') ?? ''
13
+ const limitRaw = url.searchParams.get('limit') ?? '20'
14
+ const limit = Math.max(1, Math.min(Number.parseInt(limitRaw, 10) || 20, 50))
15
+ const qs = new URLSearchParams({ limit: String(limit) })
16
+ if (q.trim()) qs.set('q', q.trim())
17
+
18
+ try {
19
+ const upstream = await fetch(`${REGISTRY_API}?${qs.toString()}`, {
20
+ headers: { accept: 'application/json' },
21
+ })
22
+ if (!upstream.ok) {
23
+ return NextResponse.json(
24
+ { error: `Registry returned ${upstream.status}` },
25
+ { status: 502 },
26
+ )
27
+ }
28
+ const data = await upstream.json()
29
+ return NextResponse.json(data)
30
+ } catch (err: unknown) {
31
+ return NextResponse.json(
32
+ { error: err instanceof Error ? err.message : 'Registry unreachable' },
33
+ { status: 502 },
34
+ )
35
+ }
36
+ }
@@ -5,15 +5,21 @@ import { connectMcpServer, mcpToolsToLangChain, disconnectMcpServer } from '@/li
5
5
  import { evictMcpClient } from '@/lib/server/mcp-connection-pool'
6
6
  import { errorMessage } from '@/lib/shared-utils'
7
7
 
8
- export async function POST(_req: Request, { params }: { params: Promise<{ id: string }> }) {
8
+ export async function POST(req: Request, { params }: { params: Promise<{ id: string }> }) {
9
9
  const { id } = await params
10
10
  const servers = loadMcpServers()
11
11
  const server = servers[id]
12
12
  if (!server) return notFound()
13
13
 
14
- // Force a fresh connection for the test if a pooled client is in a weird
15
- // state, the test button is the user's signal to rebuild it.
16
- await evictMcpClient(id)
14
+ // Only evict the pool when the caller explicitly asks for a reset (e.g. the
15
+ // "Re-test" button). Background probes from the server list view skip this
16
+ // so they don't disconnect pooled clients that running agents are using
17
+ // mid-turn. Pool eviction on config change is handled by the PUT route.
18
+ const url = new URL(req.url)
19
+ const reset = url.searchParams.get('reset') === '1' || url.searchParams.get('reset') === 'true'
20
+ if (reset) {
21
+ await evictMcpClient(id)
22
+ }
17
23
 
18
24
  try {
19
25
  const { client, transport } = await connectMcpServer(server)
@@ -61,6 +61,16 @@ test('MCP server routes exercise a live stdio server end to end', async () => {
61
61
  assert.equal(health.ok, true)
62
62
  assert.deepEqual(health.tools, ['mcp_smoke_ping', 'mcp_smoke_echo', 'mcp_smoke_cwd_check'])
63
63
 
64
+ // `reset=1` still works and succeeds — used by the explicit "Re-test" button
65
+ // to force pool eviction. Default (no query) path skips eviction so
66
+ // auto-probes don't disrupt in-flight agent MCP calls.
67
+ const resetHealthResponse = await testMcpServer(new Request(`http://local/api/mcp-servers/${serverId}/test?reset=1`, {
68
+ method: 'POST',
69
+ }), routeParams(serverId))
70
+ assert.equal(resetHealthResponse.status, 200)
71
+ const resetHealth = await resetHealthResponse.json() as Record<string, unknown>
72
+ assert.equal(resetHealth.ok, true)
73
+
64
74
  const toolsResponse = await listMcpTools(new Request(`http://local/api/mcp-servers/${serverId}/tools`), routeParams(serverId))
65
75
  assert.equal(toolsResponse.status, 200)
66
76
  const tools = await toolsResponse.json() as Array<Record<string, unknown>>
package/src/cli/index.js CHANGED
@@ -372,6 +372,14 @@ const COMMAND_GROUPS = [
372
372
  cmd('invoke', 'POST', '/mcp-servers/:id/invoke', 'Invoke an MCP tool on a server', { expectsJsonBody: true }),
373
373
  ],
374
374
  },
375
+ {
376
+ name: 'mcp-registry',
377
+ description: 'Browse the public SwarmDock MCP Registry',
378
+ commands: [
379
+ cmd('search', 'GET', '/mcp-registry', 'Search registry servers (supports --query q=postgres,limit=20)'),
380
+ cmd('get', 'GET', '/mcp-registry/:slug', 'Get registry server detail by slug'),
381
+ ],
382
+ },
375
383
  {
376
384
  name: 'memories',
377
385
  description: 'Alias of memory command group',
@@ -188,7 +188,7 @@ export function McpServerList({ inSidebar }: { inSidebar?: boolean }) {
188
188
  e.stopPropagation()
189
189
  setStatuses((prev) => ({ ...prev, [id]: { ok: false, loading: true } }))
190
190
  try {
191
- const res = await api<{ ok: boolean; tools?: string[]; error?: string }>('POST', `/mcp-servers/${id}/test`)
191
+ const res = await api<{ ok: boolean; tools?: string[]; error?: string }>('POST', `/mcp-servers/${id}/test?reset=1`)
192
192
  if (!mountedRef.current) return
193
193
  setStatuses((prev) => ({ ...prev, [id]: { ok: res.ok, tools: res.tools, error: res.error, loading: false } }))
194
194
  if (res.ok) toast.success('Connection test passed')
@@ -11,8 +11,7 @@
11
11
  */
12
12
 
13
13
  import { useEffect, useState } from 'react'
14
-
15
- const REGISTRY_API = 'https://swarmdock-api.onrender.com/api/v1/mcp/servers'
14
+ import { api } from '@/lib/app/api-client'
16
15
 
17
16
  export interface RegistryPrefill {
18
17
  name: string
@@ -100,10 +99,8 @@ export function RegistryBrowser({
100
99
  setError(null)
101
100
  try {
102
101
  const qs = query ? `?q=${encodeURIComponent(query)}&limit=20` : '?limit=20'
103
- const res = await fetch(`${REGISTRY_API}${qs}`)
104
- if (!res.ok) throw new Error(`Registry returned ${res.status}`)
105
- const data = await res.json() as { servers: RegistryServer[] }
106
- if (!cancelled) setServers(data.servers)
102
+ const data = await api<{ servers: RegistryServer[] }>('GET', `/mcp-registry${qs}`)
103
+ if (!cancelled) setServers(data.servers ?? [])
107
104
  } catch (err) {
108
105
  if (!cancelled) setError(err instanceof Error ? err.message : 'Failed to load registry')
109
106
  } finally {
@@ -120,9 +117,7 @@ export function RegistryBrowser({
120
117
  const handleSelect = async (slug: string) => {
121
118
  setSelecting(slug)
122
119
  try {
123
- const res = await fetch(`${REGISTRY_API}/${encodeURIComponent(slug)}`)
124
- if (!res.ok) throw new Error(`Server detail returned ${res.status}`)
125
- const detail = await res.json() as RegistryDetail
120
+ const detail = await api<RegistryDetail>('GET', `/mcp-registry/${encodeURIComponent(slug)}`)
126
121
  const prefill = installToPrefill(detail)
127
122
  if (!prefill) {
128
123
  setError('This server has no installation method SwarmClaw can consume yet.')
@@ -6,6 +6,7 @@ import {
6
6
  evictAllMcpClients,
7
7
  evictMcpClient,
8
8
  getOrConnectMcpClient,
9
+ isConnectionLikeError,
9
10
  isPooled,
10
11
  poolSize,
11
12
  } from './mcp-connection-pool'
@@ -96,3 +97,31 @@ describe('mcp-connection-pool', () => {
96
97
  assert.equal(poolSize(), 0)
97
98
  })
98
99
  })
100
+
101
+ describe('isConnectionLikeError', () => {
102
+ it('returns true for known transport-level error codes', () => {
103
+ const err = Object.assign(new Error('epipe'), { code: 'EPIPE' })
104
+ assert.equal(isConnectionLikeError(err), true)
105
+ const err2 = Object.assign(new Error('reset'), { code: 'ECONNRESET' })
106
+ assert.equal(isConnectionLikeError(err2), true)
107
+ })
108
+
109
+ it('returns true on connection-closed messages', () => {
110
+ assert.equal(isConnectionLikeError(new Error('Connection closed')), true)
111
+ assert.equal(isConnectionLikeError(new Error('MCP server not connected')), true)
112
+ assert.equal(isConnectionLikeError(new Error('child process exited')), true)
113
+ assert.equal(isConnectionLikeError(new Error('socket hang up')), true)
114
+ })
115
+
116
+ it('returns false for ordinary tool-level errors', () => {
117
+ assert.equal(isConnectionLikeError(new Error('GitHub token is invalid')), false)
118
+ assert.equal(isConnectionLikeError(new Error('File not found: /nope')), false)
119
+ assert.equal(isConnectionLikeError(new Error('schema validation failed')), false)
120
+ })
121
+
122
+ it('returns false for non-error inputs', () => {
123
+ assert.equal(isConnectionLikeError(null), false)
124
+ assert.equal(isConnectionLikeError(undefined), false)
125
+ assert.equal(isConnectionLikeError(''), false)
126
+ })
127
+ })
@@ -132,3 +132,19 @@ async function safeDisconnect(entry: PoolEntry): Promise<void> {
132
132
  /* ignore — we're tearing down anyway */
133
133
  }
134
134
  }
135
+
136
+ /**
137
+ * Heuristic: does this error look like the pooled connection is dead (vs. a
138
+ * normal tool-level error the caller should surface)? Conservative by design —
139
+ * we only evict on well-known transport-level signatures so a "your API key is
140
+ * wrong" error from an MCP tool doesn't force a reconnect storm.
141
+ */
142
+ export function isConnectionLikeError(err: unknown): boolean {
143
+ if (!err) return false
144
+ const code = typeof err === 'object' && err && 'code' in err ? String((err as { code: unknown }).code ?? '') : ''
145
+ if (code && /^(ECONNREFUSED|ECONNRESET|EPIPE|EHOSTUNREACH|ETIMEDOUT|ENOTFOUND|ECONNABORTED)$/i.test(code)) {
146
+ return true
147
+ }
148
+ const msg = err instanceof Error ? err.message : String(err)
149
+ return /connection closed|transport closed|server has closed|process exited|child exited|mcp server not connected|read ECONN|write EPIPE|socket hang up|stream closed|unexpected end of (?:json|input|stream)/i.test(msg)
150
+ }
@@ -450,3 +450,35 @@ describe('heartbeatConfigForSession lightContext', () => {
450
450
  assert.equal(cfg.lightContext, false)
451
451
  })
452
452
  })
453
+
454
+ describe('classifyWakeOutcome (runaway-loop guard)', () => {
455
+ it('returns null for a run with visible text and no error', () => {
456
+ assert.equal(mod.classifyWakeOutcome({ text: 'all good', error: null }), null)
457
+ assert.equal(mod.classifyWakeOutcome({ text: 'ORCHESTRATOR_OK' }), null)
458
+ })
459
+
460
+ it('treats a resolved-but-errored result as failure (the 429 regression)', () => {
461
+ const out = mod.classifyWakeOutcome({
462
+ text: '',
463
+ error: '429 All credentials for model gpt-5.4 are cooling down via provider codex',
464
+ })
465
+ assert.equal(out, '429 All credentials for model gpt-5.4 are cooling down via provider codex')
466
+ })
467
+
468
+ it('counts empty visible output as failure so silent wakes trigger backoff', () => {
469
+ assert.equal(mod.classifyWakeOutcome({ text: '' }), 'empty wake response')
470
+ assert.equal(mod.classifyWakeOutcome({ text: ' \n\t' }), 'empty wake response')
471
+ assert.equal(mod.classifyWakeOutcome({}), 'empty wake response')
472
+ assert.equal(mod.classifyWakeOutcome(null), 'empty wake response')
473
+ assert.equal(mod.classifyWakeOutcome(undefined), 'empty wake response')
474
+ })
475
+
476
+ it('ignores a non-string error field and falls back to text check', () => {
477
+ assert.equal(mod.classifyWakeOutcome({ text: 'hi', error: 42 }), null)
478
+ assert.equal(mod.classifyWakeOutcome({ text: '', error: 42 }), 'empty wake response')
479
+ })
480
+
481
+ it('ignores an empty-string error so whitespace errors do not double-count', () => {
482
+ assert.equal(mod.classifyWakeOutcome({ text: 'fine', error: ' ' }), null)
483
+ })
484
+ })
@@ -54,6 +54,23 @@ const ORCHESTRATOR_MIN_INTERVAL_SEC = 60
54
54
  const ORCHESTRATOR_MAX_INTERVAL_SEC = 86400 // 24h
55
55
  const ORCHESTRATOR_MAX_PROMPT_CHARS = 4000
56
56
 
57
+ /**
58
+ * Classify a resolved session-run result as success or failure for the
59
+ * heartbeat/orchestrator outcome tracker. A resolved promise can still
60
+ * carry an error on `result.error` (e.g. a provider 429 that was swallowed
61
+ * into persisted output) or resolve with empty text, and both cases must
62
+ * count as failures — otherwise a stuck wake loop never ticks the
63
+ * failure counter, never backs off, and never auto-disables.
64
+ */
65
+ export function classifyWakeOutcome(result: unknown): string | null {
66
+ if (!result || typeof result !== 'object') return 'empty wake response'
67
+ const obj = result as { error?: unknown; text?: unknown }
68
+ if (typeof obj.error === 'string' && obj.error.trim()) return obj.error
69
+ const text = typeof obj.text === 'string' ? obj.text : ''
70
+ if (!text.trim()) return 'empty wake response'
71
+ return null
72
+ }
73
+
57
74
  interface FailureRecord {
58
75
  count: number
59
76
  lastFailedAt: number
@@ -782,24 +799,28 @@ export async function tickHeartbeats() {
782
799
  state.lastBySession.set(session.id, now)
783
800
 
784
801
  const sid = session.id as string
785
- enqueue.promise.then(() => {
786
- const prev = state.failures.get(sid)
787
- if (prev?.recoveryAttempts) {
788
- log.info('heartbeat', `Recovery successful for session ${sid} after ${prev.recoveryAttempts} attempt(s)`)
802
+ // A session run can "resolve" with an error in result.error (e.g. provider
803
+ // 429 swallowed into the persisted failure) or with empty text. Treat both
804
+ // as failures so backoff and auto-disable trigger, otherwise a stuck
805
+ // heartbeat keeps re-firing at the configured interval and burning tokens.
806
+ const handleHeartbeatOutcome = (failure: string | null) => {
807
+ if (!failure) {
808
+ const prev = state.failures.get(sid)
809
+ if (prev?.recoveryAttempts) {
810
+ log.info('heartbeat', `Recovery successful for session ${sid} after ${prev.recoveryAttempts} attempt(s)`)
811
+ }
812
+ state.failures.delete(sid)
813
+ patchSession(sid, (s) => {
814
+ if (!s) return s
815
+ s.lastDeliveryStatus = 'ok'
816
+ s.lastDeliveredAt = Date.now()
817
+ return s
818
+ })
819
+ return
789
820
  }
790
- state.failures.delete(sid)
791
- // Track successful delivery
792
- patchSession(sid, (s) => {
793
- if (!s) return s
794
- s.lastDeliveryStatus = 'ok'
795
- s.lastDeliveredAt = Date.now()
796
- return s
797
- })
798
- }).catch((err: unknown) => {
799
821
  const prev = state.failures.get(sid)
800
822
  const newCount = (prev?.count ?? 0) + 1
801
823
  const record: FailureRecord = { count: newCount, lastFailedAt: Date.now() }
802
- // Auto-disable heartbeat after too many consecutive failures to prevent resource waste
803
824
  if (newCount >= MAX_CONSECUTIVE_FAILURES) {
804
825
  record.autoDisabledAt = Date.now()
805
826
  log.warn('heartbeat', `Auto-disabling heartbeat for session ${sid} after ${newCount} consecutive failures`)
@@ -821,17 +842,20 @@ export async function tickHeartbeats() {
821
842
  })
822
843
  }
823
844
  state.failures.set(sid, record)
824
- const msg = errorMessage(err)
825
- log.warn('heartbeat', `Heartbeat run failed for session ${sid} (${newCount}/${MAX_CONSECUTIVE_FAILURES})`, msg)
826
- // Track failed delivery
845
+ log.warn('heartbeat', `Heartbeat run failed for session ${sid} (${newCount}/${MAX_CONSECUTIVE_FAILURES})`, failure)
827
846
  patchSession(sid, (s) => {
828
847
  if (!s) return s
829
848
  s.lastDeliveryStatus = 'error'
830
- s.lastDeliveryError = msg
849
+ s.lastDeliveryError = failure
831
850
  s.lastDeliveredAt = Date.now()
832
851
  return s
833
852
  })
834
- })
853
+ }
854
+ enqueue.promise
855
+ .then((result) => handleHeartbeatOutcome(classifyWakeOutcome(result)))
856
+ .catch((err: unknown) => {
857
+ handleHeartbeatOutcome(errorMessage(err) || 'heartbeat rejected')
858
+ })
835
859
  }
836
860
  }
837
861
 
@@ -1118,10 +1142,15 @@ export async function tickOrchestratorAgents() {
1118
1142
 
1119
1143
  log.info('orchestrator', `Woke orchestrator agent ${agent.name} (${agent.id}), cycle #${(agent.orchestratorCycleCount || 0) + 1}`)
1120
1144
 
1121
- // Track success/failure
1122
- enqueue.promise.then(() => {
1123
- orchestratorState.failures.delete(agent.id)
1124
- }).catch((err: unknown) => {
1145
+ // Track success/failure. A run can "resolve" but still carry an error
1146
+ // on the result (e.g. provider 429 that was caught and persisted), so we
1147
+ // inspect the resolved result as well as the rejected path — otherwise
1148
+ // a stuck wake loop never ticks the failure counter and never backs off.
1149
+ const handleWakeOutcome = (failure: string | null) => {
1150
+ if (!failure) {
1151
+ orchestratorState.failures.delete(agent.id)
1152
+ return
1153
+ }
1125
1154
  const prev = orchestratorState.failures.get(agent.id)
1126
1155
  const newCount = (prev?.count ?? 0) + 1
1127
1156
  const record: FailureRecord = { count: newCount, lastFailedAt: Date.now() }
@@ -1146,8 +1175,13 @@ export async function tickOrchestratorAgents() {
1146
1175
  })
1147
1176
  }
1148
1177
  orchestratorState.failures.set(agent.id, record)
1149
- log.warn('orchestrator', `Orchestrator wake failed for agent ${agent.id} (${newCount}/${MAX_CONSECUTIVE_FAILURES})`, errorMessage(err))
1150
- })
1178
+ log.warn('orchestrator', `Orchestrator wake failed for agent ${agent.id} (${newCount}/${MAX_CONSECUTIVE_FAILURES})`, failure)
1179
+ }
1180
+ enqueue.promise
1181
+ .then((result) => handleWakeOutcome(classifyWakeOutcome(result)))
1182
+ .catch((err: unknown) => {
1183
+ handleWakeOutcome(errorMessage(err) || 'wake rejected')
1184
+ })
1151
1185
  } catch (err) {
1152
1186
  log.warn('orchestrator', `Error ticking orchestrator agent ${agent.id}:`, errorMessage(err))
1153
1187
  }
@@ -62,7 +62,7 @@ import {
62
62
  shouldExposeMcpTool,
63
63
  type DiscoveredTool,
64
64
  } from '../mcp-gateway-runtime'
65
- import { getOrConnectMcpClient } from '../mcp-connection-pool'
65
+ import { getOrConnectMcpClient, evictMcpClient, isConnectionLikeError } from '../mcp-connection-pool'
66
66
  import {
67
67
  getEnabledCapabilitySelection,
68
68
  isExternalExtensionId,
@@ -94,6 +94,37 @@ function inferBareName(langChainName: string, serverName: string): string {
94
94
  return langChainName.startsWith(prefix) ? langChainName.slice(prefix.length) : langChainName
95
95
  }
96
96
 
97
+ /**
98
+ * Wraps an MCP-sourced LangChain tool so connection-class failures (stdio pipe
99
+ * closed, HTTP reset, etc.) evict the pool entry, letting the next turn
100
+ * rebuild the client fresh. Non-connection errors (validation, tool logic,
101
+ * auth) propagate unchanged — we trust the downstream's isError signal.
102
+ */
103
+ function wrapMcpToolWithPoolEviction(
104
+ inner: StructuredToolInterface,
105
+ serverId: string,
106
+ ): StructuredToolInterface {
107
+ const wrappedCallback = async (args: unknown): Promise<unknown> => {
108
+ try {
109
+ return await inner.invoke(args as Record<string, unknown>)
110
+ } catch (err: unknown) {
111
+ if (isConnectionLikeError(err)) {
112
+ void evictMcpClient(serverId).catch(() => undefined)
113
+ log.warn('session-tools', `MCP tool "${inner.name}" connection error — evicted pool entry for ${serverId}`, {
114
+ error: errorMessage(err),
115
+ })
116
+ }
117
+ throw err
118
+ }
119
+ }
120
+ return tool(wrappedCallback, {
121
+ name: inner.name,
122
+ description: inner.description,
123
+ // Re-use the inner tool's zod schema so shape/validation is identical.
124
+ schema: (inner as unknown as { schema: z.ZodType }).schema,
125
+ })
126
+ }
127
+
97
128
  export async function buildSessionTools(cwd: string, enabledExtensions: string[], ctx?: ToolContext): Promise<SessionToolsResult> {
98
129
  const tools: StructuredToolInterface[] = []
99
130
  const cleanupFns: (() => Promise<void>)[] = []
@@ -354,7 +385,7 @@ export async function buildSessionTools(cwd: string, enabledExtensions: string[]
354
385
  })
355
386
  if (!shouldBind) continue
356
387
  toolToExtensionMap[t.name] = `mcp:${serverId}`
357
- tools.push(t)
388
+ tools.push(wrapMcpToolWithPoolEviction(t, serverId))
358
389
  }
359
390
  } catch (err: unknown) {
360
391
  log.warn('session-tools', `Failed to connect MCP server "${config.name}"`, { serverId, error: errorMessage(err) })