clawmatrix 0.2.7 → 0.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "clawmatrix",
3
- "version": "0.2.7",
3
+ "version": "0.2.8",
4
4
  "description": "Decentralized mesh cluster plugin for OpenClaw — inter-gateway communication, model proxy, task handoff, and tool proxy.",
5
5
  "type": "module",
6
6
  "license": "MIT",
package/src/acp-proxy.ts CHANGED
@@ -8,7 +8,7 @@ import {
8
8
  import type { PeerManager } from "./peer-manager.ts";
9
9
  import type { ClawMatrixConfig } from "./config.ts";
10
10
  import type { GatewayInfo } from "./tool-proxy.ts";
11
- import { spawnProcess, readFileText } from "./compat.ts";
11
+ import { spawnProcess, readFileText, readFileHead, readFileTail } from "./compat.ts";
12
12
  import { debug } from "./debug.ts";
13
13
  import { homedir } from "node:os";
14
14
  import { join } from "node:path";
@@ -57,6 +57,29 @@ const DEFAULT_SESSION_TTL = 1_800_000;
57
57
  const DEFAULT_MAX_SESSIONS = 5;
58
58
  const CLEANUP_INTERVAL = 120_000;
59
59
  const SESSION_INIT_TIMEOUT = 60_000; // 60s timeout for ACP agent initialization
60
+ const REUSE_POOL_TTL = 60_000; // keep oneshot sessions alive for 60s for reuse
61
+ const PREWARM_DELAY = 5_000; // delay before prewarming after a session is used
62
+ const DAEMON_IDLE_TTL = 300_000; // kill daemon after 5 min idle
63
+
64
+ // ── Agent Daemon: long-lived process per agent type ─────────────
65
+
66
+ /**
67
+ * A long-lived ACP agent process that stays alive across multiple sessions.
68
+ * Instead of spawning a new process per session, we reuse the same initialized
69
+ * connection and call newSession() which takes milliseconds vs seconds for a cold spawn.
70
+ */
71
+ interface AgentDaemon {
72
+ agent: string;
73
+ cwd: string;
74
+ conn: ClientSideConnection;
75
+ proc: { kill: () => void; exited: Promise<number> };
76
+ /** Per-session stream callbacks keyed by ACP session ID */
77
+ streamCallbacks: Map<string, ((delta: string, event?: string) => void) | null>;
78
+ lastActiveAt: number;
79
+ idleTimer: ReturnType<typeof setTimeout>;
80
+ /** Set to true when process has exited */
81
+ dead: boolean;
82
+ }
60
83
 
61
84
  // ── Session state on receiver side ───────────────────────────────
62
85
 
@@ -124,6 +147,17 @@ export class AcpProxy {
124
147
  private cleanupTimer: ReturnType<typeof setInterval> | null = null;
125
148
  // Multi-device sync: track which nodes are watching each session
126
149
  private sessionWatchers = new Map<string, Set<string>>();
150
+ // Session reuse pool: oneshot sessions kept alive briefly for reuse
151
+ private reusePool = new Map<string, { session: AcpSession; timer: ReturnType<typeof setTimeout> }>();
152
+ // Prewarm pool: pre-initialized ACP connections ready for immediate use
153
+ private warmPool = new Map<string, AcpSession[]>();
154
+ // Track pending prewarm timers so they can be cancelled on dispose
155
+ private prewarmTimers = new Set<ReturnType<typeof setTimeout>>();
156
+ private disposed = false;
157
+ // Agent daemon pool: long-lived process per agent type, reused across sessions
158
+ private daemons = new Map<string, AgentDaemon>();
159
+ // In-flight daemon acquisition: prevents multiple concurrent spawns for same agent
160
+ private daemonStarting = new Map<string, Promise<AgentDaemon>>();
127
161
 
128
162
  constructor(config: ClawMatrixConfig, peerManager: PeerManager, openclawConfig?: Record<string, unknown>, gatewayInfo?: GatewayInfo) {
129
163
  this.config = config;
@@ -577,7 +611,7 @@ export class AcpProxy {
577
611
  });
578
612
  }
579
613
  if (mode === "oneshot" && task) {
580
- this.destroySession(newSession);
614
+ this.returnToReusePool(newSession, cwd ?? process.cwd());
581
615
  }
582
616
  } else {
583
617
  // Re-create native session
@@ -637,7 +671,7 @@ export class AcpProxy {
637
671
  });
638
672
  }
639
673
  if (mode === "oneshot" && task) {
640
- this.destroySession(session);
674
+ this.returnToReusePool(session, cwd ?? process.cwd());
641
675
  }
642
676
  } else {
643
677
  // Create new native OpenClaw session
@@ -871,6 +905,15 @@ export class AcpProxy {
871
905
  return;
872
906
  }
873
907
 
908
+ if (session.kind !== "acp") {
909
+ this.peerManager.sendTo(from, {
910
+ type: "acp_set_mode_res", id, from: this.config.nodeId, to: from,
911
+ timestamp: Date.now(),
912
+ payload: { success: false, error: "Mode switching not supported for native sessions" },
913
+ } satisfies AcpSetModeResponse);
914
+ return;
915
+ }
916
+
874
917
  try {
875
918
  await session.conn.setSessionMode({
876
919
  sessionId: session.acpSessionId,
@@ -910,8 +953,8 @@ export class AcpProxy {
910
953
  timestamp: Date.now(),
911
954
  payload: {
912
955
  success: true,
913
- modes: session.availableModes,
914
- currentModeId: session.currentModeId,
956
+ modes: session.kind === "acp" ? session.availableModes : undefined,
957
+ currentModeId: session.kind === "acp" ? session.currentModeId : undefined,
915
958
  },
916
959
  } satisfies AcpGetModesResponse);
917
960
  }
@@ -1022,17 +1065,17 @@ export class AcpProxy {
1022
1065
  const sessionsDir = join(agentsDir, agentId, "sessions");
1023
1066
  if (entry.sessionFile) {
1024
1067
  const candidate = join(sessionsDir, entry.sessionFile);
1025
- try { await readFileText(candidate); return candidate; } catch { /* try default */ }
1068
+ try { await access(candidate); return candidate; } catch { /* try default */ }
1026
1069
  }
1027
1070
  const candidate = join(sessionsDir, `${sessionId}.jsonl`);
1028
- try { await readFileText(candidate); return candidate; } catch { /* continue */ }
1071
+ try { await access(candidate); return candidate; } catch { /* continue */ }
1029
1072
  }
1030
1073
  }
1031
1074
  } catch { /* store unreadable */ }
1032
1075
 
1033
1076
  // Fallback: try direct file path
1034
1077
  const candidate = join(agentsDir, agentId, "sessions", `${sessionId}.jsonl`);
1035
- try { await readFileText(candidate); return candidate; } catch { /* next agent */ }
1078
+ try { await access(candidate); return candidate; } catch { /* next agent */ }
1036
1079
  }
1037
1080
  } catch { /* no agents directory */ }
1038
1081
 
@@ -1049,16 +1092,282 @@ export class AcpProxy {
1049
1092
  return null;
1050
1093
  }
1051
1094
 
1095
+ // ── Internal: agent daemon pool ─────────────────────────────────
1096
+
1097
+ /**
1098
+ * Get or create a long-lived daemon for the given agent.
1099
+ * The daemon process stays alive and its initialized connection is reused
1100
+ * for all subsequent newSession() calls — eliminating spawn+init overhead.
1101
+ */
1102
+ private async getOrCreateDaemon(agent: string, cwd: string): Promise<AgentDaemon> {
1103
+ const key = `${agent}:${cwd}`;
1104
+ // Return existing healthy daemon for this agent+cwd
1105
+ const existing = this.daemons.get(key);
1106
+ if (existing && !existing.dead) {
1107
+ existing.lastActiveAt = Date.now();
1108
+ this.resetDaemonIdleTimer(existing);
1109
+ return existing;
1110
+ }
1111
+
1112
+ // Prevent concurrent spawns for the same agent+cwd
1113
+ const inflight = this.daemonStarting.get(key);
1114
+ if (inflight) return inflight;
1115
+
1116
+ const promise = this.spawnDaemon(agent, cwd);
1117
+ this.daemonStarting.set(key, promise);
1118
+ try {
1119
+ const daemon = await promise;
1120
+ return daemon;
1121
+ } finally {
1122
+ this.daemonStarting.delete(key);
1123
+ }
1124
+ }
1125
+
1126
+ private async spawnDaemon(agent: string, cwd: string): Promise<AgentDaemon> {
1127
+ const streamCallbacks = new Map<string, ((delta: string, event?: string) => void) | null>();
1128
+
1129
+ const { conn, proc } = await this.spawnAndInit(agent, cwd, "spawnDaemon", (params) => {
1130
+ const acpSessionId = (params as unknown as { sessionId?: string }).sessionId;
1131
+ if (!acpSessionId) {
1132
+ debug("acp", `[${agent}:daemon] sessionUpdate missing sessionId, cannot route`);
1133
+ return null;
1134
+ }
1135
+ return streamCallbacks.get(acpSessionId) ?? null;
1136
+ });
1137
+
1138
+ debug("acp", `[${agent}] daemon initialized`);
1139
+
1140
+ const key = `${agent}:${cwd}`;
1141
+ const daemon: AgentDaemon = {
1142
+ agent,
1143
+ cwd,
1144
+ conn,
1145
+ proc,
1146
+ streamCallbacks,
1147
+ lastActiveAt: Date.now(),
1148
+ idleTimer: setTimeout(() => {}, 0), // placeholder, reset below
1149
+ dead: false,
1150
+ };
1151
+
1152
+ // Monitor process exit — clean up daemon and any sessions using its connection
1153
+ const onDaemonExit = () => {
1154
+ daemon.dead = true;
1155
+ // Only remove from map if this daemon hasn't been replaced by a new one
1156
+ if (this.daemons.get(key) === daemon) this.daemons.delete(key);
1157
+ // Clean up sessions that were using this daemon's connection
1158
+ for (const [sid, s] of this.sessions) {
1159
+ if (s.kind === "acp" && s.conn === daemon.conn) {
1160
+ this.sessions.delete(sid);
1161
+ debug("acp", `daemon exited: cleaned up session ${sid} (agent=${daemon.agent})`);
1162
+ }
1163
+ }
1164
+ };
1165
+ proc.exited.then(onDaemonExit).catch(onDaemonExit);
1166
+
1167
+ this.resetDaemonIdleTimer(daemon);
1168
+ this.daemons.set(key, daemon);
1169
+ return daemon;
1170
+ }
1171
+
1172
+ private resetDaemonIdleTimer(daemon: AgentDaemon) {
1173
+ clearTimeout(daemon.idleTimer);
1174
+ daemon.idleTimer = setTimeout(() => {
1175
+ if (daemon.dead) return;
1176
+ // Only kill if no active sessions are using this daemon
1177
+ const hasActiveSessions = [...this.sessions.values()].some(
1178
+ (s) => s.kind === "acp" && s.agent === daemon.agent && s.conn === daemon.conn,
1179
+ );
1180
+ if (!hasActiveSessions) {
1181
+ debug("acp", `daemon idle timeout: killing ${daemon.agent} (cwd=${daemon.cwd})`);
1182
+ daemon.dead = true;
1183
+ this.daemons.delete(`${daemon.agent}:${daemon.cwd}`);
1184
+ try { daemon.proc.kill(); } catch { /* best effort */ }
1185
+ } else {
1186
+ // Sessions still active, reschedule
1187
+ this.resetDaemonIdleTimer(daemon);
1188
+ }
1189
+ }, DAEMON_IDLE_TTL);
1190
+ }
1191
+
1192
+ /**
1193
+ * Create a new ACP session using the daemon's long-lived connection.
1194
+ * Falls back to spawnAndConnect if daemon is unavailable.
1195
+ */
1196
+ private async createSessionViaDaemon(
1197
+ agent: string,
1198
+ cwd: string,
1199
+ from: string,
1200
+ ): Promise<AcpSession> {
1201
+ const daemon = await this.getOrCreateDaemon(agent, cwd);
1202
+
1203
+ const response = await daemon.conn.newSession({ cwd, mcpServers: [] });
1204
+ debug("acp", `[${agent}] daemon session created: ${response.sessionId}`);
1205
+
1206
+ const availableModes: AcpModeInfo[] | undefined = response.modes?.availableModes?.map(
1207
+ (m) => ({ id: m.id, name: m.name, description: m.description ?? undefined }),
1208
+ );
1209
+
1210
+ const session: AcpSession = {
1211
+ kind: "acp",
1212
+ sessionId: crypto.randomUUID(),
1213
+ agent,
1214
+ acpSessionId: response.sessionId,
1215
+ conn: daemon.conn,
1216
+ proc: daemon.proc,
1217
+ lastActiveAt: Date.now(),
1218
+ from,
1219
+ setStreamCallback: (cb) => {
1220
+ daemon.streamCallbacks.set(response.sessionId, cb);
1221
+ },
1222
+ availableModes,
1223
+ currentModeId: response.modes?.currentModeId,
1224
+ };
1225
+
1226
+ return session;
1227
+ }
1228
+
1229
+ // ── Internal: session reuse & prewarming ─────────────────────────
1230
+
1231
+ /** Try to grab a reusable session from the reuse pool (same agent, same cwd). */
1232
+ private takeFromReusePool(agent: string, cwd: string): AcpSession | null {
1233
+ const key = `${agent}:${cwd}`;
1234
+ const entry = this.reusePool.get(key);
1235
+ if (!entry) return null;
1236
+ this.reusePool.delete(key);
1237
+ clearTimeout(entry.timer);
1238
+ debug("acp", `reuse pool hit: agent=${agent} cwd=${cwd} sessionId=${entry.session.sessionId}`);
1239
+ return entry.session;
1240
+ }
1241
+
1242
+ /** Find the daemon that owns this session's connection (if any). */
1243
+ private findDaemonByConn(session: AcpSession): AgentDaemon | null {
1244
+ // Fast path: check the exact agent+cwd key
1245
+ for (const daemon of this.daemons.values()) {
1246
+ if (daemon.conn === session.conn) return daemon;
1247
+ }
1248
+ return null;
1249
+ }
1250
+
1251
+ /** Return a completed oneshot session to the reuse pool instead of destroying it. */
1252
+ private returnToReusePool(session: AcpSession, cwd: string) {
1253
+ // Daemon-backed sessions don't need reuse pool — the daemon stays alive
1254
+ const daemon = this.findDaemonByConn(session);
1255
+ if (daemon) {
1256
+ daemon.streamCallbacks.delete(session.acpSessionId);
1257
+ debug("acp", `daemon-backed oneshot done: agent=${session.agent} (daemon stays alive)`);
1258
+ return;
1259
+ }
1260
+
1261
+ const key = `${session.agent}:${cwd}`;
1262
+ // Evict any existing entry for the same key
1263
+ const existing = this.reusePool.get(key);
1264
+ if (existing) {
1265
+ clearTimeout(existing.timer);
1266
+ this.destroySession(existing.session);
1267
+ }
1268
+ const timer = setTimeout(() => {
1269
+ const entry = this.reusePool.get(key);
1270
+ if (entry?.session.sessionId === session.sessionId) {
1271
+ this.reusePool.delete(key);
1272
+ this.destroySession(session);
1273
+ debug("acp", `reuse pool expired: agent=${session.agent} cwd=${cwd}`);
1274
+ }
1275
+ }, REUSE_POOL_TTL);
1276
+ this.reusePool.set(key, { session, timer });
1277
+ debug("acp", `reuse pool stored: agent=${session.agent} cwd=${cwd} ttl=${REUSE_POOL_TTL / 1000}s`);
1278
+ }
1279
+
1280
+ /** Try to grab a pre-warmed session from the warm pool (same agent + cwd). */
1281
+ private takeFromWarmPool(agent: string, cwd: string): AcpSession | null {
1282
+ const key = `${agent}:${cwd}`;
1283
+ const pool = this.warmPool.get(key);
1284
+ if (!pool || pool.length === 0) return null;
1285
+ const session = pool.shift()!;
1286
+ debug("acp", `warm pool hit: agent=${agent} cwd=${cwd} sessionId=${session.sessionId}`);
1287
+ return session;
1288
+ }
1289
+
1290
+ /** Schedule a prewarm for the given agent after a short delay. */
1291
+ private schedulePrewarm(agent: string, cwd: string) {
1292
+ const timer = setTimeout(async () => {
1293
+ this.prewarmTimers.delete(timer);
1294
+ if (this.disposed) return;
1295
+ // Only prewarm if the warm pool is empty for this agent+cwd
1296
+ const key = `${agent}:${cwd}`;
1297
+ const pool = this.warmPool.get(key);
1298
+ if (pool && pool.length > 0) return;
1299
+ try {
1300
+ debug("acp", `prewarming agent=${agent} cwd=${cwd}`);
1301
+ const session = await this.spawnAndInitSession(agent, cwd, "prewarm");
1302
+ if (this.disposed) { this.destroySession(session); return; }
1303
+ if (!this.warmPool.has(key)) this.warmPool.set(key, []);
1304
+ this.warmPool.get(key)!.push(session);
1305
+ debug("acp", `warm pool ready: agent=${agent} sessionId=${session.sessionId}`);
1306
+ } catch (err) {
1307
+ debug("acp", `prewarm failed: agent=${agent} error=${errorMessage(err)}`);
1308
+ }
1309
+ }, PREWARM_DELAY);
1310
+ this.prewarmTimers.add(timer);
1311
+ }
1312
+
1052
1313
  // ── Internal: ACP session management (receiver) ────────────────
1053
1314
 
1054
1315
  private async createSession(agent: string, cwd: string | undefined, from: string): Promise<AcpSession> {
1055
- const cmd = this.resolveCommand(agent);
1056
1316
  const effectiveCwd = cwd ?? process.cwd();
1057
1317
 
1058
- debug("acp", `createSession: spawning ${cmd.join(" ")} in ${effectiveCwd} (for ${from})`);
1318
+ // 1. Try daemon pool first (long-lived process, instant newSession)
1319
+ try {
1320
+ return await this.createSessionViaDaemon(agent, effectiveCwd, from);
1321
+ } catch (err) {
1322
+ debug("acp", `daemon session failed for ${agent}: ${errorMessage(err)}, falling back`);
1323
+ }
1324
+
1325
+ // 2. Check reuse pool (same agent + cwd, still alive)
1326
+ const reused = this.takeFromReusePool(agent, effectiveCwd);
1327
+ if (reused) {
1328
+ reused.from = from;
1329
+ reused.lastActiveAt = Date.now();
1330
+ reused.sessionId = crypto.randomUUID();
1331
+ this.monitorProcess(reused);
1332
+ return reused;
1333
+ }
1334
+
1335
+ // 3. Check warm pool (pre-initialized, same agent + cwd)
1336
+ const warm = this.takeFromWarmPool(agent, effectiveCwd);
1337
+ if (warm) {
1338
+ warm.from = from;
1339
+ warm.lastActiveAt = Date.now();
1340
+ warm.sessionId = crypto.randomUUID();
1341
+ this.monitorProcess(warm);
1342
+ this.schedulePrewarm(agent, effectiveCwd);
1343
+ return warm;
1344
+ }
1345
+
1346
+ // 4. Cold start: spawn + initialize (last resort)
1347
+ const session = await this.spawnAndInitSession(agent, effectiveCwd, from);
1348
+ this.schedulePrewarm(agent, effectiveCwd);
1349
+ return session;
1350
+ }
1351
+
1352
+ /**
1353
+ * Spawn an ACP agent process, set up NDJSON/stdio, and initialize the ACP connection.
1354
+ * Returns the initialized connection and process handle.
1355
+ * Shared by spawnDaemon (daemon pool) and spawnAndConnect (per-session).
1356
+ *
1357
+ * @param resolveStreamCb Called on each sessionUpdate to resolve the stream callback.
1358
+ * For daemons this routes by ACP session ID; for per-session it returns a closure variable.
1359
+ */
1360
+ private async spawnAndInit(
1361
+ agent: string,
1362
+ cwd: string,
1363
+ label: string,
1364
+ resolveStreamCb: (params: SessionNotification) => ((delta: string, event?: string) => void) | null | undefined,
1365
+ ): Promise<{ conn: ClientSideConnection; proc: { kill: () => void; exited: Promise<number> } }> {
1366
+ const cmd = this.resolveCommand(agent);
1367
+ debug("acp", `${label}: spawning ${cmd.join(" ")} in ${cwd}`);
1059
1368
 
1060
1369
  const proc = spawnProcess(cmd, {
1061
- cwd: effectiveCwd,
1370
+ cwd,
1062
1371
  stdout: "pipe",
1063
1372
  stderr: "pipe",
1064
1373
  stdin: "pipe",
@@ -1069,7 +1378,7 @@ export class AcpProxy {
1069
1378
  throw new Error(`Failed to spawn ACP agent "${agent}": no stdio streams`);
1070
1379
  }
1071
1380
 
1072
- // Capture stderr for diagnostics (agent errors, missing API keys, etc.)
1381
+ // Capture stderr for diagnostics
1073
1382
  if (proc.stderr) {
1074
1383
  const reader = (proc.stderr as ReadableStream<Uint8Array>).getReader();
1075
1384
  const decoder = new TextDecoder();
@@ -1085,19 +1394,14 @@ export class AcpProxy {
1085
1394
  })();
1086
1395
  }
1087
1396
 
1088
- // Detect early process exit (e.g., npx not found, package install failure)
1089
1397
  let earlyExit = false;
1090
1398
  const earlyExitPromise = proc.exited.then((code) => {
1091
1399
  earlyExit = true;
1092
1400
  throw new Error(`ACP agent "${agent}" exited during init with code ${code}`);
1093
1401
  });
1094
1402
 
1095
- // Build ACP connection over NDJSON/stdio
1096
1403
  const stream = ndJsonStream(proc.stdin, proc.stdout);
1097
1404
 
1098
- // Accumulated text for streaming back — set per-prompt via closure
1099
- let streamCallback: ((delta: string, event?: string) => void) | null = null;
1100
-
1101
1405
  const conn = new ClientSideConnection(
1102
1406
  (_agentRef) => ({
1103
1407
  requestPermission: async (params: RequestPermissionRequest): Promise<RequestPermissionResponse> => {
@@ -1106,23 +1410,21 @@ export class AcpProxy {
1106
1410
  sessionUpdate: async (params: SessionNotification): Promise<void> => {
1107
1411
  const update = params.update as Record<string, unknown>;
1108
1412
  const updateType = update.sessionUpdate as string;
1109
-
1110
- // Extract text delta from content chunks
1413
+ const cb = resolveStreamCb(params);
1111
1414
  if (updateType === "agent_message_chunk" || updateType === "agent_thought_chunk") {
1112
1415
  const content = (update as { content?: { type?: string; text?: string } }).content;
1113
1416
  if (content?.type === "text" && content.text) {
1114
- streamCallback?.(content.text, updateType);
1417
+ cb?.(content.text, updateType);
1115
1418
  }
1116
1419
  } else if (updateType === "tool_call") {
1117
1420
  const toolName = (update as { title?: string }).title
1118
1421
  || (update as { toolCallId?: string }).toolCallId
1119
1422
  || "unknown";
1120
- debug("acp", `tool_call update: title=${JSON.stringify((update as any).title)} toolCallId=${JSON.stringify((update as any).toolCallId)} keys=${Object.keys(update).join(",")}`);
1121
- streamCallback?.(`\n[tool_call: ${toolName}]\n`, updateType);
1423
+ cb?.(`\n[tool_call: ${toolName}]\n`, updateType);
1122
1424
  } else if (updateType === "tool_call_update") {
1123
1425
  const status = (update as { status?: string }).status;
1124
1426
  if (status === "completed" || status === "error") {
1125
- streamCallback?.("", "tool_result");
1427
+ cb?.("", "tool_result");
1126
1428
  }
1127
1429
  }
1128
1430
  },
@@ -1130,74 +1432,86 @@ export class AcpProxy {
1130
1432
  stream,
1131
1433
  );
1132
1434
 
1133
- // Initialize with timeout — prevents hanging if agent process is stuck
1435
+ let initTimer: ReturnType<typeof setTimeout>;
1134
1436
  const initTimeout = new Promise<never>((_, reject) => {
1135
- setTimeout(() => reject(new Error(
1437
+ initTimer = setTimeout(() => reject(new Error(
1136
1438
  `ACP agent "${agent}" initialization timed out after ${SESSION_INIT_TIMEOUT / 1000}s`,
1137
1439
  )), SESSION_INIT_TIMEOUT);
1138
1440
  });
1139
1441
 
1140
- const initAndSession = async () => {
1141
- // Initialize the ACP connection
1142
- debug("acp", `[${agent}] initializing ACP connection...`);
1143
- await conn.initialize({
1144
- protocolVersion: 1,
1145
- clientInfo: { name: "ClawMatrix", version: "0.1.0" },
1146
- clientCapabilities: {},
1147
- });
1148
- debug("acp", `[${agent}] ACP connection initialized, creating session...`);
1149
-
1150
- // Create a new session
1151
- return conn.newSession({
1152
- cwd: effectiveCwd,
1153
- mcpServers: [],
1154
- });
1155
- };
1156
-
1157
- let sessionResponse: Awaited<ReturnType<typeof conn.newSession>>;
1158
1442
  try {
1159
- sessionResponse = await Promise.race([
1160
- initAndSession(),
1443
+ await Promise.race([
1444
+ conn.initialize({
1445
+ protocolVersion: 1,
1446
+ clientInfo: { name: "ClawMatrix", version: "0.1.0" },
1447
+ clientCapabilities: {},
1448
+ }),
1161
1449
  initTimeout,
1162
1450
  earlyExitPromise as Promise<never>,
1163
1451
  ]);
1164
1452
  } catch (err) {
1165
- const msg = err instanceof Error ? err.message : (typeof err === "string" ? err : JSON.stringify(err));
1166
- debug("acp", `[${agent}] init failed: ${msg}`);
1167
- // Kill the process if init failed
1168
1453
  if (!earlyExit) proc.kill();
1169
- throw new Error(`ACP agent "${agent}" init failed: ${msg}`);
1454
+ throw err;
1455
+ } finally {
1456
+ clearTimeout(initTimer!);
1170
1457
  }
1171
- debug("acp", `[${agent}] session created: ${sessionResponse.sessionId}`);
1172
1458
 
1173
- // Extract available modes from session response
1174
- const availableModes: AcpModeInfo[] | undefined = sessionResponse.modes?.availableModes?.map(
1459
+ return { conn, proc };
1460
+ }
1461
+
1462
+ /**
1463
+ * Spawn an ACP agent, initialize, then call `sessionFactory` to create/resume the session.
1464
+ * Used for per-session (non-daemon) spawns.
1465
+ */
1466
+ private async spawnAndConnect(
1467
+ agent: string,
1468
+ cwd: string,
1469
+ from: string,
1470
+ label: string,
1471
+ sessionFactory: (conn: ClientSideConnection, cwd: string) => Promise<{ sessionId: string; modes?: { availableModes?: Array<{ id: string; name: string; description?: string }>; currentModeId?: string } }>,
1472
+ ): Promise<AcpSession> {
1473
+ let streamCallback: ((delta: string, event?: string) => void) | null = null;
1474
+
1475
+ const { conn, proc } = await this.spawnAndInit(agent, cwd, label, () => streamCallback);
1476
+
1477
+ let response: Awaited<ReturnType<typeof sessionFactory>>;
1478
+ try {
1479
+ response = await sessionFactory(conn, cwd);
1480
+ } catch (err) {
1481
+ proc.kill();
1482
+ throw err;
1483
+ }
1484
+ debug("acp", `[${agent}] session ready: ${response.sessionId}`);
1485
+
1486
+ const availableModes: AcpModeInfo[] | undefined = response.modes?.availableModes?.map(
1175
1487
  (m) => ({ id: m.id, name: m.name, description: m.description ?? undefined }),
1176
1488
  );
1177
- const currentModeId = sessionResponse.modes?.currentModeId;
1178
-
1179
- const sessionId = crypto.randomUUID();
1180
1489
 
1181
1490
  const session: AcpSession = {
1182
1491
  kind: "acp",
1183
- sessionId,
1492
+ sessionId: crypto.randomUUID(),
1184
1493
  agent,
1185
- acpSessionId: sessionResponse.sessionId,
1494
+ acpSessionId: response.sessionId,
1186
1495
  conn,
1187
1496
  proc,
1188
1497
  lastActiveAt: Date.now(),
1189
1498
  from,
1190
1499
  setStreamCallback: (cb) => { streamCallback = cb; },
1191
1500
  availableModes,
1192
- currentModeId,
1501
+ currentModeId: response.modes?.currentModeId,
1193
1502
  };
1194
1503
 
1195
- // Monitor process exit — clean up dead sessions proactively
1196
1504
  this.monitorProcess(session);
1197
-
1198
1505
  return session;
1199
1506
  }
1200
1507
 
1508
+ /** Spawn a new ACP agent process, initialize ACP connection, and create a session. */
1509
+ private spawnAndInitSession(agent: string, cwd: string, from: string): Promise<AcpSession> {
1510
+ return this.spawnAndConnect(agent, cwd, from, "spawnAndInitSession", (conn, effectiveCwd) =>
1511
+ conn.newSession({ cwd: effectiveCwd, mcpServers: [] }),
1512
+ );
1513
+ }
1514
+
1201
1515
  /** Watch for unexpected process exit and clean up the session. */
1202
1516
  private monitorProcess(session: AcpSession) {
1203
1517
  session.proc.exited.then(() => {
@@ -1386,7 +1700,7 @@ export class AcpProxy {
1386
1700
 
1387
1701
  const reader = body.getReader();
1388
1702
  const decoder = new TextDecoder();
1389
- let full = "";
1703
+ const chunks: string[] = [];
1390
1704
  let buffer = "";
1391
1705
 
1392
1706
  try {
@@ -1407,7 +1721,7 @@ export class AcpProxy {
1407
1721
  const parsed = JSON.parse(data);
1408
1722
  const delta = parsed.choices?.[0]?.delta?.content;
1409
1723
  if (delta) {
1410
- full += delta;
1724
+ chunks.push(delta);
1411
1725
  const streamFrame: AcpStreamChunk = {
1412
1726
  type: "acp_stream",
1413
1727
  id: requestId,
@@ -1428,141 +1742,19 @@ export class AcpProxy {
1428
1742
  reader.releaseLock();
1429
1743
  }
1430
1744
 
1431
- return full;
1745
+ return chunks.join("");
1432
1746
  }
1433
1747
 
1434
1748
  /** Create a session by resuming an existing ACP session ID. */
1435
- private async createSessionWithResume(
1749
+ private createSessionWithResume(
1436
1750
  agent: string,
1437
1751
  acpSessionId: string,
1438
1752
  cwd: string,
1439
1753
  from: string,
1440
1754
  ): Promise<AcpSession> {
1441
- const cmd = this.resolveCommand(agent);
1442
- debug("acp", `createSessionWithResume: spawning ${cmd.join(" ")} in ${cwd} (resume ${acpSessionId.slice(0, 8)}...)`);
1443
-
1444
- const proc = spawnProcess(cmd, {
1445
- cwd,
1446
- stdout: "pipe",
1447
- stderr: "pipe",
1448
- stdin: "pipe",
1449
- });
1450
-
1451
- if (!proc.stdin || !proc.stdout) {
1452
- proc.kill();
1453
- throw new Error(`Failed to spawn ACP agent "${agent}": no stdio streams`);
1454
- }
1455
-
1456
- // Capture stderr for diagnostics
1457
- if (proc.stderr) {
1458
- const reader = (proc.stderr as ReadableStream<Uint8Array>).getReader();
1459
- const decoder = new TextDecoder();
1460
- (async () => {
1461
- try {
1462
- while (true) {
1463
- const { done, value } = await reader.read();
1464
- if (done) break;
1465
- const text = decoder.decode(value, { stream: true }).trim();
1466
- if (text) debug("acp", `[${agent}:stderr] ${text}`);
1467
- }
1468
- } catch { /* stream closed */ }
1469
- })();
1470
- }
1471
-
1472
- let earlyExit = false;
1473
- const earlyExitPromise = proc.exited.then((code) => {
1474
- earlyExit = true;
1475
- throw new Error(`ACP agent "${agent}" exited during resume init with code ${code}`);
1476
- });
1477
-
1478
- const stream = ndJsonStream(proc.stdin, proc.stdout);
1479
- let streamCallback: ((delta: string, event?: string) => void) | null = null;
1480
-
1481
- const conn = new ClientSideConnection(
1482
- (_agentRef) => ({
1483
- requestPermission: async (params: RequestPermissionRequest): Promise<RequestPermissionResponse> => {
1484
- return this.resolvePermission(params);
1485
- },
1486
- sessionUpdate: async (params: SessionNotification): Promise<void> => {
1487
- const update = params.update as Record<string, unknown>;
1488
- const updateType = update.sessionUpdate as string;
1489
- if (updateType === "agent_message_chunk" || updateType === "agent_thought_chunk") {
1490
- const content = (update as { content?: { type?: string; text?: string } }).content;
1491
- if (content?.type === "text" && content.text) {
1492
- streamCallback?.(content.text, updateType);
1493
- }
1494
- } else if (updateType === "tool_call") {
1495
- const toolName = (update as { title?: string }).title
1496
- || (update as { toolCallId?: string }).toolCallId
1497
- || "unknown";
1498
- debug("acp", `tool_call update: title=${JSON.stringify((update as any).title)} toolCallId=${JSON.stringify((update as any).toolCallId)} keys=${Object.keys(update).join(",")}`);
1499
- streamCallback?.(`\n[tool_call: ${toolName}]\n`, updateType);
1500
- } else if (updateType === "tool_call_update") {
1501
- const status = (update as { status?: string }).status;
1502
- if (status === "completed" || status === "error") {
1503
- streamCallback?.("", "tool_result");
1504
- }
1505
- }
1506
- },
1507
- }),
1508
- stream,
1509
- );
1510
-
1511
- const initTimeout = new Promise<never>((_, reject) => {
1512
- setTimeout(() => reject(new Error(
1513
- `ACP agent "${agent}" resume initialization timed out after ${SESSION_INIT_TIMEOUT / 1000}s`,
1514
- )), SESSION_INIT_TIMEOUT);
1515
- });
1516
-
1517
- const initAndResume = async () => {
1518
- await conn.initialize({
1519
- protocolVersion: 1,
1520
- clientInfo: { name: "ClawMatrix", version: "0.1.0" },
1521
- clientCapabilities: {},
1522
- });
1523
-
1524
- // Resume the existing ACP session instead of creating a new one
1525
- return conn.unstable_resumeSession({
1526
- sessionId: acpSessionId,
1527
- cwd,
1528
- });
1529
- };
1530
-
1531
- let resumeResponse: Awaited<ReturnType<typeof conn.unstable_resumeSession>>;
1532
- try {
1533
- resumeResponse = await Promise.race([
1534
- initAndResume(),
1535
- initTimeout,
1536
- earlyExitPromise as Promise<never>,
1537
- ]);
1538
- } catch (err) {
1539
- if (!earlyExit) proc.kill();
1540
- throw err;
1541
- }
1542
-
1543
- const availableModes: AcpModeInfo[] | undefined = resumeResponse.modes?.availableModes?.map(
1544
- (m) => ({ id: m.id, name: m.name, description: m.description ?? undefined }),
1755
+ return this.spawnAndConnect(agent, cwd, from, "createSessionWithResume", (conn) =>
1756
+ conn.unstable_resumeSession({ sessionId: acpSessionId, cwd }),
1545
1757
  );
1546
- const currentModeId = resumeResponse.modes?.currentModeId;
1547
-
1548
- const sessionId = crypto.randomUUID();
1549
-
1550
- const session: AcpSession = {
1551
- kind: "acp",
1552
- sessionId,
1553
- agent,
1554
- acpSessionId,
1555
- conn,
1556
- proc,
1557
- lastActiveAt: Date.now(),
1558
- from,
1559
- setStreamCallback: (cb) => { streamCallback = cb; },
1560
- availableModes,
1561
- currentModeId,
1562
- };
1563
-
1564
- this.monitorProcess(session);
1565
- return session;
1566
1758
  }
1567
1759
 
1568
1760
  /** Read all session stores from disk (OpenClaw + Claude Code). */
@@ -1571,7 +1763,16 @@ export class AcpProxy {
1571
1763
  }
1572
1764
 
1573
1765
  private destroySession(session: AnySession) {
1766
+ invalidateSessionListCache();
1574
1767
  if (session.kind === "acp") {
1768
+ // Clean up daemon stream callback
1769
+ const daemon = this.findDaemonByConn(session);
1770
+ if (daemon) {
1771
+ daemon.streamCallbacks.delete(session.acpSessionId);
1772
+ // Don't kill — process is shared by the daemon
1773
+ return;
1774
+ }
1775
+ // Not a daemon-backed session — kill the process
1575
1776
  try {
1576
1777
  session.proc.kill();
1577
1778
  } catch {
@@ -1844,6 +2045,36 @@ export class AcpProxy {
1844
2045
  }
1845
2046
  this.sessions.clear();
1846
2047
  this.sessionWatchers.clear();
2048
+
2049
+ this.disposed = true;
2050
+
2051
+ // Cancel pending prewarm timers
2052
+ for (const timer of this.prewarmTimers) clearTimeout(timer);
2053
+ this.prewarmTimers.clear();
2054
+
2055
+ // Clean up reuse pool
2056
+ for (const [, entry] of this.reusePool) {
2057
+ clearTimeout(entry.timer);
2058
+ this.destroySession(entry.session);
2059
+ }
2060
+ this.reusePool.clear();
2061
+
2062
+ // Clean up warm pool
2063
+ for (const [, pool] of this.warmPool) {
2064
+ for (const session of pool) {
2065
+ this.destroySession(session);
2066
+ }
2067
+ }
2068
+ this.warmPool.clear();
2069
+
2070
+ // Clean up daemons
2071
+ for (const [, daemon] of this.daemons) {
2072
+ clearTimeout(daemon.idleTimer);
2073
+ daemon.dead = true;
2074
+ try { daemon.proc.kill(); } catch { /* best effort */ }
2075
+ }
2076
+ this.daemons.clear();
2077
+ this.daemonStarting.clear();
1847
2078
  }
1848
2079
  }
1849
2080
 
@@ -1855,9 +2086,7 @@ const TRANSCRIPT_MAX_LINES = 20;
1855
2086
  /** Read the first user message from a session transcript JSONL file. */
1856
2087
  async function readFirstUserMessageFromTranscript(transcriptPath: string): Promise<string | null> {
1857
2088
  try {
1858
- const content = await readFileText(transcriptPath);
1859
- // Only scan the head to avoid reading large transcripts
1860
- const head = content.slice(0, TRANSCRIPT_HEAD_BYTES);
2089
+ const head = await readFileHead(transcriptPath, TRANSCRIPT_HEAD_BYTES);
1861
2090
  const lines = head.split("\n").slice(0, TRANSCRIPT_MAX_LINES);
1862
2091
  for (const line of lines) {
1863
2092
  if (!line.trim()) continue;
@@ -1883,8 +2112,56 @@ async function readFirstUserMessageFromTranscript(transcriptPath: string): Promi
1883
2112
  return null;
1884
2113
  }
1885
2114
 
2115
// ── Session list cache (stale-while-revalidate) ─────────────────────
const SESSION_LIST_CACHE_TTL = 30_000; // 30 seconds fresh
const SESSION_LIST_STALE_TTL = 120_000; // serve stale up to 2 min while refreshing
let sessionListCache: { result: AcpSessionInfo[]; expiresAt: number; staleAt: number } | null = null;
let sessionListRefreshing = false;
// Bumped on every invalidation. A disk read remembers the value it started
// with and only stores its result if no invalidation happened in between.
let sessionListGeneration = 0;

/**
 * Invalidate the session list cache so the next call re-reads from disk.
 * Also prevents any disk read that is already in flight from repopulating
 * the cache with data gathered before the invalidation.
 */
export function invalidateSessionListCache() {
  sessionListGeneration++;
  sessionListCache = null;
}

/**
 * Store a freshly read session list, unless the cache was invalidated after
 * the read identified by `generation` began.
 */
function storeSessionList(result: AcpSessionInfo[], generation: number) {
  if (generation !== sessionListGeneration) return;
  const now = Date.now();
  sessionListCache = {
    result,
    expiresAt: now + SESSION_LIST_CACHE_TTL,
    staleAt: now + SESSION_LIST_STALE_TTL,
  };
}

/**
 * Refresh session list in background (stale-while-revalidate).
 * At most one refresh runs at a time; failures are swallowed so callers keep
 * being served the stale entry.
 */
async function refreshSessionListInBackground() {
  if (sessionListRefreshing) return;
  sessionListRefreshing = true;
  const generation = sessionListGeneration;
  try {
    storeSessionList(await fetchSessionListFromDisk(), generation);
  } catch {
    // Background refresh failed — keep stale data
  } finally {
    sessionListRefreshing = false;
  }
}

/**
 * Read all ACP session stores from disk (OpenClaw + Claude Code). Can be used without an AcpProxy instance.
 *
 * Results are cached: a fresh entry is returned as-is, a stale entry within
 * the grace period is returned while a background refresh is started, and
 * otherwise the list is read from disk before returning.
 *
 * @returns The session list (possibly the cached array instance).
 */
export async function readAllSessionStoresFromDisk(): Promise<AcpSessionInfo[]> {
  const now = Date.now();
  if (sessionListCache) {
    if (now < sessionListCache.expiresAt) {
      // Fresh cache — return immediately
      return sessionListCache.result;
    }
    if (now < sessionListCache.staleAt) {
      // Stale but within grace period — return stale data, refresh in background.
      // The refresh handles its own errors, so it is safe to not await it.
      void refreshSessionListInBackground();
      return sessionListCache.result;
    }
  }

  // Cache miss or expired — fetch from disk and cache (unless invalidated meanwhile)
  const generation = sessionListGeneration;
  const results = await fetchSessionListFromDisk();
  storeSessionList(results, generation);
  return results;
}
2162
+
2163
+ /** Fetch session list from disk (no caching). */
2164
+ async function fetchSessionListFromDisk(): Promise<AcpSessionInfo[]> {
1888
2165
  const results: AcpSessionInfo[] = [];
1889
2166
 
1890
2167
  // 1. OpenClaw agent session stores (ACP-backed + native OpenClaw sessions)
@@ -1898,20 +2175,22 @@ export async function readAllSessionStoresFromDisk(): Promise<AcpSessionInfo[]>
1898
2175
  try {
1899
2176
  const content = await readFileText(storePath);
1900
2177
  const entries: Record<string, { sessionId?: string; updatedAt?: number; displayName?: string; subject?: string; label?: string; acp?: { agent?: string } }> = JSON.parse(content);
1901
- for (const [key, entry] of Object.entries(entries)) {
1902
- if (!entry.sessionId) continue;
1903
- // Use the OpenClaw agent directory name, not the ACP backend agent.
1904
- // A cron/handoff session using Claude Code as backend is still an OpenClaw session.
1905
- // Pure Claude Code sessions are read separately from ~/.claude/history.jsonl.
1906
- const agent = agentId;
1907
- // Try to read the first user message from the transcript for description
1908
- const transcriptPath = join(sessionsDir, `${entry.sessionId}.jsonl`);
1909
- const firstMessage = await readFirstUserMessageFromTranscript(transcriptPath);
2178
+ const agent = agentId;
2179
+ // Read all transcripts in parallel instead of sequentially
2180
+ const entryList = Object.entries(entries).filter(([, e]) => e.sessionId);
2181
+ const transcriptResults = await Promise.all(
2182
+ entryList.map(([, entry]) => {
2183
+ const transcriptPath = join(sessionsDir, `${entry.sessionId!}.jsonl`);
2184
+ return readFirstUserMessageFromTranscript(transcriptPath);
2185
+ }),
2186
+ );
2187
+ for (let i = 0; i < entryList.length; i++) {
2188
+ const [key, entry] = entryList[i]!;
1910
2189
  results.push({
1911
- sessionId: entry.sessionId,
2190
+ sessionId: entry.sessionId!,
1912
2191
  cwd: key,
1913
2192
  title: entry.displayName ?? entry.subject ?? entry.label ?? undefined,
1914
- description: firstMessage ?? undefined,
2193
+ description: transcriptResults[i] ?? undefined,
1915
2194
  updatedAt: entry.updatedAt ? new Date(entry.updatedAt).toISOString() : undefined,
1916
2195
  agent,
1917
2196
  });
@@ -1925,9 +2204,11 @@ export async function readAllSessionStoresFromDisk(): Promise<AcpSessionInfo[]>
1925
2204
  }
1926
2205
 
1927
2206
  // 2. Claude Code session history (~/.claude/history.jsonl)
2207
+ // Only read the last 512KB — newer entries are appended at the end,
2208
+ // so the tail contains the most recent sessions.
1928
2209
  try {
1929
2210
  const historyPath = join(homedir(), ".claude", "history.jsonl");
1930
- const content = await readFileText(historyPath);
2211
+ const content = await readFileTail(historyPath, 512 * 1024);
1931
2212
  const sessions = new Map<string, { title: string; updatedAt: number; cwd: string }>();
1932
2213
 
1933
2214
  for (const line of content.split("\n")) {
package/src/compat.ts CHANGED
@@ -5,7 +5,7 @@
5
5
  */
6
6
 
7
7
  import { spawn as cpSpawn } from "node:child_process";
8
- import { readFile, writeFile } from "node:fs/promises";
8
+ import { open, readFile, stat, writeFile } from "node:fs/promises";
9
9
  import { createRequire } from "node:module";
10
10
 
11
11
  export interface SpawnResult {
@@ -167,6 +167,41 @@ export async function readFileText(path: string): Promise<string> {
167
167
  return readFile(path, "utf-8");
168
168
  }
169
169
 
170
/**
 * Return the largest length <= `length` at which `buf` does not end in the
 * middle of a UTF-8 multi-byte sequence.
 */
function utf8SafeEnd(buf: Buffer, length: number): number {
  // A UTF-8 sequence is at most 4 bytes long, so its lead byte is found after
  // skipping at most 3 trailing continuation bytes (10xxxxxx).
  const floor = Math.max(0, length - 4);
  let lead = length - 1;
  while (lead > floor && (buf.readUInt8(lead) & 0xc0) === 0x80) lead--;
  if (lead < 0) return length;
  const first = buf.readUInt8(lead);
  const expected = first >= 0xf0 ? 4 : first >= 0xe0 ? 3 : first >= 0xc0 ? 2 : 1;
  return length - lead < expected ? lead : length;
}

/**
 * Read at most `bytes` bytes from the beginning of a file as UTF-8 text.
 *
 * When the read fills the whole window, a multi-byte character cut off by the
 * end of the window is dropped instead of being decoded to U+FFFD, so the
 * result does not end in a replacement character that is not in the file.
 *
 * @param path - File to read.
 * @param bytes - Maximum number of bytes to read from offset 0.
 * @returns The decoded prefix; shorter than `bytes` when the file is smaller.
 * @throws Whatever `open()` / `read()` reject with (e.g. a missing file).
 */
export async function readFileHead(path: string, bytes: number): Promise<string> {
  const fh = await open(path, "r");
  try {
    const buf = Buffer.alloc(bytes);
    const { bytesRead } = await fh.read(buf, 0, bytes, 0);
    // Only a completely filled buffer can have been cut off mid-character.
    const end = bytesRead === bytes ? utf8SafeEnd(buf, bytesRead) : bytesRead;
    return buf.toString("utf-8", 0, end);
  } finally {
    await fh.close();
  }
}
181
+
182
/**
 * Read at most `bytes` bytes from the end of a file as UTF-8 text.
 *
 * When the window does not start at the beginning of the file, the result is
 * aligned to a line boundary: a leading partial line is dropped, but a line
 * that starts exactly at the window's first byte is kept. If the window
 * contains no newline at all, the window text is returned unchanged.
 *
 * @param path - File to read.
 * @param bytes - Maximum number of bytes to read; values below 0 are treated as 0.
 * @returns The decoded tail, or "" for an empty file or an empty window.
 * @throws Whatever `open()` / `stat()` / `read()` reject with (e.g. a missing file).
 */
export async function readFileTail(path: string, bytes: number): Promise<string> {
  const fh = await open(path, "r");
  try {
    // Stat the open handle rather than the path so the size and the bytes read
    // belong to the same file even if the path is replaced concurrently.
    const { size } = await fh.stat();
    if (size === 0) return "";
    const readBytes = Math.min(Math.max(bytes, 0), size);
    const offset = size - readBytes;

    if (offset === 0) {
      // The window covers the whole file — nothing to align.
      const whole = Buffer.alloc(readBytes);
      const { bytesRead } = await fh.read(whole, 0, readBytes, 0);
      return whole.toString("utf-8", 0, bytesRead);
    }

    // Read one extra byte in front of the window: if it is a newline, the
    // window already starts on a line boundary and no line must be dropped.
    const buf = Buffer.alloc(readBytes + 1);
    const { bytesRead } = await fh.read(buf, 0, readBytes + 1, offset - 1);
    if (bytesRead === 0) return "";
    const text = buf.toString("utf-8", 1, bytesRead);
    if (buf.readUInt8(0) === 0x0a) return text;

    // Drop the first (partial) line
    const nl = text.indexOf("\n");
    return nl >= 0 ? text.slice(nl + 1) : text;
  } finally {
    await fh.close();
  }
}
204
+
170
205
  /** Write text to a file (replaces Bun.write()). */
171
206
  export async function writeFileText(path: string, content: string): Promise<void> {
172
207
  await writeFile(path, content, "utf-8");
package/src/handoff.ts CHANGED
@@ -91,6 +91,7 @@ export class HandoffManager {
91
91
  for (const [id, entry] of this.active) {
92
92
  if (entry.status === "input_required" && entry.inputRequiredAt && now - entry.inputRequiredAt > INPUT_REQUIRED_TTL) {
93
93
  this.active.delete(id);
94
+ this.sessionWatchers.delete(entry.sessionId);
94
95
  // Notify the requester that the handoff timed out
95
96
  this.peerManager.sendTo(entry.from, {
96
97
  type: "handoff_res",
@@ -110,6 +111,7 @@ export class HandoffManager {
110
111
  // (e.g. cancel arrived during input_required when no process was running)
111
112
  if (entry.status === "canceled") {
112
113
  this.active.delete(id);
114
+ this.sessionWatchers.delete(entry.sessionId);
113
115
  }
114
116
  }
115
117
  // Clean stale inputRequiredTargets (requester side)
@@ -355,6 +357,7 @@ export class HandoffManager {
355
357
 
356
358
  if (activeEntry.status === "canceled") {
357
359
  this.active.delete(id);
360
+ this.sessionWatchers.delete(activeEntry.sessionId);
358
361
  return;
359
362
  }
360
363
 
@@ -401,11 +404,13 @@ export class HandoffManager {
401
404
 
402
405
  if (activeEntry.status === "canceled") {
403
406
  this.active.delete(id);
407
+ this.sessionWatchers.delete(activeEntry.sessionId);
404
408
  return;
405
409
  }
406
410
 
407
411
  activeEntry.status = "completed";
408
412
  this.active.delete(id);
413
+ this.sessionWatchers.delete(activeEntry.sessionId);
409
414
 
410
415
  // Broadcast task completed
411
416
  this.taskActivity.broadcast(id, "handoff", "completed", agent, activeEntry.startedAt);
@@ -429,10 +434,12 @@ export class HandoffManager {
429
434
  } catch (err) {
430
435
  if (activeEntry.status === "canceled") {
431
436
  this.active.delete(id);
437
+ this.sessionWatchers.delete(activeEntry.sessionId);
432
438
  return;
433
439
  }
434
440
  activeEntry.status = "failed";
435
441
  this.active.delete(id);
442
+ this.sessionWatchers.delete(activeEntry.sessionId);
436
443
 
437
444
  // Broadcast task failed
438
445
  this.taskActivity.broadcast(
@@ -468,7 +475,7 @@ export class HandoffManager {
468
475
 
469
476
  const reader = body.getReader();
470
477
  const decoder = new TextDecoder();
471
- let full = "";
478
+ const chunks: string[] = [];
472
479
  let buffer = "";
473
480
 
474
481
  try {
@@ -489,7 +496,7 @@ export class HandoffManager {
489
496
  const parsed = JSON.parse(data);
490
497
  const delta = parsed.choices?.[0]?.delta?.content;
491
498
  if (delta) {
492
- full += delta;
499
+ chunks.push(delta);
493
500
  const streamFrame: HandoffStreamChunk = {
494
501
  type: "handoff_stream",
495
502
  id: handoffId,
@@ -531,7 +538,7 @@ export class HandoffManager {
531
538
  this.peerManager.sendTo(to, doneFrame);
532
539
  if (sessionId) this.sendToOtherWatchers(sessionId, to, doneFrame);
533
540
 
534
- return full;
541
+ return chunks.join("");
535
542
  }
536
543
 
537
544
  /** Handle incoming input_required from remote (requester side). */
@@ -28,6 +28,26 @@ import type { KeyPair } from "./crypto.ts";
28
28
  const RECONNECT_BASE = 1_000;
29
29
  const RECONNECT_MAX = 60_000;
30
30
 
31
+ /** Frame types that bypass dedup (streams share one id across chunks; responses share id with request). */
32
+ const SKIP_DEDUP_TYPES = new Set([
33
+ // Streaming
34
+ "model_stream", "handoff_stream", "acp_stream",
35
+ // Response frames (share id with their request)
36
+ "model_res", "tool_res", "tool_batch_res",
37
+ "handoff_res", "handoff_status_res", "handoff_input_required",
38
+ // Handoff control (reuse original handoff_req id)
39
+ "handoff_input", "handoff_cancel", "handoff_status",
40
+ // Diagnostics & approval
41
+ "diagnostic_exec_res", "diagnostic_status_res", "peer_approval_res",
42
+ // ACP responses
43
+ "acp_res", "acp_close_res", "acp_list_res", "acp_resume_res",
44
+ "acp_cancel_res", "acp_set_mode_res", "acp_get_modes_res",
45
+ "chat_history_res",
46
+ // Terminal
47
+ "terminal_open_res", "terminal_data", "terminal_resize",
48
+ "terminal_close", "terminal_close_res",
49
+ ]);
50
+
31
51
  /** Classify WebSocket close code into a human-readable reason. */
32
52
  function classifyCloseReason(code: number, reason: string): string {
33
53
  if (reason) return reason;
@@ -364,6 +384,12 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
364
384
  tryReconnect();
365
385
  });
366
386
  ws.addEventListener("close", (ev) => {
387
+ // Don't reconnect if this was a self-connection (peer URL points to ourselves).
388
+ // Without this guard, outbound detects self → closes → scheduleReconnect → loop.
389
+ if (ev.code === 4002 && ev.reason === "self-connection") {
390
+ debug("peer", `connectToPeer(${peer.nodeId}): self-connection, will not reconnect`);
391
+ return;
392
+ }
367
393
  if (!lastError) {
368
394
  lastError = classifyCloseReason(ev.code, ev.reason);
369
395
  }
@@ -397,19 +423,17 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
397
423
  // Peer's persistent public key for TOFU identity binding
398
424
  const peerPublicKey = conn.remoteIdentityKey ?? undefined;
399
425
 
400
- // Prevent self-connection: if outbound connection authenticated as our own nodeId
401
- // (e.g. peer URL accidentally points to self), close immediately.
402
- if (conn.role === "outbound" && nodeId === this.config.nodeId) {
403
- debug("peer", `Self-connection detected (outbound to ${nodeId}), closing`);
426
+ // Prevent self-connection: close immediately if the remote side authenticated
427
+ // with our own nodeId. For outbound this means the peer URL accidentally
428
+ // points to self; for inbound it means a remote node is (mis)using our nodeId.
429
+ // Exception: loopback connections with the same nodeId are local clients
430
+ // (Mac desktop app / iOS simulator) and are allowed through.
431
+ const isLocalClient = conn.role === "inbound" && nodeId === this.config.nodeId && isLoopback(ip);
432
+ if (nodeId === this.config.nodeId && !isLocalClient) {
433
+ debug("peer", `Self-connection detected (${conn.role}, nodeId=${nodeId}, ip=${ip}), closing`);
404
434
  conn.close(4002, "self-connection");
405
435
  return;
406
436
  }
407
-
408
- // Peer approval check (inbound only — outbound peers are explicitly configured)
409
- // Skip approval for same-nodeId connections from localhost (local clients
410
- // like Mac desktop app / iOS simulator). An attacker would need to already
411
- // be on the same machine to exploit this, which is outside our threat model.
412
- const isLocalClient = nodeId === this.config.nodeId && isLoopback(ip);
413
437
  debug("approval", `onPeerAuthenticated: nodeId=${nodeId} role=${conn.role} isLocalClient=${isLocalClient} ip=${ip}`);
414
438
  if (conn.role === "inbound" && !isLocalClient) {
415
439
  // IP-level approval rate limiting (suppress noise from leaked tokens)
@@ -589,32 +613,10 @@ export class PeerManager extends EventEmitter<PeerManagerEvents> {
589
613
  return;
590
614
  }
591
615
 
592
- // Skip dedup for streaming and response frame types.
593
- // Stream frames share one id across many chunks.
594
- // Response frames (model_res, tool_res, handoff_res, etc.) share the same id
595
- // as their request — without this exemption, a relay node that forwarded
596
- // the request would mark the id as seen and then drop the returning response.
597
616
  // Skip dedup for streaming chunks (same id across many chunks) and response
598
617
  // frames (share id with their request — relay would otherwise drop the reply).
599
- // handoff_input, handoff_cancel, and handoff_status reuse the original handoff_req id,
600
- // so relay nodes that already forwarded the request would drop them as duplicates.
601
- const skipDedup = frame.type === "model_stream" || frame.type === "handoff_stream"
602
- || frame.type === "model_res" || frame.type === "tool_res"
603
- || frame.type === "handoff_res" || frame.type === "handoff_status_res"
604
- || frame.type === "handoff_input_required"
605
- || frame.type === "handoff_input" || frame.type === "handoff_cancel"
606
- || frame.type === "handoff_status"
607
- || frame.type === "diagnostic_exec_res" || frame.type === "diagnostic_status_res"
608
- || frame.type === "peer_approval_res"
609
- || frame.type === "acp_stream" || frame.type === "acp_res"
610
- || frame.type === "acp_close_res"
611
- || frame.type === "acp_list_res" || frame.type === "acp_resume_res"
612
- || frame.type === "acp_cancel_res"
613
- || frame.type === "acp_set_mode_res" || frame.type === "acp_get_modes_res"
614
- || frame.type === "terminal_open_res" || frame.type === "terminal_data"
615
- || frame.type === "terminal_resize" || frame.type === "terminal_close"
616
- || frame.type === "terminal_close_res";
617
- if (frame.id && !skipDedup && this.router.isDuplicate(frame.id)) return;
618
+ // handoff_input/cancel/status reuse the original handoff_req id.
619
+ if (frame.id && !SKIP_DEDUP_TYPES.has(frame.type) && this.router.isDuplicate(frame.id)) return;
618
620
 
619
621
  // Handle peer approval responses locally (don't emit to cluster-service)
620
622
  if (frame.type === "peer_approval_res") {
package/src/router.ts CHANGED
@@ -3,6 +3,7 @@ import type { Connection } from "./connection.ts";
3
3
 
4
4
  const DEFAULT_TTL = 3;
5
5
  const MAX_SEEN_FRAMES = 10_000;
6
+ const MAX_FAILED_REQUESTS = 5_000;
6
7
  const ROTATE_INTERVAL = 60_000; // rotate dedup maps every 60s
7
8
 
8
9
  export interface RouteEntry {
@@ -305,6 +306,21 @@ export class Router {
305
306
  * TTL defaults to 15 minutes — long enough for handoff timeouts. */
306
307
  markFailed(requestId: string, ttlMs = 900_000) {
307
308
  this.failedRequests.set(requestId, Date.now() + ttlMs);
309
+ // Evict entries when map grows too large: first expired, then FIFO
310
+ if (this.failedRequests.size > MAX_FAILED_REQUESTS) {
311
+ const now = Date.now();
312
+ // Pass 1: remove expired entries
313
+ for (const [id, expiresAt] of this.failedRequests) {
314
+ if (now > expiresAt) this.failedRequests.delete(id);
315
+ }
316
+ // Pass 2: if still over limit, remove oldest (insertion-order) entries
317
+ if (this.failedRequests.size > MAX_FAILED_REQUESTS) {
318
+ for (const [id] of this.failedRequests) {
319
+ if (this.failedRequests.size <= MAX_FAILED_REQUESTS) break;
320
+ this.failedRequests.delete(id);
321
+ }
322
+ }
323
+ }
308
324
  }
309
325
 
310
326
  isFailed(requestId: string): boolean {