quadwork 1.19.2 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117) hide show
  1. package/README.md +19 -35
  2. package/bin/quadwork.js +48 -1118
  3. package/out/404.html +1 -1
  4. package/out/__next.__PAGE__.txt +3 -3
  5. package/out/__next._full.txt +14 -14
  6. package/out/__next._head.txt +4 -4
  7. package/out/__next._index.txt +8 -8
  8. package/out/__next._tree.txt +2 -2
  9. package/out/_next/static/chunks/{030cjkhts487t.js → 079wdniva~de1.js} +1 -1
  10. package/out/_next/static/chunks/{0n~dq4kpx9xxx.js → 07lhk_q6pmm3r.js} +1 -1
  11. package/out/_next/static/chunks/0_79hkefw1mo2.js +1 -0
  12. package/out/_next/static/chunks/{153f.fj8jlvle.js → 0_lyyn..t63bc.js} +1 -1
  13. package/out/_next/static/chunks/0oxv9vrvc17to.js +2 -0
  14. package/out/_next/static/chunks/0py7102i226n5.js +1 -0
  15. package/out/_next/static/chunks/{13fv-yi7.v52g.js → 0q4bm04c1jl_3.js} +1 -1
  16. package/out/_next/static/chunks/{0_idxioyl0p7h.js → 0sjhy6oe3mbon.js} +1 -1
  17. package/out/_next/static/chunks/13xk0vgfbrcld.css +2 -0
  18. package/out/_next/static/chunks/14k3bfe537f9_.js +25 -0
  19. package/out/_next/static/chunks/{turbopack-0qm-e3ifrz~2u.js → turbopack-0y2u-q0l2m67w.js} +1 -1
  20. package/out/_not-found/__next._full.txt +13 -13
  21. package/out/_not-found/__next._head.txt +4 -4
  22. package/out/_not-found/__next._index.txt +8 -8
  23. package/out/_not-found/__next._not-found.__PAGE__.txt +2 -2
  24. package/out/_not-found/__next._not-found.txt +3 -3
  25. package/out/_not-found/__next._tree.txt +2 -2
  26. package/out/_not-found.html +1 -1
  27. package/out/_not-found.txt +13 -13
  28. package/out/app-shell/__next._full.txt +13 -13
  29. package/out/app-shell/__next._head.txt +4 -4
  30. package/out/app-shell/__next._index.txt +8 -8
  31. package/out/app-shell/__next._tree.txt +2 -2
  32. package/out/app-shell/__next.app-shell.__PAGE__.txt +2 -2
  33. package/out/app-shell/__next.app-shell.txt +3 -3
  34. package/out/app-shell.html +1 -1
  35. package/out/app-shell.txt +13 -13
  36. package/out/index.html +1 -1
  37. package/out/index.txt +14 -14
  38. package/out/project/_/__next._full.txt +14 -14
  39. package/out/project/_/__next._head.txt +4 -4
  40. package/out/project/_/__next._index.txt +8 -8
  41. package/out/project/_/__next._tree.txt +2 -2
  42. package/out/project/_/__next.project.$d$id.__PAGE__.txt +3 -3
  43. package/out/project/_/__next.project.$d$id.txt +3 -3
  44. package/out/project/_/__next.project.txt +3 -3
  45. package/out/project/_/queue/__next._full.txt +14 -14
  46. package/out/project/_/queue/__next._head.txt +4 -4
  47. package/out/project/_/queue/__next._index.txt +8 -8
  48. package/out/project/_/queue/__next._tree.txt +2 -2
  49. package/out/project/_/queue/__next.project.$d$id.queue.__PAGE__.txt +3 -3
  50. package/out/project/_/queue/__next.project.$d$id.queue.txt +3 -3
  51. package/out/project/_/queue/__next.project.$d$id.txt +3 -3
  52. package/out/project/_/queue/__next.project.txt +3 -3
  53. package/out/project/_/queue.html +1 -1
  54. package/out/project/_/queue.txt +14 -14
  55. package/out/project/_.html +1 -1
  56. package/out/project/_.txt +14 -14
  57. package/out/settings/__next._full.txt +14 -14
  58. package/out/settings/__next._head.txt +4 -4
  59. package/out/settings/__next._index.txt +8 -8
  60. package/out/settings/__next._tree.txt +2 -2
  61. package/out/settings/__next.settings.__PAGE__.txt +3 -3
  62. package/out/settings/__next.settings.txt +3 -3
  63. package/out/settings.html +1 -1
  64. package/out/settings.txt +14 -14
  65. package/out/setup/__next._full.txt +14 -14
  66. package/out/setup/__next._head.txt +4 -4
  67. package/out/setup/__next._index.txt +8 -8
  68. package/out/setup/__next._tree.txt +2 -2
  69. package/out/setup/__next.setup.__PAGE__.txt +3 -3
  70. package/out/setup/__next.setup.txt +3 -3
  71. package/out/setup.html +1 -1
  72. package/out/setup.txt +14 -14
  73. package/package.json +4 -2
  74. package/server/ac-restore.js +128 -0
  75. package/server/bridges/discord.js +183 -0
  76. package/server/bridges/telegram.js +210 -0
  77. package/server/config.js +4 -60
  78. package/server/file-chat.js +318 -0
  79. package/server/index.js +173 -1286
  80. package/server/install-agentchattr.js +3 -284
  81. package/server/mcp-chat-shim.js +171 -0
  82. package/server/migrate-ac.js +158 -0
  83. package/server/pty-dispatcher.js +188 -0
  84. package/server/routes.js +149 -1397
  85. package/templates/CLAUDE.md +2 -2
  86. package/templates/OVERNIGHT-QUEUE.md +1 -1
  87. package/templates/seeds/butler.CLAUDE.md +30 -62
  88. package/templates/seeds/dev.AGENTS.md +10 -1
  89. package/templates/seeds/head.AGENTS.md +3 -3
  90. package/templates/seeds/re1.AGENTS.md +3 -3
  91. package/templates/seeds/re2.AGENTS.md +3 -3
  92. package/bridges/discord/__pycache__/discord_bridge.cpython-314.pyc +0 -0
  93. package/bridges/discord/discord_bridge.py +0 -666
  94. package/bridges/discord/requirements.txt +0 -2
  95. package/out/_next/static/chunks/0_bb~2.5h2ntm.css +0 -2
  96. package/out/_next/static/chunks/0makcdqkwobp6.js +0 -25
  97. package/out/_next/static/chunks/0uz5svjlo9dwl.js +0 -1
  98. package/out/_next/static/chunks/0zahstmgdrpy5.js +0 -1
  99. package/out/_next/static/chunks/0zfotsowwll1x.js +0 -2
  100. package/server/__tests__/bridge-auto-stop-guard.test.js +0 -134
  101. package/server/__tests__/rate-limit-handling.test.js +0 -168
  102. package/server/__tests__/scrub-secrets.test.js +0 -235
  103. package/server/__tests__/v1110-security-qa.test.js +0 -312
  104. package/server/agentchattr-registry.js +0 -188
  105. package/server/install-agentchattr.patchCrashTimeout.test.js +0 -71
  106. package/server/queue-watcher.js +0 -171
  107. package/server/queue-watcher.test.js +0 -64
  108. package/server/routes.batchProgress.test.js +0 -94
  109. package/server/routes.chatWsSend.test.js +0 -161
  110. package/server/routes.discordBridge.test.js +0 -80
  111. package/server/routes.parseActiveBatch.test.js +0 -88
  112. package/server/routes.telegramBridge.test.js +0 -241
  113. package/templates/config.toml +0 -72
  114. package/templates/wrapper.py +0 -70
  115. /package/out/_next/static/{K7A3YZrh4sLaRRP1-Lq7v → 479UD5Kit4YvCmtgO25VT}/_buildManifest.js +0 -0
  116. /package/out/_next/static/{K7A3YZrh4sLaRRP1-Lq7v → 479UD5Kit4YvCmtgO25VT}/_clientMiddlewareManifest.js +0 -0
  117. /package/out/_next/static/{K7A3YZrh4sLaRRP1-Lq7v → 479UD5Kit4YvCmtgO25VT}/_ssgManifest.js +0 -0
package/server/index.js CHANGED
@@ -6,21 +6,23 @@ const os = require("os");
6
6
  const { WebSocketServer, WebSocket } = require("ws");
7
7
  const pty = require("node-pty");
8
8
  const { spawn } = require("child_process");
9
- const { readConfig, resolveAgentCwd, resolveAgentCommand, resolveProjectChattr, resolveChattrSpawn, syncChattrToken, CONFIG_PATH, ensureSecureDir, writeSecureFile, writeConfig } = require("./config");
9
+ const { readConfig, resolveAgentCwd, resolveAgentCommand, CONFIG_PATH, ensureSecureDir, writeSecureFile, writeConfig } = require("./config");
10
10
  const routes = require("./routes");
11
- const {
12
- patchAgentchattrConfigForDiscordBridge,
13
- patchAgentchattrConfigForTelegramBridge,
14
- projectAgentchattrConfigPath,
15
- } = routes;
16
- const { waitForAgentChattrReady, registerAgent, registerAgentWithRetry, deregisterAgent, startHeartbeat, stopHeartbeat } = require("./agentchattr-registry");
17
- const { patchAgentchattrCss, patchCrashTimeout } = require("./install-agentchattr");
18
- const { startQueueWatcher, stopQueueWatcher } = require("./queue-watcher");
11
+ const fileChat = require("./file-chat");
12
+ const { dispatchToAgentPTY, cleanupSession: cleanupPtyDispatcher } = require("./pty-dispatcher");
13
+ const { runAcMigration } = require("./migrate-ac");
19
14
 
20
15
  const net = require("net");
21
16
  const config = readConfig();
22
17
  const PORT = config.port || 8400;
23
18
 
19
+ function emitSystemMessage(projectId, text) {
20
+ try {
21
+ if (routes.getProjectChatMode(projectId) !== "file") return;
22
+ fileChat.appendMessage(projectId, { sender: "system", type: "system", text });
23
+ } catch {}
24
+ }
25
+
24
26
  const app = express();
25
27
  // #412 / quadwork#279: bump the global JSON body limit to 10mb so
26
28
  // POST /api/project-history can accept full chat exports. The
@@ -33,6 +35,14 @@ app.use(express.json({ limit: "10mb" }));
33
35
  // --- Mount migrated API routes (from Next.js) ---
34
36
  app.use(routes);
35
37
 
38
+ // #730: wire PTY injection dispatcher into the chat route
39
+ routes.setPtyDispatchCallback((projectId, msg) => {
40
+ dispatchToAgentPTY(projectId, msg, agentSessions, {
41
+ isLoopGuardPaused: fileChat.isLoopGuardPaused,
42
+ safeWrite,
43
+ });
44
+ });
45
+
36
46
  const server = http.createServer(app);
37
47
 
38
48
  // --- REST endpoints ---
@@ -163,9 +173,6 @@ app.get("/api/caffeinate/status", (_req, res) => {
163
173
  // PTY (term) is the source of truth for "running". WS is optional (attaches to view terminal).
164
174
  const agentSessions = new Map();
165
175
 
166
- // AgentChattr server processes — per-project (key = projectId)
167
- const chattrProcesses = new Map();
168
-
169
176
  // #631: Butler session — single global PTY (not per-project, no AC integration)
170
177
  let butlerSession = { term: null, viewers: new Set(), viewerDims: new Map(), lastDims: null, state: "stopped", error: null, scrollback: Buffer.alloc(0) };
171
178
 
@@ -317,6 +324,27 @@ function writeMcpConfigFile(projectId, agentId, mcpHttpPort, token) {
317
324
  return filePath;
318
325
  }
319
326
 
327
+ function writeFileChatMcpConfig(projectId, agentId, serverPort) {
328
+ const os = require("os");
329
+ const crypto = require("crypto");
330
+ const configDir = path.join(os.homedir(), ".quadwork", projectId);
331
+ ensureSecureDir(configDir);
332
+ const filePath = path.join(configDir, `mcp-${agentId}.json`);
333
+ const shimPath = path.join(__dirname, "mcp-chat-shim.js");
334
+ const token = crypto.randomBytes(16).toString("hex");
335
+ fileChat.registerShimToken(projectId, agentId, token);
336
+ const config = {
337
+ mcpServers: {
338
+ chat: {
339
+ command: "node",
340
+ args: [shimPath, "--project", projectId, "--agent", agentId, "--port", String(serverPort), "--token", token],
341
+ },
342
+ },
343
+ };
344
+ writeSecureFile(filePath, JSON.stringify(config, null, 2));
345
+ return { filePath, token };
346
+ }
347
+
320
348
  /**
321
349
  * Build extra launch args for an agent (permission flags + MCP injection).
322
350
  * Async because Codex proxy_flag mode needs to await proxy startup.
@@ -324,16 +352,12 @@ function writeMcpConfigFile(projectId, agentId, mcpHttpPort, token) {
324
352
  async function buildAgentArgs(projectId, agentId) {
325
353
  const cfg = readConfig();
326
354
  const project = cfg.projects?.find((p) => p.id === projectId);
327
- if (!project) return { args: [], acRegistrationName: null, acServerPort: null, acRegistrationToken: null, acInjectMode: null, acMcpHttpPort: null };
355
+ if (!project) return { args: [] };
328
356
 
329
357
  const agentCfg = project.agents?.[agentId] || {};
330
358
  const command = agentCfg.command || "claude";
331
- const cliBase = command.split("/").pop().split(" ")[0]; // extract base CLI name
359
+ const cliBase = command.split("/").pop().split(" ")[0];
332
360
  const args = [];
333
- let acRegistrationName = null;
334
- let acServerPort = null;
335
- let acRegistrationToken = null;
336
- let acInjectMode = null;
337
361
 
338
362
  // Permission bypass flags
339
363
  if (agentCfg.auto_approve !== false) {
@@ -367,93 +391,22 @@ async function buildAgentArgs(projectId, agentId) {
367
391
  }
368
392
  }
369
393
 
370
- // MCP config injection
371
- const mcpHttpPort = project.mcp_http_port;
372
- const token = project.agentchattr_token;
373
- if (mcpHttpPort) {
374
- const injectMode = agentCfg.mcp_inject || (cliBase === "codex" ? "proxy_flag" : cliBase === "gemini" ? "env" : "flag");
375
- acInjectMode = injectMode;
376
- if (injectMode === "flag") {
377
- // Claude/Kimi: register with AgentChattr to obtain a per-agent
378
- // token (#239 — session_token is browser auth, not MCP auth) and
379
- // write that into the per-agent MCP config file.
380
- const chattrInfo = resolveProjectChattr(projectId);
381
- acServerPort = Number(new URL(chattrInfo.url).port) || 8300;
382
- // #565: extend timeout to 30s — first setup may need AC to install
383
- // (git clone + venv + pip install) before it can bind a port.
384
- const acReady = await waitForAgentChattrReady(acServerPort, 30000);
385
- if (!acReady) {
386
- console.warn(`[#565] Agent ${agentId}: AC not reachable on port ${acServerPort} after 30s. Spawning without chat integration.`);
387
- // #565: preserve acServerPort and acInjectMode so deferred
388
- // recovery in spawnAgentPty can retry registration later.
389
- return { args, acRegistrationName: null, acServerPort, acRegistrationToken: null, acInjectMode: injectMode, acMcpHttpPort: mcpHttpPort || null };
390
- }
391
- // #242: best-effort deregister any stale registration of the
392
- // canonical name (left over by a crashed previous QuadWork
393
- // session) so the fresh register lands at slot 1 instead of
394
- // head-2 / re2-2. We need the previous agent's bearer
395
- // token because app.py:2123 requires authenticated agent
396
- // session for family names — load it from disk (persisted
397
- // across restarts). Failures are non-fatal.
398
- const stalePersistedToken = readPersistedAgentToken(projectId, agentId);
399
- if (stalePersistedToken) {
400
- await deregisterAgent(acServerPort, agentId, stalePersistedToken).catch(() => {});
401
- clearPersistedAgentToken(projectId, agentId);
402
- }
403
- // #478: force-replace so AC expires any ghost slots for this base
404
- // #565: retry with backoff and degrade gracefully if AC is not ready
405
- const registration = await registerAgentWithRetry(acServerPort, agentId, agentCfg.display_name || null, { force: true });
406
- if (!registration) {
407
- console.warn(`[#565] Agent ${agentId}: AC registration failed after retries (${registerAgent.lastError}). Spawning without chat integration.`);
408
- } else {
409
- acRegistrationName = registration.name;
410
- acRegistrationToken = registration.token;
411
- writePersistedAgentToken(projectId, agentId, registration.token);
412
- const mcpConfigPath = writeMcpConfigFile(projectId, agentId, mcpHttpPort, registration.token);
413
- const flag = agentCfg.mcp_flag || "--mcp-config";
414
- args.push(flag, mcpConfigPath);
415
- }
416
- } else if (injectMode === "proxy_flag") {
417
- // Codex: register with AgentChattr first (#240) so the proxy
418
- // injects a real per-agent token, not the global session token.
419
- // Resolve via resolveProjectChattr so legacy/global-config
420
- // projects without a per-project agentchattr_url still work.
421
- const chattrInfo = resolveProjectChattr(projectId);
422
- acServerPort = Number(new URL(chattrInfo.url).port) || 8300;
423
- // #565: extend timeout to 30s for first-setup scenario
424
- const acReady = await waitForAgentChattrReady(acServerPort, 30000);
425
- if (!acReady) {
426
- console.warn(`[#565] Agent ${agentId}: AC not reachable on port ${acServerPort} after 30s. Spawning without chat integration.`);
427
- // #565: preserve acServerPort and acInjectMode so deferred
428
- // recovery in spawnAgentPty can retry registration later.
429
- return { args, acRegistrationName: null, acServerPort, acRegistrationToken: null, acInjectMode: injectMode, acMcpHttpPort: mcpHttpPort || null };
430
- }
431
- // #242: best-effort deregister stale canonical name first using
432
- // the persisted bearer token from a previous session.
433
- const stalePersistedToken = readPersistedAgentToken(projectId, agentId);
434
- if (stalePersistedToken) {
435
- await deregisterAgent(acServerPort, agentId, stalePersistedToken).catch(() => {});
436
- clearPersistedAgentToken(projectId, agentId);
437
- }
438
- // #478: force-replace so AC expires any ghost slots for this base
439
- // #565: retry with backoff and degrade gracefully if AC is not ready
440
- const registration = await registerAgentWithRetry(acServerPort, agentId, agentCfg.display_name || null, { force: true });
441
- if (!registration) {
442
- console.warn(`[#565] Agent ${agentId}: AC registration failed after retries (${registerAgent.lastError}). Spawning without chat integration.`);
443
- } else {
444
- acRegistrationName = registration.name;
445
- acRegistrationToken = registration.token;
446
- writePersistedAgentToken(projectId, agentId, registration.token);
447
- const upstreamUrl = `http://127.0.0.1:${mcpHttpPort}`;
448
- const proxyUrl = await startMcpProxy(projectId, agentId, upstreamUrl, registration.token);
449
- if (proxyUrl) {
450
- args.push("-c", `mcp_servers.agentchattr.url="${proxyUrl}"`);
451
- }
452
- }
453
- }
394
+ // MCP config injection — file-chat shim
395
+ const injectMode = agentCfg.mcp_inject || (cliBase === "codex" ? "proxy_flag" : cliBase === "gemini" ? "env" : "flag");
396
+ if (injectMode === "flag") {
397
+ const { filePath: mcpConfigPath } = writeFileChatMcpConfig(projectId, agentId, PORT);
398
+ const mcpFlag = agentCfg.mcp_flag || "--mcp-config";
399
+ args.push(mcpFlag, mcpConfigPath);
400
+ } else if (injectMode === "proxy_flag") {
401
+ const { token: shimToken } = writeFileChatMcpConfig(projectId, agentId, PORT);
402
+ const shimPath = path.join(__dirname, "mcp-chat-shim.js");
403
+ args.push(
404
+ "-c", `mcp_servers.chat.command="node"`,
405
+ "-c", `mcp_servers.chat.args=["${shimPath}","--project","${projectId}","--agent","${agentId}","--port","${PORT}","--token","${shimToken}"]`,
406
+ );
454
407
  }
455
-
456
- return { args, acRegistrationName, acServerPort, acRegistrationToken, acInjectMode, acMcpHttpPort: mcpHttpPort || null };
408
+ // env mode (Gemini) handled in buildAgentEnv
409
+ return { args };
457
410
  }
458
411
 
459
412
  /**
@@ -470,18 +423,19 @@ function buildAgentEnv(projectId, agentId) {
470
423
  const env = {};
471
424
 
472
425
  // Gemini: inject MCP via env var
473
- if (cliBase === "gemini" && project.mcp_http_port) {
426
+ if (cliBase === "gemini") {
474
427
  const os = require("os");
475
428
  const configDir = path.join(os.homedir(), ".quadwork", projectId);
476
429
  ensureSecureDir(configDir);
477
430
  const settingsPath = path.join(configDir, `mcp-${agentId}-settings.json`);
478
- const url = `http://127.0.0.1:${project.mcp_http_port}/mcp`;
431
+
432
+ const { token: shimToken } = writeFileChatMcpConfig(projectId, agentId, PORT);
433
+ const shimPath = path.join(__dirname, "mcp-chat-shim.js");
479
434
  const settings = {
480
435
  mcpServers: {
481
- agentchattr: {
482
- type: "http",
483
- url,
484
- ...(project.agentchattr_token ? { headers: { Authorization: `Bearer ${project.agentchattr_token}` } } : {}),
436
+ chat: {
437
+ command: "node",
438
+ args: [shimPath, "--project", projectId, "--agent", agentId, "--port", String(PORT), "--token", shimToken],
485
439
  },
486
440
  },
487
441
  };
@@ -492,76 +446,8 @@ function buildAgentEnv(projectId, agentId) {
492
446
  return env;
493
447
  }
494
448
 
495
- /**
496
- * #394 / quadwork#253: recover from a heartbeat 409 (AgentChattr was
497
- * restarted, in-memory registry wiped, our token is now stale). Mirrors
498
- * wrapper.py:732-741. Re-registers the running agent, swaps the
499
- * tracked name/token on the live session so the heartbeat interval
500
- * picks up the new credentials on its next tick, refreshes whichever
501
- * MCP transport this agent uses (Claude config file vs Codex proxy),
502
- * and restarts the queue watcher in case the assigned name changed
503
- * (multi-instance slot bump).
504
- *
505
- * Best-effort: any failure here just means the next 5s heartbeat will
506
- * fail again and we'll re-enter recovery — no tight retry loop because
507
- * startHeartbeat guards re-entry with `recovering`.
508
- */
509
- async function recoverFrom409(projectId, agentId, session) {
510
- if (!session.acServerPort) return;
511
- const cfg = readConfig();
512
- const project = cfg.projects?.find((p) => p.id === projectId);
513
- const agentCfg = project?.agents?.[agentId] || {};
514
- // AC may need a moment to come back up after a restart — wait briefly.
515
- await waitForAgentChattrReady(session.acServerPort, 10000);
516
-
517
- // Best-effort cleanup of the stale registration on disk so the
518
- // fresh register isn't shoved into a slot 2 by leftover state.
519
- const stale = readPersistedAgentToken(projectId, agentId);
520
- if (stale) {
521
- await deregisterAgent(session.acServerPort, agentId, stale).catch(() => {});
522
- clearPersistedAgentToken(projectId, agentId);
523
- }
524
-
525
- // #478: force-replace so AC expires any ghost slots for this base
526
- const replacement = await registerAgent(session.acServerPort, agentId, agentCfg.display_name || null, { force: true });
527
- if (!replacement) return;
528
-
529
- const previousName = session.acRegistrationName;
530
- session.acRegistrationName = replacement.name;
531
- session.acRegistrationToken = replacement.token;
532
- writePersistedAgentToken(projectId, agentId, replacement.token);
533
-
534
- // Refresh whichever MCP transport this agent uses so subsequent
535
- // tool calls (and the queue-watcher's `mcp read` injections) hit
536
- // AC with the new bearer token instead of the now-rejected one.
537
- if (session.acInjectMode === "flag" && session.acMcpHttpPort) {
538
- try { writeMcpConfigFile(projectId, agentId, session.acMcpHttpPort, replacement.token); } catch {}
539
- } else if (session.acInjectMode === "proxy_flag") {
540
- // Codex is pinned to the original ephemeral proxy URL, so we
541
- // can't tear the listener down — mutate the token in place.
542
- try { updateMcpProxyToken(projectId, agentId, replacement.token); } catch {}
543
- }
544
-
545
- // If the assigned name changed (e.g. multi-instance slot collision)
546
- // the queue watcher is now polling the wrong file. Restart it
547
- // against the new name so chat reaches the right agent.
548
- if (replacement.name !== previousName && session.term) {
549
- if (session.queueWatcherHandle) {
550
- stopQueueWatcher(session.queueWatcherHandle);
551
- session.queueWatcherHandle = null;
552
- }
553
- try {
554
- const { dir: acDir } = resolveProjectChattr(projectId);
555
- if (acDir) {
556
- const dataDir = path.join(acDir, "data");
557
- session.queueWatcherHandle = startQueueWatcher(dataDir, replacement.name, session.term);
558
- }
559
- } catch {}
560
- }
561
- }
562
-
563
449
  // Helper: spawn a PTY for a project/agent and register in agentSessions
564
- async function spawnAgentPty(project, agent) {
450
+ async function spawnAgentPty(project, agent, opts = {}) {
565
451
  const key = `${project}/${agent}`;
566
452
 
567
453
  const cwd = resolveAgentCwd(project, agent);
@@ -593,13 +479,7 @@ async function spawnAgentPty(project, agent) {
593
479
  lastDims: null,
594
480
  state: "running",
595
481
  error: null,
596
- acRegistrationName: built.acRegistrationName,
597
- acServerPort: built.acServerPort,
598
- acRegistrationToken: built.acRegistrationToken,
599
- acInjectMode: built.acInjectMode,
600
- acMcpHttpPort: built.acMcpHttpPort,
601
- acHeartbeatHandle: null,
602
- queueWatcherHandle: null,
482
+ lastOutputAt: Date.now(),
603
483
  // #418: ring buffer of recent PTY output so reconnecting WS
604
484
  // clients see the terminal state instead of a blank panel.
605
485
  // #538: scrollback is scrubbed of likely secrets before replay.
@@ -607,11 +487,16 @@ async function spawnAgentPty(project, agent) {
607
487
  };
608
488
  agentSessions.set(key, session);
609
489
 
490
+ if (!opts.suppressLifecycleMsg) {
491
+ emitSystemMessage(project, `${agent} joined`);
492
+ }
493
+
610
494
  // #418: capture PTY output into the scrollback ring buffer (64KB).
611
495
  // This runs independently of WS — even when no client is connected,
612
496
  // the buffer accumulates so the next connect gets replay.
613
497
  const SCROLLBACK_SIZE = 64 * 1024;
614
498
  term.onData((data) => {
499
+ session.lastOutputAt = Date.now();
615
500
  const chunk = Buffer.from(data);
616
501
  session.scrollback = Buffer.concat([session.scrollback, chunk]);
617
502
  if (session.scrollback.length > SCROLLBACK_SIZE) {
@@ -619,72 +504,10 @@ async function spawnAgentPty(project, agent) {
619
504
  }
620
505
  });
621
506
 
622
- // #391 / quadwork#250: keep this agent alive in AgentChattr by
623
- // POSTing /api/heartbeat/{name} every 5s. Without it, AC's 60s
624
- // crash-detection window deregisters the agent and chat messages
625
- // never reach it. Mirrors wrapper.py:_heartbeat (lines 715-748).
626
- if (session.acRegistrationName && session.acServerPort && session.acRegistrationToken) {
627
- // #394 / quadwork#253: pass getters (not raw values) so the 409
628
- // recovery path below can swap acRegistrationName/Token in place
629
- // and the very next heartbeat tick uses the replacement
630
- // credentials without us having to tear down + restart the
631
- // interval.
632
- session.acHeartbeatHandle = startHeartbeat(
633
- session.acServerPort,
634
- () => session.acRegistrationName,
635
- () => session.acRegistrationToken,
636
- { onConflict: () => recoverFrom409(project, agent, session) },
637
- );
638
- }
639
-
640
- // #393 / quadwork#251: queue watcher — the actual mechanism by
641
- // which agents pick up chat. Without this an agent can be
642
- // registered + heartbeating yet still never respond, because
643
- // AgentChattr only writes to {data_dir}/{name}_queue.jsonl and
644
- // expects the agent side to poll + inject `mcp read`.
645
- if (session.acRegistrationName && session.term) {
646
- try {
647
- const { dir: acDir } = resolveProjectChattr(project);
648
- if (acDir) {
649
- const dataDir = path.join(acDir, "data");
650
- session.queueWatcherHandle = startQueueWatcher(
651
- dataDir,
652
- session.acRegistrationName,
653
- session.term,
654
- );
655
- }
656
- } catch {
657
- // best-effort — failure here just means no chat injection
658
- }
659
- }
660
-
661
- // #565: deferred restart — if the agent spawned without AC
662
- // registration (AC wasn't ready or registration failed), wait for
663
- // AC to come up then stop + respawn the agent so it gets the full
664
- // MCP CLI args (--mcp-config / -c mcp_servers...url) that can only
665
- // be set at process launch time.
666
- if (!session.acRegistrationName && session.acServerPort && session.acInjectMode) {
667
- const deferredRestart = async () => {
668
- const ready = await waitForAgentChattrReady(session.acServerPort, 60000);
669
- if (!ready) {
670
- // #572: log timeout so operators know the health monitor will
671
- // handle recovery when AC eventually comes up.
672
- console.log(`[#565] Agent ${agent}: AC not reachable after 60s — health monitor will restart agent when AC recovers.`);
673
- return;
674
- }
675
- // Guard: agent may have been stopped manually while we waited.
676
- const current = agentSessions.get(key);
677
- if (!current || !current.term || current.state !== "running") return;
678
- console.log(`[#565] Agent ${agent}: AC is now reachable — restarting agent to gain chat integration.`);
679
- await stopAgentSession(key);
680
- await spawnAgentPty(project, agent);
681
- };
682
- deferredRestart().catch(() => {});
683
- }
684
-
685
507
  term.onExit(({ exitCode }) => {
686
508
  const current = agentSessions.get(key);
687
509
  if (current && current.term === term) {
510
+ cleanupPtyDispatcher(key);
688
511
  current.state = "stopped";
689
512
  current.error = exitCode ? `exit:${exitCode}` : null;
690
513
  current.term = null;
@@ -692,27 +515,6 @@ async function spawnAgentPty(project, agent) {
692
515
  if (v.readyState <= 1) v.close(1000, `exited:${exitCode}`);
693
516
  }
694
517
  current.viewers.clear();
695
- // #391 / quadwork#250: a crashed PTY must also clear its
696
- // heartbeat interval (otherwise it leaks and a later /start
697
- // double-registers) and free the AgentChattr slot (otherwise
698
- // the agent stays falsely `active` forever and the next
699
- // register lands at slot 2). Deregister is best-effort.
700
- if (current.acHeartbeatHandle) {
701
- stopHeartbeat(current.acHeartbeatHandle);
702
- current.acHeartbeatHandle = null;
703
- }
704
- if (current.queueWatcherHandle) {
705
- stopQueueWatcher(current.queueWatcherHandle);
706
- current.queueWatcherHandle = null;
707
- }
708
- if (current.acRegistrationName && current.acServerPort) {
709
- deregisterAgent(current.acServerPort, current.acRegistrationName).catch(() => {});
710
- if (current.projectId && current.agentId) {
711
- try { clearPersistedAgentToken(current.projectId, current.agentId); } catch {}
712
- }
713
- current.acRegistrationName = null;
714
- current.acRegistrationToken = null;
715
- }
716
518
  }
717
519
  });
718
520
 
@@ -723,16 +525,16 @@ async function spawnAgentPty(project, agent) {
723
525
  }
724
526
  }
725
527
 
726
- // Helper: stop an agent session — kill PTY, close WS, deregister.
727
- // Async because deregister must complete before a restart re-registers,
728
- // otherwise the old slot stays occupied and a fresh register lands at
729
- // head-2 instead of slot 1 (#241).
730
528
  async function stopAgentSession(key) {
731
529
  const session = agentSessions.get(key);
732
530
  if (!session) {
733
531
  agentSessions.set(key, { projectId: null, agentId: null, term: null, viewers: new Set(), viewerDims: new Map(), lastDims: null, state: "stopped", error: null });
734
532
  return;
735
533
  }
534
+ if (session.projectId && session.agentId && !session._suppressLifecycleMsg) {
535
+ emitSystemMessage(session.projectId, `${session.agentId} left`);
536
+ }
537
+ cleanupPtyDispatcher(key);
736
538
  if (session.term) {
737
539
  try { session.term.kill(); } catch {}
738
540
  session.term = null;
@@ -743,33 +545,6 @@ async function stopAgentSession(key) {
743
545
  session.viewers.clear();
744
546
  session.state = "stopped";
745
547
  session.error = null;
746
- // Stop heartbeat before deregister so we don't race a final POST
747
- // against AgentChattr removing the name (#391 / quadwork#250).
748
- if (session.acHeartbeatHandle) {
749
- stopHeartbeat(session.acHeartbeatHandle);
750
- session.acHeartbeatHandle = null;
751
- }
752
- // Stop queue watcher (#393 / quadwork#251) — the PTY is gone,
753
- // injecting into a dead term would throw on the next tick.
754
- if (session.queueWatcherHandle) {
755
- stopQueueWatcher(session.queueWatcherHandle);
756
- session.queueWatcherHandle = null;
757
- }
758
- // Best-effort deregister from AgentChattr (#241) so the slot frees
759
- // and the next register lands at slot 1 instead of head-2.
760
- if (session.acRegistrationName && session.acServerPort) {
761
- try {
762
- await deregisterAgent(session.acServerPort, session.acRegistrationName);
763
- } catch {
764
- // best-effort — failures are non-fatal
765
- }
766
- if (session.projectId && session.agentId) {
767
- clearPersistedAgentToken(session.projectId, session.agentId);
768
- }
769
- session.acRegistrationName = null;
770
- session.acRegistrationToken = null;
771
- }
772
- // Clean up MCP auth proxy if running
773
548
  const [projectId, agentId] = key.split("/");
774
549
  if (projectId && agentId) stopMcpProxy(projectId, agentId);
775
550
  }
@@ -779,487 +554,15 @@ app.get("/api/agents", (_req, res) => {
779
554
  for (const [key, session] of agentSessions) {
780
555
  agents[key] = { state: session.state, error: session.error || null };
781
556
  }
782
- for (const [pid, proc] of chattrProcesses) {
783
- agents[`_agentchattr/${pid}`] = { state: proc.state, error: proc.error };
784
- }
785
557
  res.json(agents);
786
558
  });
787
559
 
788
- // #424 / quadwork#304: best-effort auto-snapshot of chat history
789
- // before any AgentChattr restart. Defense-in-depth against
790
- // destructive ops like /clear that rewrite AC's JSONL log in place
791
- // — per #303 the log itself IS persistent across normal restarts,
792
- // so the snapshot's job is to give the operator a point-in-time
793
- // rollback if the log gets clobbered, not to prevent history loss
794
- // on ordinary lifecycle events.
795
- //
796
- // Snapshot contents = the same envelope GET /api/project-history
797
- // returns, so an operator (or a future "restore" button) can feed
798
- // the file straight into POST /api/project-history for replay.
799
- const HISTORY_SNAPSHOT_LIMIT = 5;
800
-
801
- async function snapshotProjectHistory(projectId) {
802
- try {
803
- const snapDir = path.join(require("os").homedir(), ".quadwork", projectId, "history-snapshots");
804
- ensureSecureDir(snapDir);
805
- const res = await fetch(`http://127.0.0.1:${PORT}/api/project-history?project=${encodeURIComponent(projectId)}`, {
806
- signal: AbortSignal.timeout(30000),
807
- });
808
- if (!res.ok) {
809
- console.warn(`[snapshot] ${projectId} history fetch returned ${res.status}; skipping snapshot`);
810
- return false;
811
- }
812
- const text = await res.text();
813
- const stamp = new Date().toISOString().replace(/[:.]/g, "-");
814
- const outPath = path.join(snapDir, `${stamp}.json`);
815
- fs.writeFileSync(outPath, text);
816
- console.log(`[snapshot] ${projectId} → ${outPath}`);
817
- // Prune to the newest HISTORY_SNAPSHOT_LIMIT files so the
818
- // directory can't grow unbounded across weeks of restarts.
819
- try {
820
- const entries = fs.readdirSync(snapDir)
821
- .filter((f) => f.endsWith(".json"))
822
- .map((f) => ({ f, t: fs.statSync(path.join(snapDir, f)).mtimeMs }))
823
- .sort((a, b) => b.t - a.t);
824
- for (const old of entries.slice(HISTORY_SNAPSHOT_LIMIT)) {
825
- try { fs.unlinkSync(path.join(snapDir, old.f)); } catch {}
826
- }
827
- } catch {
828
- // non-fatal — stale snapshots just linger
829
- }
830
- return true;
831
- } catch (err) {
832
- console.warn(`[snapshot] ${projectId} snapshot failed: ${err.message || err}`);
833
- return false;
834
- }
835
- }
836
-
837
- // Per-project AgentChattr lifecycle: /api/agentchattr/:project/:action
838
- // Backward compat: /api/agentchattr/:action uses first project
839
- async function handleAgentChattr(req, res) {
840
- let projectId, action;
841
- if (req.params.action) {
842
- projectId = req.params.projectOrAction;
843
- action = req.params.action;
844
- } else {
845
- // Backward compat: single-param = action, use first project
846
- action = req.params.projectOrAction;
847
- const cfg = readConfig();
848
- projectId = cfg.projects?.[0]?.id || "_default";
849
- }
850
-
851
- const { url: chattrUrl } = resolveProjectChattr(projectId);
852
- const chattrPort = new URL(chattrUrl).port || "8300";
853
-
854
- // Find per-project config.toml. Phase 2E / #181: prefer the
855
- // per-project AgentChattr clone ROOT (where the web/CLI wizards now
856
- // write it as of #184/#185 — and where run.py actually reads it from).
857
- // Fall back to the legacy <working_dir>/agentchattr/config.toml for
858
- // v1 setups that haven't been migrated yet (#188).
859
- const cfg = readConfig();
860
- const project = cfg.projects?.find((p) => p.id === projectId);
861
- const { dir: resolvedAcDir } = resolveProjectChattr(projectId);
862
- let projectConfigToml = null;
863
- if (resolvedAcDir && fs.existsSync(path.join(resolvedAcDir, "config.toml"))) {
864
- projectConfigToml = path.join(resolvedAcDir, "config.toml");
865
- } else if (project?.working_dir) {
866
- const legacyToml = path.join(project.working_dir, "agentchattr", "config.toml");
867
- if (fs.existsSync(legacyToml)) projectConfigToml = legacyToml;
868
- }
869
-
870
- function getProc() {
871
- return chattrProcesses.get(projectId) || { process: null, state: "stopped", error: null };
872
- }
873
- function setProc(val) {
874
- chattrProcesses.set(projectId, val);
875
- }
876
-
877
- function regenerateConfigToml() {
878
- // If project has a config.toml, update the port to match current config
879
- if (!projectConfigToml || !fs.existsSync(projectConfigToml)) return;
880
- try {
881
- let content = fs.readFileSync(projectConfigToml, "utf-8");
882
- content = content.replace(/^port = \d+/m, `port = ${chattrPort}`);
883
- writeSecureFile(projectConfigToml, content);
884
- } catch {}
885
- }
886
-
887
- async function spawnChattr() {
888
- // Sync config.toml port before starting
889
- regenerateConfigToml();
890
-
891
- // Use project config.toml if available (isolated data dir + ports), otherwise fall back to --port
892
- const extraArgs = (projectConfigToml && fs.existsSync(projectConfigToml))
893
- ? []
894
- : ["--port", chattrPort];
895
-
896
- // Resolve AgentChattr from its cloned directory
897
- const { dir: acDir } = resolveProjectChattr(projectId);
898
- // #394: backfill sender-overflow CSS/JS patch on every spawn so
899
- // existing installs receive the fix without manual update.
900
- patchAgentchattrCss(acDir);
901
- const acSpawn = resolveChattrSpawn(acDir);
902
- if (!acSpawn) {
903
- setProc({ process: null, state: "error", error: `AgentChattr not installed. Clone it: git clone https://github.com/bcurts/agentchattr.git ${acDir}` });
904
- return null;
905
- }
906
-
907
- // #569: redirect AC stdout/stderr to a log file so operators can
908
- // diagnose startup failures. Append mode preserves restart history.
909
- const acLogDir = path.join(os.homedir(), ".quadwork", projectId);
910
- try { fs.mkdirSync(acLogDir, { recursive: true, mode: 0o700 }); } catch {}
911
- const acLogPath = path.join(acLogDir, "agentchattr.log");
912
- const acLogFd = fs.openSync(acLogPath, "a");
913
- const child = spawn(acSpawn.command, [...acSpawn.args, ...extraArgs], {
914
- cwd: acSpawn.cwd,
915
- env: process.env,
916
- stdio: ["ignore", acLogFd, acLogFd],
917
- detached: true,
918
- });
919
-
920
- // Close our copy of the log fd — child inherits its own copy.
921
- fs.closeSync(acLogFd);
922
-
923
- // If pid is undefined, spawn failed
924
- if (!child.pid) {
925
- setProc({ process: null, state: "error", error: "Failed to start AgentChattr — check that Python venv is set up in " + acDir + ". Log: " + acLogPath });
926
- child.on("error", () => {});
927
- return null;
928
- }
929
-
930
- child.unref();
931
- child.on("error", (err) => {
932
- setProc({ process: null, state: "error", error: err.message });
933
- });
934
- child.on("exit", (code) => {
935
- const cur = getProc();
936
- if (cur.process === child) {
937
- setProc({ process: null, state: "stopped", error: code ? `exit:${code}` : null });
938
- }
939
- });
940
- // #580: wait for AC to actually bind the port before declaring success.
941
- // On fast-start installs this resolves in 1-2s; prevents false-down
942
- // detection on slow starts that triggered ghost agent cascades.
943
- const ready = await waitForAgentChattrReady(chattrPort, 30000);
944
- if (ready) {
945
- setProc({ process: child, state: "running", error: null, runningSince: Date.now() });
946
- return child;
947
- } else {
948
- setProc({ process: child, state: "error", error: "AgentChattr did not become ready within 30s" });
949
- return null;
950
- }
951
- }
952
-
953
- // #386: Kill any process listening on the AC port. Handles orphaned
954
- // processes that survive QuadWork restarts (detached + unref'd spawns
955
- // lose their tracked reference when the Node process recycles).
956
- function killProcessOnPort(port, signal = "SIGTERM") {
957
- try {
958
- const pids = execFileSync("lsof", ["-ti", `TCP:${port}`, "-sTCP:LISTEN"], {
959
- encoding: "utf-8",
960
- timeout: 5000,
961
- stdio: ["pipe", "pipe", "pipe"],
962
- }).trim();
963
- if (!pids) return;
964
- for (const line of pids.split("\n")) {
965
- const pid = parseInt(line, 10);
966
- if (pid > 0) {
967
- try { process.kill(pid, signal); } catch {}
968
- }
969
- }
970
- } catch {
971
- // lsof exits non-zero when no matching process — expected
972
- }
973
- }
974
-
975
- // #386: Poll until the port is free or timeout expires.
976
- function waitForPortFree(port, timeoutMs = 3000) {
977
- const start = Date.now();
978
- return new Promise((resolve) => {
979
- function check() {
980
- try {
981
- execFileSync("lsof", ["-ti", `TCP:${port}`, "-sTCP:LISTEN"], {
982
- encoding: "utf-8",
983
- timeout: 2000,
984
- stdio: ["pipe", "pipe", "pipe"],
985
- });
986
- // Still occupied — retry if within budget
987
- if (Date.now() - start < timeoutMs) {
988
- setTimeout(check, 200);
989
- } else {
990
- resolve(false);
991
- }
992
- } catch {
993
- // lsof found nothing — port is free
994
- resolve(true);
995
- }
996
- }
997
- check();
998
- });
999
- }
1000
-
1001
- if (action === "start") {
1002
- const proc = getProc();
1003
- if (proc.state === "running" && proc.process) {
1004
- return res.json({ ok: true, state: "running", message: "Already running" });
1005
- }
1006
- // #401: validate AgentChattr is installed BEFORE killing anything on
1007
- // the port. Without this guard, clicking Start when AC is missing
1008
- // kills an unrelated process then fails with "not installed".
1009
- const { dir: acDir } = resolveProjectChattr(projectId);
1010
- const acSpawn = resolveChattrSpawn(acDir);
1011
- if (!acSpawn) {
1012
- const errMsg = `AgentChattr not installed. Clone it: git clone https://github.com/bcurts/agentchattr.git ${acDir}`;
1013
- setProc({ process: null, state: "error", error: errMsg });
1014
- return res.status(500).json({ ok: false, state: "error", error: errMsg });
1015
- }
1016
-
1017
- // #393: kill any orphaned process on the port before spawning
1018
- // (same pattern as restart/stop from #386).
1019
- killProcessOnPort(chattrPort);
1020
- const portFree = await waitForPortFree(chattrPort, 3000);
1021
- if (!portFree) {
1022
- console.warn(`[agentchattr] ${projectId} port ${chattrPort} still occupied after 3s — spawning anyway`);
1023
- }
1024
- try {
1025
- const child = await spawnChattr();
1026
- if (!child) {
1027
- const errProc = getProc();
1028
- return res.status(500).json({ ok: false, state: "error", error: errProc.error || "Failed to start AgentChattr" });
1029
- }
1030
- // Sync token after AgentChattr starts (it generates its own)
1031
- setTimeout(() => syncChattrToken(projectId), 2000);
1032
- res.json({ ok: true, state: "running", pid: child.pid });
1033
- } catch (err) {
1034
- setProc({ process: null, state: "error", error: err.message });
1035
- res.status(500).json({ ok: false, state: "error", error: err.message });
1036
- }
1037
- } else if (action === "stop") {
1038
- const proc = getProc();
1039
- if (proc.process) {
1040
- try { proc.process.kill("SIGTERM"); } catch {}
1041
- }
1042
- // #386: also kill any orphaned process holding the port
1043
- killProcessOnPort(chattrPort);
1044
- setProc({ process: null, state: "stopped", error: null });
1045
- res.json({ ok: true, state: "stopped" });
1046
- } else if (action === "restart") {
1047
- // #424 / quadwork#304: snapshot history before killing the
1048
- // process. Best-effort and non-blocking-on-failure so a flaky
1049
- // snapshot doesn't leave the operator unable to restart AC.
1050
- await snapshotProjectHistory(projectId).catch(() => {});
1051
- // #424 / quadwork#304 Phase 3: latch the opt-in BEFORE the
1052
- // spawn so a restart that itself clears the flag can't starve
1053
- // the auto-restore. We capture the snapshot filename we just
1054
- // wrote + the project's auto_restore_after_restart flag and
1055
- // replay it in the post-spawn tick below if both are set.
1056
- const preRestartCfg = readConfig();
1057
- const preRestartProject = preRestartCfg.projects?.find((p) => p.id === projectId);
1058
- const shouldAutoRestore = !!(preRestartProject && preRestartProject.auto_restore_after_restart);
1059
- const proc = getProc();
1060
- if (proc.process) {
1061
- console.log(`[agentchattr] ${projectId} restart: killing AC (PID: ${proc.process.pid})`);
1062
- try { proc.process.kill("SIGTERM"); } catch {}
1063
- }
1064
- // #386: also kill any orphaned process holding the port (handles
1065
- // detached processes that survived a QuadWork restart).
1066
- killProcessOnPort(chattrPort);
1067
- setProc({ process: null, state: "stopped", error: null });
1068
- // #582: wait up to 5s for the port to be free, then SIGKILL
1069
- // any remaining process as a fallback before spawning.
1070
- let portFree = await waitForPortFree(chattrPort, 5000);
1071
- if (!portFree) {
1072
- console.warn(`[agentchattr] ${projectId} port ${chattrPort} still occupied after 5s — sending SIGKILL`);
1073
- killProcessOnPort(chattrPort, "SIGKILL");
1074
- portFree = await waitForPortFree(chattrPort, 3000);
1075
- if (!portFree) {
1076
- const portErr = `Port ${chattrPort} still occupied — cannot restart`;
1077
- console.error(`[agentchattr] ${projectId} ${portErr}`);
1078
- setProc({ process: null, state: "error", error: portErr });
1079
- return res.status(500).json({ ok: false, state: "error", error: portErr });
1080
- }
1081
- }
1082
- console.log(`[agentchattr] ${projectId} restart: port ${chattrPort} is free, spawning AC`);
1083
- try {
1084
- const child = await spawnChattr();
1085
- if (!child) {
1086
- const errProc = getProc();
1087
- console.error(`[agentchattr] ${projectId} restart: spawnChattr failed — ${errProc.error || "unknown error"}`);
1088
- return res.status(500).json({ ok: false, state: "error", error: errProc.error || "Failed to start AgentChattr" });
1089
- }
1090
- console.log(`[agentchattr] ${projectId} restart: AC spawned and ready (PID: ${child.pid})`);
1091
- // Sync token after AgentChattr restarts
1092
- setTimeout(() => syncChattrToken(projectId), 2000);
1093
- // #424 / quadwork#304 Phase 3: optional auto-restore.
1094
- // Fire the restore 3s after spawn so AC's ws is ready.
1095
- // Best-effort: never blocks the restart response or
1096
- // rolls back on error.
1097
- if (shouldAutoRestore) {
1098
- setTimeout(async () => {
1099
- try {
1100
- const snapDir = path.join(require("os").homedir(), ".quadwork", projectId, "history-snapshots");
1101
- if (!fs.existsSync(snapDir)) return;
1102
- const newest = fs.readdirSync(snapDir)
1103
- .filter((f) => f.endsWith(".json"))
1104
- .map((f) => ({ f, t: fs.statSync(path.join(snapDir, f)).mtimeMs }))
1105
- .sort((a, b) => b.t - a.t)[0];
1106
- if (!newest) return;
1107
- const r = await fetch(`http://127.0.0.1:${PORT}/api/project-history/restore?project=${encodeURIComponent(projectId)}&name=${encodeURIComponent(newest.f)}`, {
1108
- method: "POST",
1109
- });
1110
- if (r.ok) console.log(`[snapshot] ${projectId} auto-restored ${newest.f}`);
1111
- else console.warn(`[snapshot] ${projectId} auto-restore returned ${r.status}`);
1112
- } catch (err) {
1113
- console.warn(`[snapshot] ${projectId} auto-restore failed: ${err.message || err}`);
1114
- }
1115
- }, 3000);
1116
- }
1117
- res.json({ ok: true, state: "running", pid: child.pid });
1118
- // #447: auto-reset all agents after AC restart so they get
1119
- // fresh MCP tokens. #581: mark reset as scheduled immediately
1120
- // so the health monitor skips its own reset while ours is in-flight.
1121
- // #579: also skip if a reset already succeeded within the last 30s.
1122
- // Multiple restart sources (bridge-migrate, health monitor, dashboard)
1123
- // can fire in rapid succession — only the first should trigger a reset.
1124
- const existingReset = _acHealth.resetState.get(projectId);
1125
- const resetRecentlyDone = existingReset &&
1126
- (existingReset.status === "succeeded" || existingReset.status === "scheduled") &&
1127
- Date.now() - existingReset.timestamp < 30_000;
1128
- if (resetRecentlyDone) {
1129
- console.log(`[agentchattr] ${projectId} skipping auto-reset — one already ${existingReset.status} ${Math.round((Date.now() - existingReset.timestamp) / 1000)}s ago`);
1130
- } else {
1131
- _acHealth.resetState.set(projectId, { status: "scheduled", timestamp: Date.now() });
1132
- }
1133
- if (!resetRecentlyDone) setTimeout(async () => {
1134
- try {
1135
- const resetResp = await fetch(`http://127.0.0.1:${PORT}/api/agents/${encodeURIComponent(projectId)}/reset`, {
1136
- method: "POST",
1137
- });
1138
- if (resetResp.ok) {
1139
- const resetData = await resetResp.json();
1140
- _acHealth.resetState.set(projectId, { status: "succeeded", timestamp: Date.now() });
1141
- console.log(`[agentchattr] ${projectId} auto-reset ${resetData.restarted} agent(s) after AC restart`);
1142
- } else {
1143
- _acHealth.resetState.set(projectId, { status: "failed", timestamp: Date.now() });
1144
- console.warn(`[agentchattr] ${projectId} agent reset after AC restart returned ${resetResp.status}`);
1145
- }
1146
- } catch (err) {
1147
- _acHealth.resetState.set(projectId, { status: "failed", timestamp: Date.now() });
1148
- console.warn(`[agentchattr] ${projectId} agent reset after AC restart failed: ${err.message || err}`);
1149
- }
1150
- }, 2000);
1151
- } catch (err) {
1152
- setProc({ process: null, state: "error", error: err.message });
1153
- res.status(500).json({ ok: false, state: "error", error: err.message });
1154
- }
1155
- } else if (action === "update") {
1156
- // Update AgentChattr: stop → git pull → pip install → restart
1157
- const { dir: acDir } = resolveProjectChattr(projectId);
1158
- if (!acDir || !fs.existsSync(path.join(acDir, "run.py"))) {
1159
- return res.status(400).json({ ok: false, error: "AgentChattr not installed at " + (acDir || "unknown") });
1160
- }
1161
- try {
1162
- // Stop running process before pulling. Snapshot first so a
1163
- // botched git pull can still be rolled back from disk.
1164
- // #424 / quadwork#304: best-effort.
1165
- await snapshotProjectHistory(projectId).catch(() => {});
1166
- // Latch the auto-restore opt-in BEFORE stop, same as the
1167
- // explicit restart branch above — a config mutation during
1168
- // the git pull shouldn't starve the replay.
1169
- const updateCfgPre = readConfig();
1170
- const updateProjectPre = updateCfgPre.projects?.find((p) => p.id === projectId);
1171
- const updateShouldAutoRestore = !!(updateProjectPre && updateProjectPre.auto_restore_after_restart);
1172
- const proc = getProc();
1173
- const wasRunning = proc.process && proc.state === "running";
1174
- if (wasRunning) {
1175
- try { proc.process.kill("SIGTERM"); } catch {}
1176
- }
1177
- // #386: kill orphaned processes on the port too
1178
- killProcessOnPort(chattrPort);
1179
- if (wasRunning) {
1180
- setProc({ process: null, state: "stopped", error: null });
1181
- // Wait for the port to be released before pulling/restarting
1182
- await waitForPortFree(chattrPort, 3000);
1183
- }
1184
-
1185
- const pullResult = execFileSync("git", ["pull"], { cwd: acDir, encoding: "utf-8", timeout: 30000, stdio: "pipe" }).trim();
1186
- // #388: re-apply sender-overflow CSS patch after git pull
1187
- patchAgentchattrCss(acDir);
1188
- // #629: re-apply crash timeout patch after git pull (pull may revert app.py)
1189
- patchCrashTimeout(acDir);
1190
- const venvPython = path.join(acDir, ".venv", "bin", "python");
1191
- let pipResult = "";
1192
- const reqFile = path.join(acDir, "requirements.txt");
1193
- if (fs.existsSync(venvPython) && fs.existsSync(reqFile)) {
1194
- pipResult = execFileSync(venvPython, ["-m", "pip", "install", "-r", "requirements.txt"], { cwd: acDir, encoding: "utf-8", timeout: 120000, stdio: "pipe" }).trim();
1195
- }
1196
-
1197
- // Restart if it was running before the update
1198
- let restarted = false;
1199
- if (wasRunning) {
1200
- const child = await spawnChattr();
1201
- restarted = !!child;
1202
- if (child) {
1203
- setTimeout(() => syncChattrToken(projectId).catch(() => {}), 2000);
1204
- // #424 / quadwork#304 Phase 3: auto-restore after an
1205
- // update-triggered restart too (t2a re-review). Same
1206
- //3s wait + newest-snapshot-by-mtime path as the explicit
1207
- // restart branch, using the pre-stop latched opt-in.
1208
- if (updateShouldAutoRestore) {
1209
- setTimeout(async () => {
1210
- try {
1211
- const snapDir = path.join(require("os").homedir(), ".quadwork", projectId, "history-snapshots");
1212
- if (!fs.existsSync(snapDir)) return;
1213
- const newest = fs.readdirSync(snapDir)
1214
- .filter((f) => f.endsWith(".json"))
1215
- .map((f) => ({ f, t: fs.statSync(path.join(snapDir, f)).mtimeMs }))
1216
- .sort((a, b) => b.t - a.t)[0];
1217
- if (!newest) return;
1218
- const r = await fetch(`http://127.0.0.1:${PORT}/api/project-history/restore?project=${encodeURIComponent(projectId)}&name=${encodeURIComponent(newest.f)}`, {
1219
- method: "POST",
1220
- });
1221
- if (r.ok) console.log(`[snapshot] ${projectId} auto-restored ${newest.f} after update`);
1222
- else console.warn(`[snapshot] ${projectId} post-update auto-restore returned ${r.status}`);
1223
- } catch (err) {
1224
- console.warn(`[snapshot] ${projectId} post-update auto-restore failed: ${err.message || err}`);
1225
- }
1226
- }, 3000);
1227
- }
1228
- }
1229
- }
560
+ // Per-project AgentChattr lifecycle (removed in #723 AC stack deleted)
1230
561
 
1231
- res.json({ ok: true, pull: pullResult, pip: pipResult, restarted });
1232
- } catch (err) {
1233
- res.status(500).json({ ok: false, error: err.message });
1234
- }
1235
- } else {
1236
- res.status(400).json({ error: "Unknown action" });
1237
- }
562
+ // Stub endpoints return 410 Gone so dashboard code degrades gracefully
563
+ async function handleAgentChattr(_req, res) {
564
+ return res.status(410).json({ ok: false, error: "AgentChattr removed in Phase 3" });
1238
565
  }
1239
- app.post("/api/agentchattr/:projectOrAction/:action", handleAgentChattr);
1240
- app.post("/api/agentchattr/:projectOrAction", handleAgentChattr);
1241
-
1242
- // --- Reset agents: deregister all registered slots ---
1243
- // AgentChattr doesn't expose staleness metadata, so this clears all slots.
1244
- // Agents' wrapper heartbeat will auto-re-register with clean names.
1245
-
1246
- // #416: AC health status endpoint — returns the health monitor state
1247
- // for a project so the dashboard can surface auto-restart events.
1248
- app.get("/api/agentchattr/:project/health", (req, res) => {
1249
- const projectId = req.params.project;
1250
- const proc = chattrProcesses.get(projectId);
1251
- const health = _acHealth.state.get(projectId) || { lastRestart: 0, consecutiveFailures: 0 };
1252
- res.json({
1253
- state: proc?.state || "unknown",
1254
- error: proc?.error || null,
1255
- autoRestart: {
1256
- lastRestart: health.lastRestart || null,
1257
- consecutiveFailures: health.consecutiveFailures,
1258
- gaveUp: health.consecutiveFailures >= 3,
1259
- },
1260
- });
1261
- });
1262
-
1263
566
  app.post("/api/agents/:project/reset", async (req, res) => {
1264
567
  const projectId = req.params.project;
1265
568
 
@@ -1291,6 +594,8 @@ app.post("/api/agents/:project/reset", async (req, res) => {
1291
594
 
1292
595
  // Stop all agents first (handles deregistration best-effort)
1293
596
  for (const agentId of allAgentIds) {
597
+ const s = agentSessions.get(`${projectId}/${agentId}`);
598
+ if (s) s._suppressLifecycleMsg = true;
1294
599
  await stopAgentSession(`${projectId}/${agentId}`);
1295
600
  }
1296
601
 
@@ -1298,8 +603,9 @@ app.post("/api/agents/:project/reset", async (req, res) => {
1298
603
  let restarted = 0;
1299
604
  const errors = [];
1300
605
  for (const agentId of allAgentIds) {
1301
- const result = await spawnAgentPty(projectId, agentId);
606
+ const result = await spawnAgentPty(projectId, agentId, { suppressLifecycleMsg: true });
1302
607
  if (result.ok) {
608
+ emitSystemMessage(projectId, `${agentId} restarted`);
1303
609
  restarted++;
1304
610
  } else {
1305
611
  errors.push(`${agentId}: ${result.error}`);
@@ -1317,7 +623,7 @@ app.post("/api/agents/:project/reset", async (req, res) => {
1317
623
  }
1318
624
  });
1319
625
 
1320
- // --- Full Reset: restart all AC + agents across all projects (#657) ---
626
+ // --- Full Reset: restart all agents across all projects (#657) ---
1321
627
 
1322
628
  app.post("/api/full-reset", async (_req, res) => {
1323
629
  const start = Date.now();
@@ -1326,42 +632,21 @@ app.post("/api/full-reset", async (_req, res) => {
1326
632
  const cfg = readConfig();
1327
633
  const projects = (cfg.projects || []).filter((p) => !p.archived);
1328
634
 
1329
- // 1. Stop all agent sessions
1330
635
  console.log("[full-reset] stopping all agent sessions...");
1331
636
  const sessionKeys = [...agentSessions.keys()];
1332
637
  for (const key of sessionKeys) {
1333
638
  await stopAgentSession(key);
1334
639
  }
1335
640
 
1336
- // 2. Stop Butler if running
1337
641
  console.log("[full-reset] stopping Butler...");
1338
642
  stopButlerPty();
1339
643
 
1340
- // 3. Re-run startup migrations
1341
644
  console.log("[full-reset] running startup migrations...");
1342
645
  runStartupMigrations(cfg);
1343
646
 
1344
- // 4. Restart each project's AC + agents
1345
647
  let totalAgents = 0;
1346
648
  const errors = [];
1347
649
  for (const project of projects) {
1348
- console.log(`[full-reset] restarting AC for ${project.id}...`);
1349
- // Pre-mark reset as scheduled so AC restart's auto-reset timer is suppressed
1350
- _acHealth.resetState.set(project.id, { status: "scheduled", timestamp: Date.now() });
1351
- try {
1352
- const acResp = await fetch(`http://127.0.0.1:${PORT}/api/agentchattr/${encodeURIComponent(project.id)}/restart`, {
1353
- method: "POST",
1354
- });
1355
- if (!acResp.ok) {
1356
- const errData = await acResp.json().catch(() => ({}));
1357
- errors.push(`${project.id}: AC restart failed — ${errData.error || acResp.status}`);
1358
- continue;
1359
- }
1360
- } catch (err) {
1361
- errors.push(`${project.id}: AC — ${err.message}`);
1362
- continue;
1363
- }
1364
- // Explicitly reset agents and await result
1365
650
  try {
1366
651
  const resetResp = await fetch(`http://127.0.0.1:${PORT}/api/agents/${encodeURIComponent(project.id)}/reset`, {
1367
652
  method: "POST",
@@ -1377,7 +662,6 @@ app.post("/api/full-reset", async (_req, res) => {
1377
662
  }
1378
663
  }
1379
664
 
1380
- // 5. Restart Butler if enabled
1381
665
  if (cfg.butler?.enabled) {
1382
666
  console.log("[full-reset] restarting Butler...");
1383
667
  const result = spawnButlerPty();
@@ -1435,16 +719,45 @@ app.post("/api/agents/:project/:agent/restart", async (req, res) => {
1435
719
 
1436
720
  // #241: must await deregister before respawn so the slot frees and
1437
721
  // the fresh register lands at slot 1 instead of head-2.
722
+ const existing = agentSessions.get(key);
723
+ if (existing) existing._suppressLifecycleMsg = true;
1438
724
  await stopAgentSession(key);
1439
725
 
1440
- const result = await spawnAgentPty(project, agent);
726
+ const result = await spawnAgentPty(project, agent, { suppressLifecycleMsg: true });
1441
727
  if (result.ok) {
728
+ emitSystemMessage(project, `${agent} restarted`);
1442
729
  res.json({ ok: true, state: "running", pid: result.pid });
1443
730
  } else {
1444
731
  res.status(500).json({ ok: false, state: "error", error: result.error });
1445
732
  }
1446
733
  });
1447
734
 
735
+ // --- #706: Manual interrupt — send Ctrl+C to agent PTY ---
736
+
737
+ app.post("/api/agents/:project/:agent/interrupt", (req, res) => {
738
+ const key = `${req.params.project}/${req.params.agent}`;
739
+ const session = agentSessions.get(key);
740
+ if (!session || !session.term) {
741
+ return res.json({ ok: false, error: "Agent not running" });
742
+ }
743
+ safeWrite(session.term, "\x03");
744
+ console.log(`[interrupt] ${key}: operator sent Ctrl+C`);
745
+ res.json({ ok: true });
746
+ });
747
+
748
+ app.post("/api/agents/:project/interrupt-all", (req, res) => {
749
+ const { project } = req.params;
750
+ let count = 0;
751
+ for (const [key, session] of agentSessions) {
752
+ if (!key.startsWith(`${project}/`)) continue;
753
+ if (session.state !== "running" || !session.term) continue;
754
+ safeWrite(session.term, "\x03");
755
+ count++;
756
+ }
757
+ console.log(`[interrupt] ${project}: operator sent Ctrl+C to ${count} agent(s)`);
758
+ res.json({ ok: true, interrupted: count });
759
+ });
760
+
1448
761
  // --- Sessions tracking (for /api/projects dashboard) ---
1449
762
 
1450
763
  // Expose agentSessions to migrated routes
@@ -2380,327 +1693,35 @@ setInterval(autoStopPollingTick, AUTO_STOP_POLL_INTERVAL_MS);
2380
1693
  // delay is tens of seconds. Skipping projects without the opt-in
2381
1694
  // keeps the poller cheap for single-project setups.
2382
1695
 
2383
- const _loopGuardPausedState = new Map(); // projectId -> { paused: bool, scheduled: Timeout? }
2384
- const LOOP_GUARD_POLL_INTERVAL_MS = 10000;
2385
-
2386
- async function checkLoopGuardPause(project) {
2387
- if (!project || !project.auto_continue_loop_guard) return;
2388
- const { url: base, token: sessionToken } = resolveProjectChattr(project.id);
2389
- if (!base) return;
2390
- let paused = false;
2391
- try {
2392
- const r = await fetch(`${base}/api/status`, {
2393
- headers: sessionToken ? { "x-session-token": sessionToken } : {},
2394
- signal: AbortSignal.timeout(5000),
2395
- });
2396
- if (!r.ok) return;
2397
- const data = await r.json();
2398
- paused = !!(data && data.paused);
2399
- } catch {
2400
- return;
2401
- }
2402
- const state = _loopGuardPausedState.get(project.id) || { paused: false, scheduled: null };
2403
- // Transition false → true: schedule an auto-continue after the delay.
2404
- if (paused && !state.paused && !state.scheduled) {
2405
- const delaySec = Number.isFinite(project.auto_continue_delay_sec) && project.auto_continue_delay_sec >= 5
2406
- ? project.auto_continue_delay_sec
2407
- : 30;
2408
- console.log(`[loop-guard] ${project.id} paused — auto-continue in ${delaySec}s`);
2409
- state.scheduled = setTimeout(async () => {
2410
- try {
2411
- // Re-check the opt-in at fire time so a checkbox disable
2412
- // mid-wait actually stops the auto-continue.
2413
- const freshCfg = readConfig();
2414
- const fresh = freshCfg.projects?.find((p) => p.id === project.id);
2415
- if (!fresh || !fresh.auto_continue_loop_guard) {
2416
- console.log(`[loop-guard] ${project.id} auto-continue cancelled (opt-in disabled during wait)`);
2417
- } else {
2418
- // Re-check the router's pause state at fire time too. The
2419
- // 10s status poller may not have seen a manual operator
2420
- // /continue yet when the delay window (5–9s) is shorter
2421
- // than the poll interval — without this, a manual resume
2422
- // inside a 5s wait would be followed by a stale auto
2423
- // /continue that clobbers hop_count on an already-running
2424
- // chain (router.continue_routing resets the counter
2425
- // unconditionally). The re-check closes the race.
2426
- let stillPaused = false;
2427
- try {
2428
- const { url: freshBase, token: freshToken } = resolveProjectChattr(project.id);
2429
- if (freshBase) {
2430
- const sr = await fetch(`${freshBase}/api/status`, {
2431
- headers: freshToken ? { "x-session-token": freshToken } : {},
2432
- signal: AbortSignal.timeout(5000),
2433
- });
2434
- if (sr.ok) {
2435
- const sd = await sr.json();
2436
- stillPaused = !!(sd && sd.paused);
2437
- }
2438
- }
2439
- } catch {
2440
- // Status re-check failed — fall back to "don't fire".
2441
- // Stuck pause will still be caught on the next 10s tick.
2442
- }
2443
- if (!stillPaused) {
2444
- console.log(`[loop-guard] ${project.id} auto-continue cancelled (router already resumed)`);
2445
- } else {
2446
- const res = await fetch(`http://127.0.0.1:${PORT}/api/chat?project=${encodeURIComponent(project.id)}`, {
2447
- method: "POST",
2448
- headers: { "Content-Type": "application/json" },
2449
- body: JSON.stringify({ text: "/continue", channel: "general" }),
2450
- });
2451
- if (res.ok) console.log(`[loop-guard] ${project.id} auto-continued`);
2452
- else console.warn(`[loop-guard] ${project.id} auto-continue POST returned ${res.status}`);
2453
- }
2454
- }
2455
- } catch (err) {
2456
- console.warn(`[loop-guard] ${project.id} auto-continue failed: ${err.message || err}`);
2457
- }
2458
- const s2 = _loopGuardPausedState.get(project.id);
2459
- if (s2) s2.scheduled = null;
2460
- }, delaySec * 1000);
2461
- }
2462
- // Transition true → false: clear any pending timer.
2463
- if (!paused && state.paused && state.scheduled) {
2464
- clearTimeout(state.scheduled);
2465
- state.scheduled = null;
2466
- }
2467
- state.paused = paused;
2468
- _loopGuardPausedState.set(project.id, state);
2469
- }
2470
-
2471
- function runLoopGuardPollingTick() {
2472
- try {
2473
- const cfg = readConfig();
2474
- for (const p of (cfg.projects || [])) {
2475
- if (p && p.auto_continue_loop_guard) checkLoopGuardPause(p);
2476
- }
2477
- } catch {
2478
- // config unreadable — next tick will retry
2479
- }
2480
- }
2481
-
2482
- setInterval(runLoopGuardPollingTick, LOOP_GUARD_POLL_INTERVAL_MS);
2483
-
2484
1696
  // --- Start ---
2485
1697
 
2486
- // ---------------------------------------------------------------------------
2487
- // #416: AC health monitor — auto-restart AgentChattr on crash detection.
2488
- // Runs a TCP connect probe every 30s for each project with a "running" AC
2489
- // process. If the port is dead, auto-restarts (reusing the existing restart
2490
- // logic). Rate-limited to one restart per 60s per project; gives up after
2491
- // 3 consecutive failures and surfaces a persistent error.
2492
- // ---------------------------------------------------------------------------
2493
- // #572: restart agents that are running without AC registration after AC
2494
- // recovers from a crash. Scans agentSessions for the given project,
2495
- // finds agents missing acRegistrationName, and stop+respawns them so
2496
- // they get MCP CLI flags at launch time.
2497
- async function restartUnregisteredAgents(projectId) {
2498
- const toRestart = [];
2499
- for (const [key, session] of agentSessions) {
2500
- if (session.projectId !== projectId) continue;
2501
- if (session.acRegistrationName) continue; // already registered
2502
- if (session.state !== "running") continue;
2503
- if (!session.acServerPort || !session.acInjectMode) continue;
2504
- toRestart.push({ key, agentId: session.agentId });
2505
- }
2506
-
2507
- if (toRestart.length === 0) return;
2508
- const samplePort = agentSessions.get(toRestart[0].key)?.acServerPort || "?";
2509
- console.log(`[health] AC recovered on port ${samplePort} — restarting ${toRestart.length} agent(s) for chat integration`);
2510
-
2511
- for (const { key, agentId } of toRestart) {
2512
- try {
2513
- console.log(`[health] Restarting agent ${agentId} for project ${projectId} to gain chat integration`);
2514
- await stopAgentSession(key);
2515
- await spawnAgentPty(projectId, agentId);
2516
- } catch (err) {
2517
- console.error(`[health] Failed to restart agent ${agentId}: ${err.message}`);
2518
- }
2519
- }
2520
- }
1698
+ // #705: auto-interrupt agents stuck with no PTY output for 10 minutes.
1699
+ const WATCHDOG_TIMEOUT_MS = 10 * 60 * 1000;
1700
+ let _watchdogHandle = null;
2521
1701
 
2522
- const _acHealth = {
2523
- // Per-project: { lastRestart: timestamp, consecutiveFailures: number }
2524
- state: new Map(),
2525
- intervalHandle: null,
2526
- // #581: per-project reset state prevents duplicate resets per restart event.
2527
- // Values: { status: "scheduled"|"succeeded"|"failed", timestamp: number }
2528
- resetState: new Map(),
2529
- // #579: per-project grace period. Projects whose AC entered "running"
2530
- // within the last 60s are skipped by the health monitor so startup
2531
- // migrations (bridge-migrate, ghost-fix) and fresh spawns can settle.
2532
- // Tracked via `runningSince` in chattrProcesses entries.
2533
- };
2534
-
2535
- function isPortAlive(port) {
2536
- return new Promise((resolve) => {
2537
- const sock = net.createConnection({ port, host: "127.0.0.1" }, () => {
2538
- sock.destroy();
2539
- resolve(true);
2540
- });
2541
- sock.on("error", () => resolve(false));
2542
- sock.setTimeout(2000, () => { sock.destroy(); resolve(false); });
2543
- });
2544
- }
2545
-
2546
- async function acHealthCheck() {
2547
- const cfg = readConfig();
2548
- for (const project of (cfg.projects || [])) {
2549
- const proc = chattrProcesses.get(project.id);
2550
- // Only monitor projects that were explicitly started (state === "running"
2551
- // or had a process). Skip intentionally stopped projects.
2552
- if (!proc || proc.state === "stopped") continue;
2553
- // #579: per-project grace period — skip projects whose AC entered
2554
- // "running" within the last 60s. This lets cmdStart spawns and
2555
- // startup migrations (bridge-migrate, ghost-fix) settle before the
2556
- // monitor acts, regardless of when the project was created.
2557
- if (proc.runningSince && Date.now() - proc.runningSince < 60_000) continue;
2558
-
2559
- const { url } = resolveProjectChattr(project.id);
2560
- const portMatch = url.match(/:(\d+)/);
2561
- const port = portMatch ? parseInt(portMatch[1], 10) : 8300;
2562
-
2563
- const alive = await isPortAlive(port);
2564
- const health = _acHealth.state.get(project.id) || { lastRestart: 0, consecutiveFailures: 0 };
2565
-
2566
- if (alive) {
2567
- // Healthy — reset failure counter
2568
- if (health.consecutiveFailures > 0) {
2569
- console.log(`[health] AC for ${project.id} recovered (port ${port} alive)`);
2570
- // #572: restart agents that are running without chat integration.
2571
- // These are agents where the #565 deferred restart timed out, or
2572
- // agents spawned while AC was down. MCP flags are set at process
2573
- // launch, so a full stop+respawn is required.
2574
- // #581: dedupe — skip if a reset is in-flight or succeeded within 60s.
2575
- // If "scheduled" (in-flight), keep consecutiveFailures=1 so the next
2576
- // healthy tick re-enters this branch and retries if state became "failed".
2577
- const rs = _acHealth.resetState.get(project.id);
2578
- const resetSucceeded = rs && rs.status === "succeeded" && Date.now() - rs.timestamp < 60000;
2579
- const resetInFlight = rs && rs.status === "scheduled";
2580
- if (resetSucceeded) {
2581
- // Already handled — clear failures normally
2582
- } else if (resetInFlight) {
2583
- // In-flight — preserve failures so we retry next tick if it fails
2584
- health.consecutiveFailures = 1;
2585
- _acHealth.state.set(project.id, health);
2586
- continue;
2587
- } else {
2588
- // No recent reset or previous attempt failed — fire one
2589
- _acHealth.resetState.set(project.id, { status: "scheduled", timestamp: Date.now() });
2590
- restartUnregisteredAgents(project.id).then(() => {
2591
- _acHealth.resetState.set(project.id, { status: "succeeded", timestamp: Date.now() });
2592
- }).catch((err) => {
2593
- _acHealth.resetState.set(project.id, { status: "failed", timestamp: Date.now() });
2594
- console.error(`[health] Failed to restart unregistered agents for ${project.id}:`, err.message);
2595
- });
2596
- }
2597
- }
2598
- health.consecutiveFailures = 0;
2599
- _acHealth.state.set(project.id, health);
2600
- continue;
2601
- }
2602
-
2603
- // Port is dead — check rate limits
2604
- if (health.consecutiveFailures >= 3) {
2605
- // Already gave up — don't spam restarts. The error state persists
2606
- // in chattrProcesses for the dashboard to surface.
2607
- continue;
2608
- }
2609
-
2610
- const now = Date.now();
2611
- if (now - health.lastRestart < 60_000) {
2612
- // Too soon since last restart attempt
2613
- continue;
2614
- }
2615
-
2616
- health.consecutiveFailures++;
2617
- health.lastRestart = now;
2618
- _acHealth.state.set(project.id, health);
2619
-
2620
- console.warn(`[health] AC for ${project.id} on port ${port} is down (failure ${health.consecutiveFailures}/3) — auto-restarting`);
2621
-
2622
- // Call the existing restart endpoint internally so we reuse the
2623
- // hardened path (killProcessOnPort, waitForPortFree, snapshot,
2624
- // auto-restore) instead of reimplementing spawn logic inline.
2625
- try {
2626
- const resp = await fetch(`http://127.0.0.1:${PORT}/api/agentchattr/${encodeURIComponent(project.id)}/restart`, {
2627
- method: "POST",
2628
- timeout: 15000,
2629
- });
2630
- if (resp.ok) {
2631
- const data = await resp.json();
2632
- console.log(`[health] AC for ${project.id} auto-restarted (PID: ${data.pid})`);
2633
- // #447: agent reset is now chained inside the restart endpoint
2634
- // itself (fires on a 2s timer), so no separate call needed here.
2635
- } else {
2636
- const body = await resp.text().catch(() => "");
2637
- console.error(`[health] AC auto-restart failed for ${project.id}: ${resp.status} ${body.slice(0, 120)}`);
2638
- chattrProcesses.set(project.id, { process: null, state: "error", error: `Auto-restart failed: ${resp.status}` });
2639
- }
2640
- } catch (err) {
2641
- console.error(`[health] AC auto-restart failed for ${project.id}:`, err.message);
2642
- chattrProcesses.set(project.id, { process: null, state: "error", error: `Auto-restart failed: ${err.message}` });
1702
+ function watchdogCheck() {
1703
+ for (const [key, session] of agentSessions) {
1704
+ if (session.state !== "running" || !session.term) continue;
1705
+ if (!session.lastOutputAt) continue;
1706
+ // #732: skip file-chat projects — idle is normal, PTY dispatch wakes them
1707
+ if (routes.getProjectChatMode(session.projectId) === "file") continue;
1708
+ if (Date.now() - session.lastOutputAt > WATCHDOG_TIMEOUT_MS) {
1709
+ console.log(`[watchdog] ${key}: no output for 10m sending Ctrl+C`);
1710
+ safeWrite(session.term, "\x03");
1711
+ session.lastOutputAt = Date.now();
2643
1712
  }
2644
1713
  }
2645
1714
  }
2646
1715
 
2647
- function startAcHealthMonitor() {
2648
- if (_acHealth.intervalHandle) return;
2649
- _acHealth.intervalHandle = setInterval(acHealthCheck, 30_000);
2650
- console.log("[health] AC health monitor started (30s interval, per-project 60s grace)");
1716
+ function startWatchdog() {
1717
+ if (_watchdogHandle) return;
1718
+ _watchdogHandle = setInterval(watchdogCheck, 60_000);
1719
+ console.log("[watchdog] stuck-agent watchdog started (60s interval, 10m threshold)");
2651
1720
  }
2652
1721
 
2653
1722
  // #657: extracted startup migrations so full-reset can re-run them
2654
1723
  function runStartupMigrations(cfg) {
2655
1724
  const projects = (cfg.projects || []).filter((p) => !p.archived);
2656
- const acRestartNeeded = [];
2657
-
2658
- // bridge-migrate
2659
- for (const p of projects) {
2660
- const acPath = projectAgentchattrConfigPath(p.id);
2661
- if (!fs.existsSync(acPath)) continue;
2662
- try {
2663
- const before = fs.readFileSync(acPath, "utf-8");
2664
- const hadOldDc = /^\[agents\.discord-bridge\]\s*$/m.test(before);
2665
- const hadOldTg = /^\[agents\.telegram-bridge\]\s*$/m.test(before);
2666
- const dc = patchAgentchattrConfigForDiscordBridge(before);
2667
- const tg = patchAgentchattrConfigForTelegramBridge(dc.text);
2668
- if (dc.changed || tg.changed) {
2669
- fs.writeFileSync(acPath, tg.text);
2670
- console.log(`[bridge-migrate] ${p.id}: migrated AC config slugs`);
2671
- if (hadOldDc || hadOldTg) {
2672
- if (!acRestartNeeded.includes(p.id)) acRestartNeeded.push(p.id);
2673
- }
2674
- }
2675
- } catch {}
2676
- }
2677
-
2678
- // bridge-refresh
2679
- const DISCORD_BRIDGE_SRC = path.join(__dirname, "..", "bridges", "discord", "discord_bridge.py");
2680
- const DISCORD_BRIDGE_DEST = path.join(os.homedir(), ".quadwork", "agentchattr-discord", "discord_bridge.py");
2681
- if (fs.existsSync(DISCORD_BRIDGE_SRC) && fs.existsSync(path.dirname(DISCORD_BRIDGE_DEST))) {
2682
- try {
2683
- fs.copyFileSync(DISCORD_BRIDGE_SRC, DISCORD_BRIDGE_DEST);
2684
- console.log("[bridge-refresh] refreshed Discord bridge script from package");
2685
- } catch (err) {
2686
- console.warn(`[bridge-refresh] failed to refresh Discord bridge script: ${err.message || err}`);
2687
- }
2688
- }
2689
-
2690
- // bridge slug patches
2691
- const BRIDGE_SLUG_PATCHES = [
2692
- { file: path.join(os.homedir(), ".quadwork", "agentchattr-telegram", "telegram_bridge.py"), old: '"telegram-bridge"', replacement: '"tg"' },
2693
- { file: path.join(os.homedir(), ".quadwork", "agentchattr-discord", "discord_bridge.py"), old: '"discord-bridge"', replacement: '"dc"' },
2694
- ];
2695
- for (const { file, old, replacement } of BRIDGE_SLUG_PATCHES) {
2696
- try {
2697
- if (!fs.existsSync(file)) continue;
2698
- const content = fs.readFileSync(file, "utf-8");
2699
- if (!content.includes(old)) continue;
2700
- fs.writeFileSync(file, content.replaceAll(old, replacement));
2701
- console.log(`[bridge-migrate] patched stale bridge_sender in ${path.basename(file)}`);
2702
- } catch {}
2703
- }
2704
1725
 
2705
1726
  // reseed stale slugs
2706
1727
  const SLUG_FIXES = [
@@ -2736,109 +1757,6 @@ function runStartupMigrations(cfg) {
2736
1757
  }
2737
1758
  }
2738
1759
 
2739
- // ghost-fix + idle-fix
2740
- for (const p of projects) {
2741
- const acDir = resolveProjectChattr(p.id).dir;
2742
- const regPath = path.join(acDir, "registry.py");
2743
- if (fs.existsSync(regPath)) {
2744
- try {
2745
- let reg = fs.readFileSync(regPath, "utf-8");
2746
- if (!reg.includes("force: bool")) {
2747
- reg = reg.replace(
2748
- /def register\(self, base: str, label: str \| None = None\) -> dict \| None:/,
2749
- "def register(self, base: str, label: str | None = None, force: bool = False) -> dict | None:",
2750
- );
2751
- reg = reg.replace(
2752
- " self._expire_reserved()\n\n # Find next free slot",
2753
- " self._expire_reserved()\n\n" +
2754
- " # quadwork#478 + #502: force-replace\n" +
2755
- " if force:\n" +
2756
- " ghosts = [n for n, i in self._instances.items() if i.base == base]\n" +
2757
- " for name in ghosts:\n" +
2758
- " del self._instances[name]\n" +
2759
- " stale_reserved = [rn for rn in self._reserved\n" +
2760
- " if self._parse_name(rn)[0] == base]\n" +
2761
- " for rn in stale_reserved:\n" +
2762
- " del self._reserved[rn]\n\n" +
2763
- " # Find next free slot",
2764
- );
2765
- fs.writeFileSync(regPath, reg);
2766
- console.log(`[ghost-fix] ${p.id}: patched registry.py with force-replace support`);
2767
- } else if (!reg.includes("stale_reserved")) {
2768
- reg = reg.replace(
2769
- /( +)for name in ghosts:\n\1 del self\._instances\[name\]\n\1 self\._reserved\[name\] = time\.time\(\)/,
2770
- "$1for name in ghosts:\n$1 del self._instances[name]\n" +
2771
- "$1stale_reserved = [rn for rn in self._reserved\n" +
2772
- "$1 if self._parse_name(rn)[0] == base]\n" +
2773
- "$1for rn in stale_reserved:\n" +
2774
- "$1 del self._reserved[rn]",
2775
- );
2776
- fs.writeFileSync(regPath, reg);
2777
- console.log(`[ghost-fix] ${p.id}: upgraded registry.py force-replace to clear _reserved (#502)`);
2778
- }
2779
- } catch (err) {
2780
- console.warn(`[ghost-fix] ${p.id}: failed to patch registry.py: ${err.message}`);
2781
- }
2782
- }
2783
- const appPath = path.join(acDir, "app.py");
2784
- if (fs.existsSync(appPath)) {
2785
- try {
2786
- let app = fs.readFileSync(appPath, "utf-8");
2787
- if (!app.includes("force = bool(body.get(\"force\"")) {
2788
- app = app.replace(
2789
- " result = registry.register(base, label)\n",
2790
- " force = bool(body.get(\"force\", False))\n result = registry.register(base, label, force=force)\n",
2791
- );
2792
- fs.writeFileSync(appPath, app);
2793
- console.log(`[ghost-fix] ${p.id}: patched app.py with force-replace support`);
2794
- }
2795
- } catch (err) {
2796
- console.warn(`[ghost-fix] ${p.id}: failed to patch app.py: ${err.message}`);
2797
- }
2798
- }
2799
- if (fs.existsSync(appPath)) {
2800
- try {
2801
- const app = fs.readFileSync(appPath, "utf-8");
2802
- if (app.includes("_CRASH_TIMEOUT = 15")) {
2803
- patchCrashTimeout(acDir);
2804
- console.log(`[idle-fix] ${p.id}: crash timeout patched on disk`);
2805
- acRestartNeeded.push(p.id);
2806
- }
2807
- } catch (err) {
2808
- console.warn(`[idle-fix] ${p.id}: failed to patch app.py crash timeout: ${err.message}`);
2809
- }
2810
- }
2811
- }
2812
-
2813
- // CLI-based agent sections
2814
- for (const p of projects) {
2815
- const acPath = projectAgentchattrConfigPath(p.id);
2816
- if (!fs.existsSync(acPath)) continue;
2817
- try {
2818
- let toml = fs.readFileSync(acPath, "utf-8");
2819
- const cliSections = new Set();
2820
- for (const [, agentCfg] of Object.entries(p.agents || {})) {
2821
- const cmd = agentCfg.command || "claude";
2822
- const cli = cmd.split("/").pop().split(" ")[0];
2823
- cliSections.add(cli);
2824
- }
2825
- let changed = false;
2826
- for (const cli of cliSections) {
2827
- if (!new RegExp(`^\\[agents\\.${cli}\\]`, "m").test(toml)) {
2828
- const injectMode = cli === "codex" ? "proxy_flag" : cli === "gemini" ? "env" : "flag";
2829
- toml += `\n[agents.${cli}]\ncommand = "${cli}"\nlabel = "${cli}"\nmcp_inject = "${injectMode}"\n`;
2830
- changed = true;
2831
- }
2832
- }
2833
- if (changed) {
2834
- fs.writeFileSync(acPath, toml);
2835
- console.log(`[#596] ${p.id}: added CLI-based agent sections to config.toml`);
2836
- }
2837
- } catch (err) {
2838
- console.warn(`[#596] ${p.id}: config.toml migration failed: ${err.message}`);
2839
- }
2840
- }
2841
-
2842
1760
  // #690: seed DESIGN-GUIDE.md into existing agent worktrees
2843
1761
  const designGuideSrc = path.join(__dirname, "..", "templates", "seeds", "DESIGN-GUIDE.md");
2844
1762
  if (fs.existsSync(designGuideSrc)) {
@@ -2859,97 +1777,66 @@ function runStartupMigrations(cfg) {
2859
1777
  }
2860
1778
  }
2861
1779
 
2862
- return acRestartNeeded;
2863
1780
  }
2864
1781
 
2865
1782
  server.listen(PORT, "127.0.0.1", async () => {
2866
1783
  console.log(`QuadWork server listening on http://127.0.0.1:${PORT}`);
2867
1784
  syncTriggersFromConfig();
2868
- // #579: detect AC processes already running (spawned by cmdStart before
2869
- // the server module loaded). Without this, chattrProcesses is empty on
2870
- // boot and the health monitor can't track cmdStart-spawned ACs, while
2871
- // the dashboard's Start button would redundantly kill+respawn them.
2872
1785
  const startupCfg = readConfig();
2873
- for (const p of (startupCfg.projects || [])) {
2874
- const { url: acUrl } = resolveProjectChattr(p.id);
2875
- const acPortMatch = acUrl.match(/:(\d+)/);
2876
- const acPort = acPortMatch ? parseInt(acPortMatch[1], 10) : 8300;
2877
- const alive = await isPortAlive(acPort);
2878
- if (alive && !chattrProcesses.has(p.id)) {
2879
- // AC is already running (e.g. spawned by cmdStart). Record it so
2880
- // the health monitor can track it and the dashboard shows the
2881
- // correct state. process is null because we don't own the child.
2882
- chattrProcesses.set(p.id, { process: null, state: "running", error: null, runningSince: Date.now() });
2883
- console.log(`[startup] ${p.id}: AC already alive on port ${acPort} — tracking`);
1786
+
1787
+ // #719: Migrate AC chat history to JSONL before initializing file-chat.
1788
+ const migrationFailed = new Set(runAcMigration(startupCfg));
1789
+
1790
+ // #722: One-time switchover — set all projects to file-based chat.
1791
+ if (!startupCfg.file_chat_switchover_done) {
1792
+ let switched = false;
1793
+ for (const p of (startupCfg.projects || [])) {
1794
+ if (p.chat_mode !== "file" && !migrationFailed.has(p.id)) {
1795
+ p.chat_mode = "file";
1796
+ switched = true;
1797
+ console.log(`[startup] ${p.id}: switched to file-based chat`);
1798
+ }
2884
1799
  }
1800
+ startupCfg.file_chat_switchover_done = true;
1801
+ writeConfig(startupCfg);
1802
+ if (switched) console.log("[startup] file-chat switchover complete");
2885
1803
  }
2886
- // Sync AgentChattr tokens for all projects on startup and backfill
2887
- // the sender-overflow CSS/JS patch (#402) so already-running AC
2888
- // instances receive the fix without requiring a restart.
2889
- // #448: retry after 5s for projects where AC isn't up yet at boot.
1804
+
1805
+ // Initialize file-chat engine for all projects.
2890
1806
  for (const p of (startupCfg.projects || [])) {
2891
- syncChattrToken(p.id).catch(() => {
2892
- setTimeout(() => syncChattrToken(p.id).catch(() => {}), 5000);
2893
- });
2894
- const { dir: acDir } = resolveProjectChattr(p.id);
2895
- if (acDir) patchAgentchattrCss(acDir);
2896
- }
2897
- const acRestartNeeded = runStartupMigrations(startupCfg);
2898
- startupCfg._acRestartNeeded = acRestartNeeded.length > 0 ? acRestartNeeded : undefined;
2899
- // #629: restart AC for projects where idle-fix patched the on-disk file
2900
- // so the running Python process picks up _CRASH_TIMEOUT = 120.
2901
- // Use port-alive check instead of chattrProcesses — AC may be running
2902
- // from a previous QuadWork instance (tracked with process: null).
2903
- if (startupCfg._acRestartNeeded) {
2904
- for (const projectId of startupCfg._acRestartNeeded) {
2905
- const { url } = resolveProjectChattr(projectId);
2906
- const portMatch = url.match(/:(\d+)/);
2907
- const port = portMatch ? parseInt(portMatch[1], 10) : 8300;
2908
- isPortAlive(port).then((alive) => {
2909
- if (!alive) return;
2910
- console.log(`[idle-fix] ${projectId}: restarting AC (port ${port}) so running process observes _CRASH_TIMEOUT = 120 (#629)`);
2911
- return fetch(`http://127.0.0.1:${PORT}/api/agentchattr/${encodeURIComponent(projectId)}/restart`, {
2912
- method: "POST",
2913
- headers: { "Content-Type": "application/json" },
2914
- body: JSON.stringify({ action: "restart" }),
2915
- });
2916
- }).then((r) => {
2917
- if (r && r.ok) console.log(`[idle-fix] ${projectId}: AC restarted successfully`);
2918
- else if (r) console.warn(`[idle-fix] ${projectId}: AC restart returned ${r.status}`);
2919
- }).catch((err) => {
2920
- console.warn(`[idle-fix] ${projectId}: AC restart failed: ${err.message}`);
2921
- });
1807
+ if (p.chat_mode === "file") {
1808
+ if (migrationFailed.has(p.id)) {
1809
+ console.error(`[startup] ${p.id}: migration failed — skipping file-chat init`);
1810
+ continue;
1811
+ }
1812
+ try {
1813
+ fileChat.initProject(p.id);
1814
+ console.log(`[startup] ${p.id}: file-chat engine initialized`);
1815
+ } catch (err) {
1816
+ console.error(`[startup] FATAL: ${p.id}: ${err.message}`);
1817
+ process.exit(1);
1818
+ }
2922
1819
  }
2923
1820
  }
2924
- // #631 + #632: auto-start Butler if enabled + auto_start
1821
+
1822
+ runStartupMigrations(startupCfg);
1823
+
2925
1824
  if (startupCfg.butler && startupCfg.butler.enabled && startupCfg.butler.auto_start) {
2926
1825
  const result = spawnButlerPty();
2927
1826
  if (result.ok) console.log(`[butler] auto-started (PID: ${result.pid})`);
2928
1827
  else console.warn(`[butler] auto-start failed: ${result.error}`);
2929
1828
  }
2930
- // #416: start the AC health monitor
2931
- startAcHealthMonitor();
1829
+ startWatchdog();
2932
1830
  });
2933
1831
 
2934
- /**
2935
- * Send SIGTERM to every AgentChattr child currently tracked by the
2936
- * server. Exported so bin/quadwork.js (`cmdInit` / `cmdStart`) can
2937
- * call it from its own SIGINT handler — AgentChattr children spawned
2938
- * by the dashboard's /api/agentchattr/{id}/start endpoint live in
2939
- * this process's in-memory `chattrProcesses` Map and are otherwise
2940
- * invisible to the CLI. Without this, a Ctrl+C in the foreground
2941
- * quadwork terminal would exit the Node process and orphan every
2942
- * dashboard-started python run.py. See review on quadwork#213.
2943
- */
2944
- function shutdownChattrProcesses() {
2945
- for (const [, proc] of chattrProcesses) {
2946
- if (proc && proc.process) {
2947
- try { proc.process.kill("SIGTERM"); } catch {}
1832
+ function shutdown() {
1833
+ stopButlerPty();
1834
+ const cfg = readConfig();
1835
+ for (const p of (cfg.projects || [])) {
1836
+ if (p.chat_mode === "file") {
1837
+ try { fileChat.shutdownProject(p.id); } catch {}
2948
1838
  }
2949
1839
  }
2950
- chattrProcesses.clear();
2951
- // #631: stop Butler PTY on shutdown
2952
- stopButlerPty();
2953
1840
  }
2954
1841
 
2955
- module.exports = { shutdownChattrProcesses };
1842
+ module.exports = { shutdown };