quadwork 1.19.3 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. package/README.md +19 -35
  2. package/bin/quadwork.js +48 -1118
  3. package/out/404.html +1 -1
  4. package/out/__next.__PAGE__.txt +3 -3
  5. package/out/__next._full.txt +14 -14
  6. package/out/__next._head.txt +4 -4
  7. package/out/__next._index.txt +8 -8
  8. package/out/__next._tree.txt +2 -2
  9. package/out/_next/static/chunks/{030cjkhts487t.js → 079wdniva~de1.js} +1 -1
  10. package/out/_next/static/chunks/{0n~dq4kpx9xxx.js → 07lhk_q6pmm3r.js} +1 -1
  11. package/out/_next/static/chunks/0_79hkefw1mo2.js +1 -0
  12. package/out/_next/static/chunks/{08tog0xc~.es_.js → 0jllnzexn48._.js} +1 -1
  13. package/out/_next/static/chunks/0oxv9vrvc17to.js +2 -0
  14. package/out/_next/static/chunks/0py7102i226n5.js +1 -0
  15. package/out/_next/static/chunks/{13fv-yi7.v52g.js → 0q4bm04c1jl_3.js} +1 -1
  16. package/out/_next/static/chunks/{0_idxioyl0p7h.js → 0sjhy6oe3mbon.js} +1 -1
  17. package/out/_next/static/chunks/{11khe5i7gu158.js → 0z.9wnba-t6z8.js} +1 -1
  18. package/out/_next/static/chunks/13xk0vgfbrcld.css +2 -0
  19. package/out/_next/static/chunks/163_ddkdca5q4.js +25 -0
  20. package/out/_next/static/chunks/{turbopack-0qm-e3ifrz~2u.js → turbopack-0y2u-q0l2m67w.js} +1 -1
  21. package/out/_not-found/__next._full.txt +13 -13
  22. package/out/_not-found/__next._head.txt +4 -4
  23. package/out/_not-found/__next._index.txt +8 -8
  24. package/out/_not-found/__next._not-found.__PAGE__.txt +2 -2
  25. package/out/_not-found/__next._not-found.txt +3 -3
  26. package/out/_not-found/__next._tree.txt +2 -2
  27. package/out/_not-found.html +1 -1
  28. package/out/_not-found.txt +13 -13
  29. package/out/app-shell/__next._full.txt +13 -13
  30. package/out/app-shell/__next._head.txt +4 -4
  31. package/out/app-shell/__next._index.txt +8 -8
  32. package/out/app-shell/__next._tree.txt +2 -2
  33. package/out/app-shell/__next.app-shell.__PAGE__.txt +2 -2
  34. package/out/app-shell/__next.app-shell.txt +3 -3
  35. package/out/app-shell.html +1 -1
  36. package/out/app-shell.txt +13 -13
  37. package/out/index.html +1 -1
  38. package/out/index.txt +14 -14
  39. package/out/project/_/__next._full.txt +14 -14
  40. package/out/project/_/__next._head.txt +4 -4
  41. package/out/project/_/__next._index.txt +8 -8
  42. package/out/project/_/__next._tree.txt +2 -2
  43. package/out/project/_/__next.project.$d$id.__PAGE__.txt +3 -3
  44. package/out/project/_/__next.project.$d$id.txt +3 -3
  45. package/out/project/_/__next.project.txt +3 -3
  46. package/out/project/_/queue/__next._full.txt +14 -14
  47. package/out/project/_/queue/__next._head.txt +4 -4
  48. package/out/project/_/queue/__next._index.txt +8 -8
  49. package/out/project/_/queue/__next._tree.txt +2 -2
  50. package/out/project/_/queue/__next.project.$d$id.queue.__PAGE__.txt +3 -3
  51. package/out/project/_/queue/__next.project.$d$id.queue.txt +3 -3
  52. package/out/project/_/queue/__next.project.$d$id.txt +3 -3
  53. package/out/project/_/queue/__next.project.txt +3 -3
  54. package/out/project/_/queue.html +1 -1
  55. package/out/project/_/queue.txt +14 -14
  56. package/out/project/_.html +1 -1
  57. package/out/project/_.txt +14 -14
  58. package/out/settings/__next._full.txt +14 -14
  59. package/out/settings/__next._head.txt +4 -4
  60. package/out/settings/__next._index.txt +8 -8
  61. package/out/settings/__next._tree.txt +2 -2
  62. package/out/settings/__next.settings.__PAGE__.txt +3 -3
  63. package/out/settings/__next.settings.txt +3 -3
  64. package/out/settings.html +1 -1
  65. package/out/settings.txt +14 -14
  66. package/out/setup/__next._full.txt +14 -14
  67. package/out/setup/__next._head.txt +4 -4
  68. package/out/setup/__next._index.txt +8 -8
  69. package/out/setup/__next._tree.txt +2 -2
  70. package/out/setup/__next.setup.__PAGE__.txt +3 -3
  71. package/out/setup/__next.setup.txt +3 -3
  72. package/out/setup.html +1 -1
  73. package/out/setup.txt +14 -14
  74. package/package.json +4 -2
  75. package/server/ac-restore.js +128 -0
  76. package/server/bridges/discord.js +244 -0
  77. package/server/bridges/telegram.js +258 -0
  78. package/server/config.js +4 -60
  79. package/server/file-chat.js +318 -0
  80. package/server/index.js +129 -1294
  81. package/server/install-agentchattr.js +3 -284
  82. package/server/mcp-chat-shim.js +171 -0
  83. package/server/migrate-ac.js +158 -0
  84. package/server/pty-dispatcher.js +188 -0
  85. package/server/routes.js +155 -1398
  86. package/templates/CLAUDE.md +2 -2
  87. package/templates/OVERNIGHT-QUEUE.md +1 -1
  88. package/templates/seeds/butler.CLAUDE.md +30 -62
  89. package/templates/seeds/dev.AGENTS.md +10 -1
  90. package/templates/seeds/head.AGENTS.md +12 -8
  91. package/templates/seeds/re1.AGENTS.md +3 -3
  92. package/templates/seeds/re2.AGENTS.md +3 -3
  93. package/bridges/discord/__pycache__/discord_bridge.cpython-314.pyc +0 -0
  94. package/bridges/discord/discord_bridge.py +0 -666
  95. package/bridges/discord/requirements.txt +0 -2
  96. package/out/_next/static/chunks/08kw.2kplxa.6.css +0 -2
  97. package/out/_next/static/chunks/0_nm7se0m3twm.js +0 -25
  98. package/out/_next/static/chunks/0uz5svjlo9dwl.js +0 -1
  99. package/out/_next/static/chunks/0zahstmgdrpy5.js +0 -1
  100. package/out/_next/static/chunks/0zfotsowwll1x.js +0 -2
  101. package/server/__tests__/bridge-auto-stop-guard.test.js +0 -134
  102. package/server/__tests__/rate-limit-handling.test.js +0 -168
  103. package/server/__tests__/scrub-secrets.test.js +0 -235
  104. package/server/__tests__/v1110-security-qa.test.js +0 -312
  105. package/server/agentchattr-registry.js +0 -188
  106. package/server/install-agentchattr.patchCrashTimeout.test.js +0 -71
  107. package/server/queue-watcher.js +0 -171
  108. package/server/queue-watcher.test.js +0 -64
  109. package/server/routes.batchProgress.test.js +0 -94
  110. package/server/routes.chatWsSend.test.js +0 -161
  111. package/server/routes.discordBridge.test.js +0 -80
  112. package/server/routes.parseActiveBatch.test.js +0 -88
  113. package/server/routes.telegramBridge.test.js +0 -241
  114. package/templates/config.toml +0 -72
  115. package/templates/wrapper.py +0 -70
  116. /package/out/_next/static/{D66Um4H226QD5y4w5xTKq → MmPC1Rj12BOy4-HvMJjEX}/_buildManifest.js +0 -0
  117. /package/out/_next/static/{D66Um4H226QD5y4w5xTKq → MmPC1Rj12BOy4-HvMJjEX}/_clientMiddlewareManifest.js +0 -0
  118. /package/out/_next/static/{D66Um4H226QD5y4w5xTKq → MmPC1Rj12BOy4-HvMJjEX}/_ssgManifest.js +0 -0
package/server/index.js CHANGED
@@ -6,21 +6,23 @@ const os = require("os");
6
6
  const { WebSocketServer, WebSocket } = require("ws");
7
7
  const pty = require("node-pty");
8
8
  const { spawn } = require("child_process");
9
- const { readConfig, resolveAgentCwd, resolveAgentCommand, resolveProjectChattr, resolveChattrSpawn, syncChattrToken, CONFIG_PATH, ensureSecureDir, writeSecureFile, writeConfig } = require("./config");
9
+ const { readConfig, resolveAgentCwd, resolveAgentCommand, CONFIG_PATH, ensureSecureDir, writeSecureFile, writeConfig } = require("./config");
10
10
  const routes = require("./routes");
11
- const {
12
- patchAgentchattrConfigForDiscordBridge,
13
- patchAgentchattrConfigForTelegramBridge,
14
- projectAgentchattrConfigPath,
15
- } = routes;
16
- const { waitForAgentChattrReady, registerAgent, registerAgentWithRetry, deregisterAgent, startHeartbeat, stopHeartbeat } = require("./agentchattr-registry");
17
- const { patchAgentchattrCss, patchCrashTimeout } = require("./install-agentchattr");
18
- const { startQueueWatcher, stopQueueWatcher } = require("./queue-watcher");
11
+ const fileChat = require("./file-chat");
12
+ const { dispatchToAgentPTY, cleanupSession: cleanupPtyDispatcher } = require("./pty-dispatcher");
13
+ const { runAcMigration } = require("./migrate-ac");
19
14
 
20
15
  const net = require("net");
21
16
  const config = readConfig();
22
17
  const PORT = config.port || 8400;
23
18
 
19
+ function emitSystemMessage(projectId, text) {
20
+ try {
21
+ if (routes.getProjectChatMode(projectId) !== "file") return;
22
+ fileChat.appendMessage(projectId, { sender: "system", type: "system", text });
23
+ } catch {}
24
+ }
25
+
24
26
  const app = express();
25
27
  // #412 / quadwork#279: bump the global JSON body limit to 10mb so
26
28
  // POST /api/project-history can accept full chat exports. The
@@ -33,6 +35,14 @@ app.use(express.json({ limit: "10mb" }));
33
35
  // --- Mount migrated API routes (from Next.js) ---
34
36
  app.use(routes);
35
37
 
38
+ // #730: wire PTY injection dispatcher into the chat route
39
+ routes.setPtyDispatchCallback((projectId, msg) => {
40
+ dispatchToAgentPTY(projectId, msg, agentSessions, {
41
+ isLoopGuardPaused: fileChat.isLoopGuardPaused,
42
+ safeWrite,
43
+ });
44
+ });
45
+
36
46
  const server = http.createServer(app);
37
47
 
38
48
  // --- REST endpoints ---
@@ -163,9 +173,6 @@ app.get("/api/caffeinate/status", (_req, res) => {
163
173
  // PTY (term) is the source of truth for "running". WS is optional (attaches to view terminal).
164
174
  const agentSessions = new Map();
165
175
 
166
- // AgentChattr server processes — per-project (key = projectId)
167
- const chattrProcesses = new Map();
168
-
169
176
  // #631: Butler session — single global PTY (not per-project, no AC integration)
170
177
  let butlerSession = { term: null, viewers: new Set(), viewerDims: new Map(), lastDims: null, state: "stopped", error: null, scrollback: Buffer.alloc(0) };
171
178
 
@@ -317,6 +324,27 @@ function writeMcpConfigFile(projectId, agentId, mcpHttpPort, token) {
317
324
  return filePath;
318
325
  }
319
326
 
327
+ function writeFileChatMcpConfig(projectId, agentId, serverPort) {
328
+ const os = require("os");
329
+ const crypto = require("crypto");
330
+ const configDir = path.join(os.homedir(), ".quadwork", projectId);
331
+ ensureSecureDir(configDir);
332
+ const filePath = path.join(configDir, `mcp-${agentId}.json`);
333
+ const shimPath = path.join(__dirname, "mcp-chat-shim.js");
334
+ const token = crypto.randomBytes(16).toString("hex");
335
+ fileChat.registerShimToken(projectId, agentId, token);
336
+ const config = {
337
+ mcpServers: {
338
+ chat: {
339
+ command: "node",
340
+ args: [shimPath, "--project", projectId, "--agent", agentId, "--port", String(serverPort), "--token", token],
341
+ },
342
+ },
343
+ };
344
+ writeSecureFile(filePath, JSON.stringify(config, null, 2));
345
+ return { filePath, token };
346
+ }
347
+
320
348
  /**
321
349
  * Build extra launch args for an agent (permission flags + MCP injection).
322
350
  * Async because Codex proxy_flag mode needs to await proxy startup.
@@ -324,16 +352,12 @@ function writeMcpConfigFile(projectId, agentId, mcpHttpPort, token) {
324
352
  async function buildAgentArgs(projectId, agentId) {
325
353
  const cfg = readConfig();
326
354
  const project = cfg.projects?.find((p) => p.id === projectId);
327
- if (!project) return { args: [], acRegistrationName: null, acServerPort: null, acRegistrationToken: null, acInjectMode: null, acMcpHttpPort: null };
355
+ if (!project) return { args: [] };
328
356
 
329
357
  const agentCfg = project.agents?.[agentId] || {};
330
358
  const command = agentCfg.command || "claude";
331
- const cliBase = command.split("/").pop().split(" ")[0]; // extract base CLI name
359
+ const cliBase = command.split("/").pop().split(" ")[0];
332
360
  const args = [];
333
- let acRegistrationName = null;
334
- let acServerPort = null;
335
- let acRegistrationToken = null;
336
- let acInjectMode = null;
337
361
 
338
362
  // Permission bypass flags
339
363
  if (agentCfg.auto_approve !== false) {
@@ -367,93 +391,22 @@ async function buildAgentArgs(projectId, agentId) {
367
391
  }
368
392
  }
369
393
 
370
- // MCP config injection
371
- const mcpHttpPort = project.mcp_http_port;
372
- const token = project.agentchattr_token;
373
- if (mcpHttpPort) {
374
- const injectMode = agentCfg.mcp_inject || (cliBase === "codex" ? "proxy_flag" : cliBase === "gemini" ? "env" : "flag");
375
- acInjectMode = injectMode;
376
- if (injectMode === "flag") {
377
- // Claude/Kimi: register with AgentChattr to obtain a per-agent
378
- // token (#239 — session_token is browser auth, not MCP auth) and
379
- // write that into the per-agent MCP config file.
380
- const chattrInfo = resolveProjectChattr(projectId);
381
- acServerPort = Number(new URL(chattrInfo.url).port) || 8300;
382
- // #565: extend timeout to 30s — first setup may need AC to install
383
- // (git clone + venv + pip install) before it can bind a port.
384
- const acReady = await waitForAgentChattrReady(acServerPort, 30000);
385
- if (!acReady) {
386
- console.warn(`[#565] Agent ${agentId}: AC not reachable on port ${acServerPort} after 30s. Spawning without chat integration.`);
387
- // #565: preserve acServerPort and acInjectMode so deferred
388
- // recovery in spawnAgentPty can retry registration later.
389
- return { args, acRegistrationName: null, acServerPort, acRegistrationToken: null, acInjectMode: injectMode, acMcpHttpPort: mcpHttpPort || null };
390
- }
391
- // #242: best-effort deregister any stale registration of the
392
- // canonical name (left over by a crashed previous QuadWork
393
- // session) so the fresh register lands at slot 1 instead of
394
- // head-2 / re2-2. We need the previous agent's bearer
395
- // token because app.py:2123 requires authenticated agent
396
- // session for family names — load it from disk (persisted
397
- // across restarts). Failures are non-fatal.
398
- const stalePersistedToken = readPersistedAgentToken(projectId, agentId);
399
- if (stalePersistedToken) {
400
- await deregisterAgent(acServerPort, agentId, stalePersistedToken).catch(() => {});
401
- clearPersistedAgentToken(projectId, agentId);
402
- }
403
- // #478: force-replace so AC expires any ghost slots for this base
404
- // #565: retry with backoff and degrade gracefully if AC is not ready
405
- const registration = await registerAgentWithRetry(acServerPort, agentId, agentCfg.display_name || null, { force: true });
406
- if (!registration) {
407
- console.warn(`[#565] Agent ${agentId}: AC registration failed after retries (${registerAgent.lastError}). Spawning without chat integration.`);
408
- } else {
409
- acRegistrationName = registration.name;
410
- acRegistrationToken = registration.token;
411
- writePersistedAgentToken(projectId, agentId, registration.token);
412
- const mcpConfigPath = writeMcpConfigFile(projectId, agentId, mcpHttpPort, registration.token);
413
- const flag = agentCfg.mcp_flag || "--mcp-config";
414
- args.push(flag, mcpConfigPath);
415
- }
416
- } else if (injectMode === "proxy_flag") {
417
- // Codex: register with AgentChattr first (#240) so the proxy
418
- // injects a real per-agent token, not the global session token.
419
- // Resolve via resolveProjectChattr so legacy/global-config
420
- // projects without a per-project agentchattr_url still work.
421
- const chattrInfo = resolveProjectChattr(projectId);
422
- acServerPort = Number(new URL(chattrInfo.url).port) || 8300;
423
- // #565: extend timeout to 30s for first-setup scenario
424
- const acReady = await waitForAgentChattrReady(acServerPort, 30000);
425
- if (!acReady) {
426
- console.warn(`[#565] Agent ${agentId}: AC not reachable on port ${acServerPort} after 30s. Spawning without chat integration.`);
427
- // #565: preserve acServerPort and acInjectMode so deferred
428
- // recovery in spawnAgentPty can retry registration later.
429
- return { args, acRegistrationName: null, acServerPort, acRegistrationToken: null, acInjectMode: injectMode, acMcpHttpPort: mcpHttpPort || null };
430
- }
431
- // #242: best-effort deregister stale canonical name first using
432
- // the persisted bearer token from a previous session.
433
- const stalePersistedToken = readPersistedAgentToken(projectId, agentId);
434
- if (stalePersistedToken) {
435
- await deregisterAgent(acServerPort, agentId, stalePersistedToken).catch(() => {});
436
- clearPersistedAgentToken(projectId, agentId);
437
- }
438
- // #478: force-replace so AC expires any ghost slots for this base
439
- // #565: retry with backoff and degrade gracefully if AC is not ready
440
- const registration = await registerAgentWithRetry(acServerPort, agentId, agentCfg.display_name || null, { force: true });
441
- if (!registration) {
442
- console.warn(`[#565] Agent ${agentId}: AC registration failed after retries (${registerAgent.lastError}). Spawning without chat integration.`);
443
- } else {
444
- acRegistrationName = registration.name;
445
- acRegistrationToken = registration.token;
446
- writePersistedAgentToken(projectId, agentId, registration.token);
447
- const upstreamUrl = `http://127.0.0.1:${mcpHttpPort}`;
448
- const proxyUrl = await startMcpProxy(projectId, agentId, upstreamUrl, registration.token);
449
- if (proxyUrl) {
450
- args.push("-c", `mcp_servers.agentchattr.url="${proxyUrl}"`);
451
- }
452
- }
453
- }
394
+ // MCP config injection — file-chat shim
395
+ const injectMode = agentCfg.mcp_inject || (cliBase === "codex" ? "proxy_flag" : cliBase === "gemini" ? "env" : "flag");
396
+ if (injectMode === "flag") {
397
+ const { filePath: mcpConfigPath } = writeFileChatMcpConfig(projectId, agentId, PORT);
398
+ const mcpFlag = agentCfg.mcp_flag || "--mcp-config";
399
+ args.push(mcpFlag, mcpConfigPath);
400
+ } else if (injectMode === "proxy_flag") {
401
+ const { token: shimToken } = writeFileChatMcpConfig(projectId, agentId, PORT);
402
+ const shimPath = path.join(__dirname, "mcp-chat-shim.js");
403
+ args.push(
404
+ "-c", `mcp_servers.chat.command="node"`,
405
+ "-c", `mcp_servers.chat.args=["${shimPath}","--project","${projectId}","--agent","${agentId}","--port","${PORT}","--token","${shimToken}"]`,
406
+ );
454
407
  }
455
-
456
- return { args, acRegistrationName, acServerPort, acRegistrationToken, acInjectMode, acMcpHttpPort: mcpHttpPort || null };
408
+ // env mode (Gemini) handled in buildAgentEnv
409
+ return { args };
457
410
  }
458
411
 
459
412
  /**
@@ -470,18 +423,19 @@ function buildAgentEnv(projectId, agentId) {
470
423
  const env = {};
471
424
 
472
425
  // Gemini: inject MCP via env var
473
- if (cliBase === "gemini" && project.mcp_http_port) {
426
+ if (cliBase === "gemini") {
474
427
  const os = require("os");
475
428
  const configDir = path.join(os.homedir(), ".quadwork", projectId);
476
429
  ensureSecureDir(configDir);
477
430
  const settingsPath = path.join(configDir, `mcp-${agentId}-settings.json`);
478
- const url = `http://127.0.0.1:${project.mcp_http_port}/mcp`;
431
+
432
+ const { token: shimToken } = writeFileChatMcpConfig(projectId, agentId, PORT);
433
+ const shimPath = path.join(__dirname, "mcp-chat-shim.js");
479
434
  const settings = {
480
435
  mcpServers: {
481
- agentchattr: {
482
- type: "http",
483
- url,
484
- ...(project.agentchattr_token ? { headers: { Authorization: `Bearer ${project.agentchattr_token}` } } : {}),
436
+ chat: {
437
+ command: "node",
438
+ args: [shimPath, "--project", projectId, "--agent", agentId, "--port", String(PORT), "--token", shimToken],
485
439
  },
486
440
  },
487
441
  };
@@ -492,76 +446,8 @@ function buildAgentEnv(projectId, agentId) {
492
446
  return env;
493
447
  }
494
448
 
495
- /**
496
- * #394 / quadwork#253: recover from a heartbeat 409 (AgentChattr was
497
- * restarted, in-memory registry wiped, our token is now stale). Mirrors
498
- * wrapper.py:732-741. Re-registers the running agent, swaps the
499
- * tracked name/token on the live session so the heartbeat interval
500
- * picks up the new credentials on its next tick, refreshes whichever
501
- * MCP transport this agent uses (Claude config file vs Codex proxy),
502
- * and restarts the queue watcher in case the assigned name changed
503
- * (multi-instance slot bump).
504
- *
505
- * Best-effort: any failure here just means the next 5s heartbeat will
506
- * fail again and we'll re-enter recovery — no tight retry loop because
507
- * startHeartbeat guards re-entry with `recovering`.
508
- */
509
- async function recoverFrom409(projectId, agentId, session) {
510
- if (!session.acServerPort) return;
511
- const cfg = readConfig();
512
- const project = cfg.projects?.find((p) => p.id === projectId);
513
- const agentCfg = project?.agents?.[agentId] || {};
514
- // AC may need a moment to come back up after a restart — wait briefly.
515
- await waitForAgentChattrReady(session.acServerPort, 10000);
516
-
517
- // Best-effort cleanup of the stale registration on disk so the
518
- // fresh register isn't shoved into a slot 2 by leftover state.
519
- const stale = readPersistedAgentToken(projectId, agentId);
520
- if (stale) {
521
- await deregisterAgent(session.acServerPort, agentId, stale).catch(() => {});
522
- clearPersistedAgentToken(projectId, agentId);
523
- }
524
-
525
- // #478: force-replace so AC expires any ghost slots for this base
526
- const replacement = await registerAgent(session.acServerPort, agentId, agentCfg.display_name || null, { force: true });
527
- if (!replacement) return;
528
-
529
- const previousName = session.acRegistrationName;
530
- session.acRegistrationName = replacement.name;
531
- session.acRegistrationToken = replacement.token;
532
- writePersistedAgentToken(projectId, agentId, replacement.token);
533
-
534
- // Refresh whichever MCP transport this agent uses so subsequent
535
- // tool calls (and the queue-watcher's `mcp read` injections) hit
536
- // AC with the new bearer token instead of the now-rejected one.
537
- if (session.acInjectMode === "flag" && session.acMcpHttpPort) {
538
- try { writeMcpConfigFile(projectId, agentId, session.acMcpHttpPort, replacement.token); } catch {}
539
- } else if (session.acInjectMode === "proxy_flag") {
540
- // Codex is pinned to the original ephemeral proxy URL, so we
541
- // can't tear the listener down — mutate the token in place.
542
- try { updateMcpProxyToken(projectId, agentId, replacement.token); } catch {}
543
- }
544
-
545
- // If the assigned name changed (e.g. multi-instance slot collision)
546
- // the queue watcher is now polling the wrong file. Restart it
547
- // against the new name so chat reaches the right agent.
548
- if (replacement.name !== previousName && session.term) {
549
- if (session.queueWatcherHandle) {
550
- stopQueueWatcher(session.queueWatcherHandle);
551
- session.queueWatcherHandle = null;
552
- }
553
- try {
554
- const { dir: acDir } = resolveProjectChattr(projectId);
555
- if (acDir) {
556
- const dataDir = path.join(acDir, "data");
557
- session.queueWatcherHandle = startQueueWatcher(dataDir, replacement.name, session.term);
558
- }
559
- } catch {}
560
- }
561
- }
562
-
563
449
  // Helper: spawn a PTY for a project/agent and register in agentSessions
564
- async function spawnAgentPty(project, agent) {
450
+ async function spawnAgentPty(project, agent, opts = {}) {
565
451
  const key = `${project}/${agent}`;
566
452
 
567
453
  const cwd = resolveAgentCwd(project, agent);
@@ -593,13 +479,6 @@ async function spawnAgentPty(project, agent) {
593
479
  lastDims: null,
594
480
  state: "running",
595
481
  error: null,
596
- acRegistrationName: built.acRegistrationName,
597
- acServerPort: built.acServerPort,
598
- acRegistrationToken: built.acRegistrationToken,
599
- acInjectMode: built.acInjectMode,
600
- acMcpHttpPort: built.acMcpHttpPort,
601
- acHeartbeatHandle: null,
602
- queueWatcherHandle: null,
603
482
  lastOutputAt: Date.now(),
604
483
  // #418: ring buffer of recent PTY output so reconnecting WS
605
484
  // clients see the terminal state instead of a blank panel.
@@ -608,6 +487,10 @@ async function spawnAgentPty(project, agent) {
608
487
  };
609
488
  agentSessions.set(key, session);
610
489
 
490
+ if (!opts.suppressLifecycleMsg) {
491
+ emitSystemMessage(project, `${agent} joined`);
492
+ }
493
+
611
494
  // #418: capture PTY output into the scrollback ring buffer (64KB).
612
495
  // This runs independently of WS — even when no client is connected,
613
496
  // the buffer accumulates so the next connect gets replay.
@@ -621,72 +504,10 @@ async function spawnAgentPty(project, agent) {
621
504
  }
622
505
  });
623
506
 
624
- // #391 / quadwork#250: keep this agent alive in AgentChattr by
625
- // POSTing /api/heartbeat/{name} every 5s. Without it, AC's 60s
626
- // crash-detection window deregisters the agent and chat messages
627
- // never reach it. Mirrors wrapper.py:_heartbeat (lines 715-748).
628
- if (session.acRegistrationName && session.acServerPort && session.acRegistrationToken) {
629
- // #394 / quadwork#253: pass getters (not raw values) so the 409
630
- // recovery path below can swap acRegistrationName/Token in place
631
- // and the very next heartbeat tick uses the replacement
632
- // credentials without us having to tear down + restart the
633
- // interval.
634
- session.acHeartbeatHandle = startHeartbeat(
635
- session.acServerPort,
636
- () => session.acRegistrationName,
637
- () => session.acRegistrationToken,
638
- { onConflict: () => recoverFrom409(project, agent, session) },
639
- );
640
- }
641
-
642
- // #393 / quadwork#251: queue watcher — the actual mechanism by
643
- // which agents pick up chat. Without this an agent can be
644
- // registered + heartbeating yet still never respond, because
645
- // AgentChattr only writes to {data_dir}/{name}_queue.jsonl and
646
- // expects the agent side to poll + inject `mcp read`.
647
- if (session.acRegistrationName && session.term) {
648
- try {
649
- const { dir: acDir } = resolveProjectChattr(project);
650
- if (acDir) {
651
- const dataDir = path.join(acDir, "data");
652
- session.queueWatcherHandle = startQueueWatcher(
653
- dataDir,
654
- session.acRegistrationName,
655
- session.term,
656
- );
657
- }
658
- } catch {
659
- // best-effort — failure here just means no chat injection
660
- }
661
- }
662
-
663
- // #565: deferred restart — if the agent spawned without AC
664
- // registration (AC wasn't ready or registration failed), wait for
665
- // AC to come up then stop + respawn the agent so it gets the full
666
- // MCP CLI args (--mcp-config / -c mcp_servers...url) that can only
667
- // be set at process launch time.
668
- if (!session.acRegistrationName && session.acServerPort && session.acInjectMode) {
669
- const deferredRestart = async () => {
670
- const ready = await waitForAgentChattrReady(session.acServerPort, 60000);
671
- if (!ready) {
672
- // #572: log timeout so operators know the health monitor will
673
- // handle recovery when AC eventually comes up.
674
- console.log(`[#565] Agent ${agent}: AC not reachable after 60s — health monitor will restart agent when AC recovers.`);
675
- return;
676
- }
677
- // Guard: agent may have been stopped manually while we waited.
678
- const current = agentSessions.get(key);
679
- if (!current || !current.term || current.state !== "running") return;
680
- console.log(`[#565] Agent ${agent}: AC is now reachable — restarting agent to gain chat integration.`);
681
- await stopAgentSession(key);
682
- await spawnAgentPty(project, agent);
683
- };
684
- deferredRestart().catch(() => {});
685
- }
686
-
687
507
  term.onExit(({ exitCode }) => {
688
508
  const current = agentSessions.get(key);
689
509
  if (current && current.term === term) {
510
+ cleanupPtyDispatcher(key);
690
511
  current.state = "stopped";
691
512
  current.error = exitCode ? `exit:${exitCode}` : null;
692
513
  current.term = null;
@@ -694,27 +515,6 @@ async function spawnAgentPty(project, agent) {
694
515
  if (v.readyState <= 1) v.close(1000, `exited:${exitCode}`);
695
516
  }
696
517
  current.viewers.clear();
697
- // #391 / quadwork#250: a crashed PTY must also clear its
698
- // heartbeat interval (otherwise it leaks and a later /start
699
- // double-registers) and free the AgentChattr slot (otherwise
700
- // the agent stays falsely `active` forever and the next
701
- // register lands at slot 2). Deregister is best-effort.
702
- if (current.acHeartbeatHandle) {
703
- stopHeartbeat(current.acHeartbeatHandle);
704
- current.acHeartbeatHandle = null;
705
- }
706
- if (current.queueWatcherHandle) {
707
- stopQueueWatcher(current.queueWatcherHandle);
708
- current.queueWatcherHandle = null;
709
- }
710
- if (current.acRegistrationName && current.acServerPort) {
711
- deregisterAgent(current.acServerPort, current.acRegistrationName).catch(() => {});
712
- if (current.projectId && current.agentId) {
713
- try { clearPersistedAgentToken(current.projectId, current.agentId); } catch {}
714
- }
715
- current.acRegistrationName = null;
716
- current.acRegistrationToken = null;
717
- }
718
518
  }
719
519
  });
720
520
 
@@ -725,16 +525,16 @@ async function spawnAgentPty(project, agent) {
725
525
  }
726
526
  }
727
527
 
728
- // Helper: stop an agent session — kill PTY, close WS, deregister.
729
- // Async because deregister must complete before a restart re-registers,
730
- // otherwise the old slot stays occupied and a fresh register lands at
731
- // head-2 instead of slot 1 (#241).
732
528
  async function stopAgentSession(key) {
733
529
  const session = agentSessions.get(key);
734
530
  if (!session) {
735
531
  agentSessions.set(key, { projectId: null, agentId: null, term: null, viewers: new Set(), viewerDims: new Map(), lastDims: null, state: "stopped", error: null });
736
532
  return;
737
533
  }
534
+ if (session.projectId && session.agentId && !session._suppressLifecycleMsg) {
535
+ emitSystemMessage(session.projectId, `${session.agentId} left`);
536
+ }
537
+ cleanupPtyDispatcher(key);
738
538
  if (session.term) {
739
539
  try { session.term.kill(); } catch {}
740
540
  session.term = null;
@@ -745,33 +545,6 @@ async function stopAgentSession(key) {
745
545
  session.viewers.clear();
746
546
  session.state = "stopped";
747
547
  session.error = null;
748
- // Stop heartbeat before deregister so we don't race a final POST
749
- // against AgentChattr removing the name (#391 / quadwork#250).
750
- if (session.acHeartbeatHandle) {
751
- stopHeartbeat(session.acHeartbeatHandle);
752
- session.acHeartbeatHandle = null;
753
- }
754
- // Stop queue watcher (#393 / quadwork#251) — the PTY is gone,
755
- // injecting into a dead term would throw on the next tick.
756
- if (session.queueWatcherHandle) {
757
- stopQueueWatcher(session.queueWatcherHandle);
758
- session.queueWatcherHandle = null;
759
- }
760
- // Best-effort deregister from AgentChattr (#241) so the slot frees
761
- // and the next register lands at slot 1 instead of head-2.
762
- if (session.acRegistrationName && session.acServerPort) {
763
- try {
764
- await deregisterAgent(session.acServerPort, session.acRegistrationName);
765
- } catch {
766
- // best-effort — failures are non-fatal
767
- }
768
- if (session.projectId && session.agentId) {
769
- clearPersistedAgentToken(session.projectId, session.agentId);
770
- }
771
- session.acRegistrationName = null;
772
- session.acRegistrationToken = null;
773
- }
774
- // Clean up MCP auth proxy if running
775
548
  const [projectId, agentId] = key.split("/");
776
549
  if (projectId && agentId) stopMcpProxy(projectId, agentId);
777
550
  }
@@ -781,487 +554,15 @@ app.get("/api/agents", (_req, res) => {
781
554
  for (const [key, session] of agentSessions) {
782
555
  agents[key] = { state: session.state, error: session.error || null };
783
556
  }
784
- for (const [pid, proc] of chattrProcesses) {
785
- agents[`_agentchattr/${pid}`] = { state: proc.state, error: proc.error };
786
- }
787
557
  res.json(agents);
788
558
  });
789
559
 
790
- // #424 / quadwork#304: best-effort auto-snapshot of chat history
791
- // before any AgentChattr restart. Defense-in-depth against
792
- // destructive ops like /clear that rewrite AC's JSONL log in place
793
- // — per #303 the log itself IS persistent across normal restarts,
794
- // so the snapshot's job is to give the operator a point-in-time
795
- // rollback if the log gets clobbered, not to prevent history loss
796
- // on ordinary lifecycle events.
797
- //
798
- // Snapshot contents = the same envelope GET /api/project-history
799
- // returns, so an operator (or a future "restore" button) can feed
800
- // the file straight into POST /api/project-history for replay.
801
- const HISTORY_SNAPSHOT_LIMIT = 5;
802
-
803
- async function snapshotProjectHistory(projectId) {
804
- try {
805
- const snapDir = path.join(require("os").homedir(), ".quadwork", projectId, "history-snapshots");
806
- ensureSecureDir(snapDir);
807
- const res = await fetch(`http://127.0.0.1:${PORT}/api/project-history?project=${encodeURIComponent(projectId)}`, {
808
- signal: AbortSignal.timeout(30000),
809
- });
810
- if (!res.ok) {
811
- console.warn(`[snapshot] ${projectId} history fetch returned ${res.status}; skipping snapshot`);
812
- return false;
813
- }
814
- const text = await res.text();
815
- const stamp = new Date().toISOString().replace(/[:.]/g, "-");
816
- const outPath = path.join(snapDir, `${stamp}.json`);
817
- fs.writeFileSync(outPath, text);
818
- console.log(`[snapshot] ${projectId} → ${outPath}`);
819
- // Prune to the newest HISTORY_SNAPSHOT_LIMIT files so the
820
- // directory can't grow unbounded across weeks of restarts.
821
- try {
822
- const entries = fs.readdirSync(snapDir)
823
- .filter((f) => f.endsWith(".json"))
824
- .map((f) => ({ f, t: fs.statSync(path.join(snapDir, f)).mtimeMs }))
825
- .sort((a, b) => b.t - a.t);
826
- for (const old of entries.slice(HISTORY_SNAPSHOT_LIMIT)) {
827
- try { fs.unlinkSync(path.join(snapDir, old.f)); } catch {}
828
- }
829
- } catch {
830
- // non-fatal — stale snapshots just linger
831
- }
832
- return true;
833
- } catch (err) {
834
- console.warn(`[snapshot] ${projectId} snapshot failed: ${err.message || err}`);
835
- return false;
836
- }
837
- }
838
-
839
- // Per-project AgentChattr lifecycle: /api/agentchattr/:project/:action
840
- // Backward compat: /api/agentchattr/:action uses first project
841
- async function handleAgentChattr(req, res) {
842
- let projectId, action;
843
- if (req.params.action) {
844
- projectId = req.params.projectOrAction;
845
- action = req.params.action;
846
- } else {
847
- // Backward compat: single-param = action, use first project
848
- action = req.params.projectOrAction;
849
- const cfg = readConfig();
850
- projectId = cfg.projects?.[0]?.id || "_default";
851
- }
852
-
853
- const { url: chattrUrl } = resolveProjectChattr(projectId);
854
- const chattrPort = new URL(chattrUrl).port || "8300";
855
-
856
- // Find per-project config.toml. Phase 2E / #181: prefer the
857
- // per-project AgentChattr clone ROOT (where the web/CLI wizards now
858
- // write it as of #184/#185 — and where run.py actually reads it from).
859
- // Fall back to the legacy <working_dir>/agentchattr/config.toml for
860
- // v1 setups that haven't been migrated yet (#188).
861
- const cfg = readConfig();
862
- const project = cfg.projects?.find((p) => p.id === projectId);
863
- const { dir: resolvedAcDir } = resolveProjectChattr(projectId);
864
- let projectConfigToml = null;
865
- if (resolvedAcDir && fs.existsSync(path.join(resolvedAcDir, "config.toml"))) {
866
- projectConfigToml = path.join(resolvedAcDir, "config.toml");
867
- } else if (project?.working_dir) {
868
- const legacyToml = path.join(project.working_dir, "agentchattr", "config.toml");
869
- if (fs.existsSync(legacyToml)) projectConfigToml = legacyToml;
870
- }
871
-
872
- function getProc() {
873
- return chattrProcesses.get(projectId) || { process: null, state: "stopped", error: null };
874
- }
875
- function setProc(val) {
876
- chattrProcesses.set(projectId, val);
877
- }
878
-
879
- function regenerateConfigToml() {
880
- // If project has a config.toml, update the port to match current config
881
- if (!projectConfigToml || !fs.existsSync(projectConfigToml)) return;
882
- try {
883
- let content = fs.readFileSync(projectConfigToml, "utf-8");
884
- content = content.replace(/^port = \d+/m, `port = ${chattrPort}`);
885
- writeSecureFile(projectConfigToml, content);
886
- } catch {}
887
- }
888
-
889
- async function spawnChattr() {
890
- // Sync config.toml port before starting
891
- regenerateConfigToml();
892
-
893
- // Use project config.toml if available (isolated data dir + ports), otherwise fall back to --port
894
- const extraArgs = (projectConfigToml && fs.existsSync(projectConfigToml))
895
- ? []
896
- : ["--port", chattrPort];
897
-
898
- // Resolve AgentChattr from its cloned directory
899
- const { dir: acDir } = resolveProjectChattr(projectId);
900
- // #394: backfill sender-overflow CSS/JS patch on every spawn so
901
- // existing installs receive the fix without manual update.
902
- patchAgentchattrCss(acDir);
903
- const acSpawn = resolveChattrSpawn(acDir);
904
- if (!acSpawn) {
905
- setProc({ process: null, state: "error", error: `AgentChattr not installed. Clone it: git clone https://github.com/bcurts/agentchattr.git ${acDir}` });
906
- return null;
907
- }
908
-
909
- // #569: redirect AC stdout/stderr to a log file so operators can
910
- // diagnose startup failures. Append mode preserves restart history.
911
- const acLogDir = path.join(os.homedir(), ".quadwork", projectId);
912
- try { fs.mkdirSync(acLogDir, { recursive: true, mode: 0o700 }); } catch {}
913
- const acLogPath = path.join(acLogDir, "agentchattr.log");
914
- const acLogFd = fs.openSync(acLogPath, "a");
915
- const child = spawn(acSpawn.command, [...acSpawn.args, ...extraArgs], {
916
- cwd: acSpawn.cwd,
917
- env: process.env,
918
- stdio: ["ignore", acLogFd, acLogFd],
919
- detached: true,
920
- });
921
-
922
- // Close our copy of the log fd — child inherits its own copy.
923
- fs.closeSync(acLogFd);
924
-
925
- // If pid is undefined, spawn failed
926
- if (!child.pid) {
927
- setProc({ process: null, state: "error", error: "Failed to start AgentChattr — check that Python venv is set up in " + acDir + ". Log: " + acLogPath });
928
- child.on("error", () => {});
929
- return null;
930
- }
931
-
932
- child.unref();
933
- child.on("error", (err) => {
934
- setProc({ process: null, state: "error", error: err.message });
935
- });
936
- child.on("exit", (code) => {
937
- const cur = getProc();
938
- if (cur.process === child) {
939
- setProc({ process: null, state: "stopped", error: code ? `exit:${code}` : null });
940
- }
941
- });
942
- // #580: wait for AC to actually bind the port before declaring success.
943
- // On fast-start installs this resolves in 1-2s; prevents false-down
944
- // detection on slow starts that triggered ghost agent cascades.
945
- const ready = await waitForAgentChattrReady(chattrPort, 30000);
946
- if (ready) {
947
- setProc({ process: child, state: "running", error: null, runningSince: Date.now() });
948
- return child;
949
- } else {
950
- setProc({ process: child, state: "error", error: "AgentChattr did not become ready within 30s" });
951
- return null;
952
- }
953
- }
954
-
955
- // #386: Kill any process listening on the AC port. Handles orphaned
956
- // processes that survive QuadWork restarts (detached + unref'd spawns
957
- // lose their tracked reference when the Node process recycles).
958
- function killProcessOnPort(port, signal = "SIGTERM") {
959
- try {
960
- const pids = execFileSync("lsof", ["-ti", `TCP:${port}`, "-sTCP:LISTEN"], {
961
- encoding: "utf-8",
962
- timeout: 5000,
963
- stdio: ["pipe", "pipe", "pipe"],
964
- }).trim();
965
- if (!pids) return;
966
- for (const line of pids.split("\n")) {
967
- const pid = parseInt(line, 10);
968
- if (pid > 0) {
969
- try { process.kill(pid, signal); } catch {}
970
- }
971
- }
972
- } catch {
973
- // lsof exits non-zero when no matching process — expected
974
- }
975
- }
976
-
977
- // #386: Poll until the port is free or timeout expires.
978
- function waitForPortFree(port, timeoutMs = 3000) {
979
- const start = Date.now();
980
- return new Promise((resolve) => {
981
- function check() {
982
- try {
983
- execFileSync("lsof", ["-ti", `TCP:${port}`, "-sTCP:LISTEN"], {
984
- encoding: "utf-8",
985
- timeout: 2000,
986
- stdio: ["pipe", "pipe", "pipe"],
987
- });
988
- // Still occupied — retry if within budget
989
- if (Date.now() - start < timeoutMs) {
990
- setTimeout(check, 200);
991
- } else {
992
- resolve(false);
993
- }
994
- } catch {
995
- // lsof found nothing — port is free
996
- resolve(true);
997
- }
998
- }
999
- check();
1000
- });
1001
- }
1002
-
1003
- if (action === "start") {
1004
- const proc = getProc();
1005
- if (proc.state === "running" && proc.process) {
1006
- return res.json({ ok: true, state: "running", message: "Already running" });
1007
- }
1008
- // #401: validate AgentChattr is installed BEFORE killing anything on
1009
- // the port. Without this guard, clicking Start when AC is missing
1010
- // kills an unrelated process then fails with "not installed".
1011
- const { dir: acDir } = resolveProjectChattr(projectId);
1012
- const acSpawn = resolveChattrSpawn(acDir);
1013
- if (!acSpawn) {
1014
- const errMsg = `AgentChattr not installed. Clone it: git clone https://github.com/bcurts/agentchattr.git ${acDir}`;
1015
- setProc({ process: null, state: "error", error: errMsg });
1016
- return res.status(500).json({ ok: false, state: "error", error: errMsg });
1017
- }
1018
-
1019
- // #393: kill any orphaned process on the port before spawning
1020
- // (same pattern as restart/stop from #386).
1021
- killProcessOnPort(chattrPort);
1022
- const portFree = await waitForPortFree(chattrPort, 3000);
1023
- if (!portFree) {
1024
- console.warn(`[agentchattr] ${projectId} port ${chattrPort} still occupied after 3s — spawning anyway`);
1025
- }
1026
- try {
1027
- const child = await spawnChattr();
1028
- if (!child) {
1029
- const errProc = getProc();
1030
- return res.status(500).json({ ok: false, state: "error", error: errProc.error || "Failed to start AgentChattr" });
1031
- }
1032
- // Sync token after AgentChattr starts (it generates its own)
1033
- setTimeout(() => syncChattrToken(projectId), 2000);
1034
- res.json({ ok: true, state: "running", pid: child.pid });
1035
- } catch (err) {
1036
- setProc({ process: null, state: "error", error: err.message });
1037
- res.status(500).json({ ok: false, state: "error", error: err.message });
1038
- }
1039
- } else if (action === "stop") {
1040
- const proc = getProc();
1041
- if (proc.process) {
1042
- try { proc.process.kill("SIGTERM"); } catch {}
1043
- }
1044
- // #386: also kill any orphaned process holding the port
1045
- killProcessOnPort(chattrPort);
1046
- setProc({ process: null, state: "stopped", error: null });
1047
- res.json({ ok: true, state: "stopped" });
1048
- } else if (action === "restart") {
1049
- // #424 / quadwork#304: snapshot history before killing the
1050
- // process. Best-effort and non-blocking-on-failure so a flaky
1051
- // snapshot doesn't leave the operator unable to restart AC.
1052
- await snapshotProjectHistory(projectId).catch(() => {});
1053
- // #424 / quadwork#304 Phase 3: latch the opt-in BEFORE the
1054
- // spawn so a restart that itself clears the flag can't starve
1055
- // the auto-restore. We capture the snapshot filename we just
1056
- // wrote + the project's auto_restore_after_restart flag and
1057
- // replay it in the post-spawn tick below if both are set.
1058
- const preRestartCfg = readConfig();
1059
- const preRestartProject = preRestartCfg.projects?.find((p) => p.id === projectId);
1060
- const shouldAutoRestore = !!(preRestartProject && preRestartProject.auto_restore_after_restart);
1061
- const proc = getProc();
1062
- if (proc.process) {
1063
- console.log(`[agentchattr] ${projectId} restart: killing AC (PID: ${proc.process.pid})`);
1064
- try { proc.process.kill("SIGTERM"); } catch {}
1065
- }
1066
- // #386: also kill any orphaned process holding the port (handles
1067
- // detached processes that survived a QuadWork restart).
1068
- killProcessOnPort(chattrPort);
1069
- setProc({ process: null, state: "stopped", error: null });
1070
- // #582: wait up to 5s for the port to be free, then SIGKILL
1071
- // any remaining process as a fallback before spawning.
1072
- let portFree = await waitForPortFree(chattrPort, 5000);
1073
- if (!portFree) {
1074
- console.warn(`[agentchattr] ${projectId} port ${chattrPort} still occupied after 5s — sending SIGKILL`);
1075
- killProcessOnPort(chattrPort, "SIGKILL");
1076
- portFree = await waitForPortFree(chattrPort, 3000);
1077
- if (!portFree) {
1078
- const portErr = `Port ${chattrPort} still occupied — cannot restart`;
1079
- console.error(`[agentchattr] ${projectId} ${portErr}`);
1080
- setProc({ process: null, state: "error", error: portErr });
1081
- return res.status(500).json({ ok: false, state: "error", error: portErr });
1082
- }
1083
- }
1084
- console.log(`[agentchattr] ${projectId} restart: port ${chattrPort} is free, spawning AC`);
1085
- try {
1086
- const child = await spawnChattr();
1087
- if (!child) {
1088
- const errProc = getProc();
1089
- console.error(`[agentchattr] ${projectId} restart: spawnChattr failed — ${errProc.error || "unknown error"}`);
1090
- return res.status(500).json({ ok: false, state: "error", error: errProc.error || "Failed to start AgentChattr" });
1091
- }
1092
- console.log(`[agentchattr] ${projectId} restart: AC spawned and ready (PID: ${child.pid})`);
1093
- // Sync token after AgentChattr restarts
1094
- setTimeout(() => syncChattrToken(projectId), 2000);
1095
- // #424 / quadwork#304 Phase 3: optional auto-restore.
1096
- // Fire the restore 3s after spawn so AC's ws is ready.
1097
- // Best-effort: never blocks the restart response or
1098
- // rolls back on error.
1099
- if (shouldAutoRestore) {
1100
- setTimeout(async () => {
1101
- try {
1102
- const snapDir = path.join(require("os").homedir(), ".quadwork", projectId, "history-snapshots");
1103
- if (!fs.existsSync(snapDir)) return;
1104
- const newest = fs.readdirSync(snapDir)
1105
- .filter((f) => f.endsWith(".json"))
1106
- .map((f) => ({ f, t: fs.statSync(path.join(snapDir, f)).mtimeMs }))
1107
- .sort((a, b) => b.t - a.t)[0];
1108
- if (!newest) return;
1109
- const r = await fetch(`http://127.0.0.1:${PORT}/api/project-history/restore?project=${encodeURIComponent(projectId)}&name=${encodeURIComponent(newest.f)}`, {
1110
- method: "POST",
1111
- });
1112
- if (r.ok) console.log(`[snapshot] ${projectId} auto-restored ${newest.f}`);
1113
- else console.warn(`[snapshot] ${projectId} auto-restore returned ${r.status}`);
1114
- } catch (err) {
1115
- console.warn(`[snapshot] ${projectId} auto-restore failed: ${err.message || err}`);
1116
- }
1117
- }, 3000);
1118
- }
1119
- res.json({ ok: true, state: "running", pid: child.pid });
1120
- // #447: auto-reset all agents after AC restart so they get
1121
- // fresh MCP tokens. #581: mark reset as scheduled immediately
1122
- // so the health monitor skips its own reset while ours is in-flight.
1123
- // #579: also skip if a reset already succeeded within the last 30s.
1124
- // Multiple restart sources (bridge-migrate, health monitor, dashboard)
1125
- // can fire in rapid succession — only the first should trigger a reset.
1126
- const existingReset = _acHealth.resetState.get(projectId);
1127
- const resetRecentlyDone = existingReset &&
1128
- (existingReset.status === "succeeded" || existingReset.status === "scheduled") &&
1129
- Date.now() - existingReset.timestamp < 30_000;
1130
- if (resetRecentlyDone) {
1131
- console.log(`[agentchattr] ${projectId} skipping auto-reset — one already ${existingReset.status} ${Math.round((Date.now() - existingReset.timestamp) / 1000)}s ago`);
1132
- } else {
1133
- _acHealth.resetState.set(projectId, { status: "scheduled", timestamp: Date.now() });
1134
- }
1135
- if (!resetRecentlyDone) setTimeout(async () => {
1136
- try {
1137
- const resetResp = await fetch(`http://127.0.0.1:${PORT}/api/agents/${encodeURIComponent(projectId)}/reset`, {
1138
- method: "POST",
1139
- });
1140
- if (resetResp.ok) {
1141
- const resetData = await resetResp.json();
1142
- _acHealth.resetState.set(projectId, { status: "succeeded", timestamp: Date.now() });
1143
- console.log(`[agentchattr] ${projectId} auto-reset ${resetData.restarted} agent(s) after AC restart`);
1144
- } else {
1145
- _acHealth.resetState.set(projectId, { status: "failed", timestamp: Date.now() });
1146
- console.warn(`[agentchattr] ${projectId} agent reset after AC restart returned ${resetResp.status}`);
1147
- }
1148
- } catch (err) {
1149
- _acHealth.resetState.set(projectId, { status: "failed", timestamp: Date.now() });
1150
- console.warn(`[agentchattr] ${projectId} agent reset after AC restart failed: ${err.message || err}`);
1151
- }
1152
- }, 2000);
1153
- } catch (err) {
1154
- setProc({ process: null, state: "error", error: err.message });
1155
- res.status(500).json({ ok: false, state: "error", error: err.message });
1156
- }
1157
- } else if (action === "update") {
1158
- // Update AgentChattr: stop → git pull → pip install → restart
1159
- const { dir: acDir } = resolveProjectChattr(projectId);
1160
- if (!acDir || !fs.existsSync(path.join(acDir, "run.py"))) {
1161
- return res.status(400).json({ ok: false, error: "AgentChattr not installed at " + (acDir || "unknown") });
1162
- }
1163
- try {
1164
- // Stop running process before pulling. Snapshot first so a
1165
- // botched git pull can still be rolled back from disk.
1166
- // #424 / quadwork#304: best-effort.
1167
- await snapshotProjectHistory(projectId).catch(() => {});
1168
- // Latch the auto-restore opt-in BEFORE stop, same as the
1169
- // explicit restart branch above — a config mutation during
1170
- // the git pull shouldn't starve the replay.
1171
- const updateCfgPre = readConfig();
1172
- const updateProjectPre = updateCfgPre.projects?.find((p) => p.id === projectId);
1173
- const updateShouldAutoRestore = !!(updateProjectPre && updateProjectPre.auto_restore_after_restart);
1174
- const proc = getProc();
1175
- const wasRunning = proc.process && proc.state === "running";
1176
- if (wasRunning) {
1177
- try { proc.process.kill("SIGTERM"); } catch {}
1178
- }
1179
- // #386: kill orphaned processes on the port too
1180
- killProcessOnPort(chattrPort);
1181
- if (wasRunning) {
1182
- setProc({ process: null, state: "stopped", error: null });
1183
- // Wait for the port to be released before pulling/restarting
1184
- await waitForPortFree(chattrPort, 3000);
1185
- }
1186
-
1187
- const pullResult = execFileSync("git", ["pull"], { cwd: acDir, encoding: "utf-8", timeout: 30000, stdio: "pipe" }).trim();
1188
- // #388: re-apply sender-overflow CSS patch after git pull
1189
- patchAgentchattrCss(acDir);
1190
- // #629: re-apply crash timeout patch after git pull (pull may revert app.py)
1191
- patchCrashTimeout(acDir);
1192
- const venvPython = path.join(acDir, ".venv", "bin", "python");
1193
- let pipResult = "";
1194
- const reqFile = path.join(acDir, "requirements.txt");
1195
- if (fs.existsSync(venvPython) && fs.existsSync(reqFile)) {
1196
- pipResult = execFileSync(venvPython, ["-m", "pip", "install", "-r", "requirements.txt"], { cwd: acDir, encoding: "utf-8", timeout: 120000, stdio: "pipe" }).trim();
1197
- }
1198
-
1199
- // Restart if it was running before the update
1200
- let restarted = false;
1201
- if (wasRunning) {
1202
- const child = await spawnChattr();
1203
- restarted = !!child;
1204
- if (child) {
1205
- setTimeout(() => syncChattrToken(projectId).catch(() => {}), 2000);
1206
- // #424 / quadwork#304 Phase 3: auto-restore after an
1207
- // update-triggered restart too (t2a re-review). Same
1208
- //3s wait + newest-snapshot-by-mtime path as the explicit
1209
- // restart branch, using the pre-stop latched opt-in.
1210
- if (updateShouldAutoRestore) {
1211
- setTimeout(async () => {
1212
- try {
1213
- const snapDir = path.join(require("os").homedir(), ".quadwork", projectId, "history-snapshots");
1214
- if (!fs.existsSync(snapDir)) return;
1215
- const newest = fs.readdirSync(snapDir)
1216
- .filter((f) => f.endsWith(".json"))
1217
- .map((f) => ({ f, t: fs.statSync(path.join(snapDir, f)).mtimeMs }))
1218
- .sort((a, b) => b.t - a.t)[0];
1219
- if (!newest) return;
1220
- const r = await fetch(`http://127.0.0.1:${PORT}/api/project-history/restore?project=${encodeURIComponent(projectId)}&name=${encodeURIComponent(newest.f)}`, {
1221
- method: "POST",
1222
- });
1223
- if (r.ok) console.log(`[snapshot] ${projectId} auto-restored ${newest.f} after update`);
1224
- else console.warn(`[snapshot] ${projectId} post-update auto-restore returned ${r.status}`);
1225
- } catch (err) {
1226
- console.warn(`[snapshot] ${projectId} post-update auto-restore failed: ${err.message || err}`);
1227
- }
1228
- }, 3000);
1229
- }
1230
- }
1231
- }
560
+ // Per-project AgentChattr lifecycle (removed in #723 AC stack deleted)
1232
561
 
1233
- res.json({ ok: true, pull: pullResult, pip: pipResult, restarted });
1234
- } catch (err) {
1235
- res.status(500).json({ ok: false, error: err.message });
1236
- }
1237
- } else {
1238
- res.status(400).json({ error: "Unknown action" });
1239
- }
562
+ // Stub endpoints return 410 Gone so dashboard code degrades gracefully
563
+ async function handleAgentChattr(_req, res) {
564
+ return res.status(410).json({ ok: false, error: "AgentChattr removed in Phase 3" });
1240
565
  }
1241
- app.post("/api/agentchattr/:projectOrAction/:action", handleAgentChattr);
1242
- app.post("/api/agentchattr/:projectOrAction", handleAgentChattr);
1243
-
1244
- // --- Reset agents: deregister all registered slots ---
1245
- // AgentChattr doesn't expose staleness metadata, so this clears all slots.
1246
- // Agents' wrapper heartbeat will auto-re-register with clean names.
1247
-
1248
- // #416: AC health status endpoint — returns the health monitor state
1249
- // for a project so the dashboard can surface auto-restart events.
1250
- app.get("/api/agentchattr/:project/health", (req, res) => {
1251
- const projectId = req.params.project;
1252
- const proc = chattrProcesses.get(projectId);
1253
- const health = _acHealth.state.get(projectId) || { lastRestart: 0, consecutiveFailures: 0 };
1254
- res.json({
1255
- state: proc?.state || "unknown",
1256
- error: proc?.error || null,
1257
- autoRestart: {
1258
- lastRestart: health.lastRestart || null,
1259
- consecutiveFailures: health.consecutiveFailures,
1260
- gaveUp: health.consecutiveFailures >= 3,
1261
- },
1262
- });
1263
- });
1264
-
1265
566
  app.post("/api/agents/:project/reset", async (req, res) => {
1266
567
  const projectId = req.params.project;
1267
568
 
@@ -1293,6 +594,8 @@ app.post("/api/agents/:project/reset", async (req, res) => {
1293
594
 
1294
595
  // Stop all agents first (handles deregistration best-effort)
1295
596
  for (const agentId of allAgentIds) {
597
+ const s = agentSessions.get(`${projectId}/${agentId}`);
598
+ if (s) s._suppressLifecycleMsg = true;
1296
599
  await stopAgentSession(`${projectId}/${agentId}`);
1297
600
  }
1298
601
 
@@ -1300,8 +603,9 @@ app.post("/api/agents/:project/reset", async (req, res) => {
1300
603
  let restarted = 0;
1301
604
  const errors = [];
1302
605
  for (const agentId of allAgentIds) {
1303
- const result = await spawnAgentPty(projectId, agentId);
606
+ const result = await spawnAgentPty(projectId, agentId, { suppressLifecycleMsg: true });
1304
607
  if (result.ok) {
608
+ emitSystemMessage(projectId, `${agentId} restarted`);
1305
609
  restarted++;
1306
610
  } else {
1307
611
  errors.push(`${agentId}: ${result.error}`);
@@ -1319,7 +623,7 @@ app.post("/api/agents/:project/reset", async (req, res) => {
1319
623
  }
1320
624
  });
1321
625
 
1322
- // --- Full Reset: restart all AC + agents across all projects (#657) ---
626
+ // --- Full Reset: restart all agents across all projects (#657) ---
1323
627
 
1324
628
  app.post("/api/full-reset", async (_req, res) => {
1325
629
  const start = Date.now();
@@ -1328,42 +632,21 @@ app.post("/api/full-reset", async (_req, res) => {
1328
632
  const cfg = readConfig();
1329
633
  const projects = (cfg.projects || []).filter((p) => !p.archived);
1330
634
 
1331
- // 1. Stop all agent sessions
1332
635
  console.log("[full-reset] stopping all agent sessions...");
1333
636
  const sessionKeys = [...agentSessions.keys()];
1334
637
  for (const key of sessionKeys) {
1335
638
  await stopAgentSession(key);
1336
639
  }
1337
640
 
1338
- // 2. Stop Butler if running
1339
641
  console.log("[full-reset] stopping Butler...");
1340
642
  stopButlerPty();
1341
643
 
1342
- // 3. Re-run startup migrations
1343
644
  console.log("[full-reset] running startup migrations...");
1344
645
  runStartupMigrations(cfg);
1345
646
 
1346
- // 4. Restart each project's AC + agents
1347
647
  let totalAgents = 0;
1348
648
  const errors = [];
1349
649
  for (const project of projects) {
1350
- console.log(`[full-reset] restarting AC for ${project.id}...`);
1351
- // Pre-mark reset as scheduled so AC restart's auto-reset timer is suppressed
1352
- _acHealth.resetState.set(project.id, { status: "scheduled", timestamp: Date.now() });
1353
- try {
1354
- const acResp = await fetch(`http://127.0.0.1:${PORT}/api/agentchattr/${encodeURIComponent(project.id)}/restart`, {
1355
- method: "POST",
1356
- });
1357
- if (!acResp.ok) {
1358
- const errData = await acResp.json().catch(() => ({}));
1359
- errors.push(`${project.id}: AC restart failed — ${errData.error || acResp.status}`);
1360
- continue;
1361
- }
1362
- } catch (err) {
1363
- errors.push(`${project.id}: AC — ${err.message}`);
1364
- continue;
1365
- }
1366
- // Explicitly reset agents and await result
1367
650
  try {
1368
651
  const resetResp = await fetch(`http://127.0.0.1:${PORT}/api/agents/${encodeURIComponent(project.id)}/reset`, {
1369
652
  method: "POST",
@@ -1379,7 +662,6 @@ app.post("/api/full-reset", async (_req, res) => {
1379
662
  }
1380
663
  }
1381
664
 
1382
- // 5. Restart Butler if enabled
1383
665
  if (cfg.butler?.enabled) {
1384
666
  console.log("[full-reset] restarting Butler...");
1385
667
  const result = spawnButlerPty();
@@ -1437,10 +719,13 @@ app.post("/api/agents/:project/:agent/restart", async (req, res) => {
1437
719
 
1438
720
  // #241: must await deregister before respawn so the slot frees and
1439
721
  // the fresh register lands at slot 1 instead of head-2.
722
+ const existing = agentSessions.get(key);
723
+ if (existing) existing._suppressLifecycleMsg = true;
1440
724
  await stopAgentSession(key);
1441
725
 
1442
- const result = await spawnAgentPty(project, agent);
726
+ const result = await spawnAgentPty(project, agent, { suppressLifecycleMsg: true });
1443
727
  if (result.ok) {
728
+ emitSystemMessage(project, `${agent} restarted`);
1444
729
  res.json({ ok: true, state: "running", pid: result.pid });
1445
730
  } else {
1446
731
  res.status(500).json({ ok: false, state: "error", error: result.error });
@@ -2408,276 +1693,8 @@ setInterval(autoStopPollingTick, AUTO_STOP_POLL_INTERVAL_MS);
2408
1693
  // delay is tens of seconds. Skipping projects without the opt-in
2409
1694
  // keeps the poller cheap for single-project setups.
2410
1695
 
2411
- const _loopGuardPausedState = new Map(); // projectId -> { paused: bool, scheduled: Timeout? }
2412
- const LOOP_GUARD_POLL_INTERVAL_MS = 10000;
2413
-
2414
- async function checkLoopGuardPause(project) {
2415
- if (!project || !project.auto_continue_loop_guard) return;
2416
- const { url: base, token: sessionToken } = resolveProjectChattr(project.id);
2417
- if (!base) return;
2418
- let paused = false;
2419
- try {
2420
- const r = await fetch(`${base}/api/status`, {
2421
- headers: sessionToken ? { "x-session-token": sessionToken } : {},
2422
- signal: AbortSignal.timeout(5000),
2423
- });
2424
- if (!r.ok) return;
2425
- const data = await r.json();
2426
- paused = !!(data && data.paused);
2427
- } catch {
2428
- return;
2429
- }
2430
- const state = _loopGuardPausedState.get(project.id) || { paused: false, scheduled: null };
2431
- // Transition false → true: schedule an auto-continue after the delay.
2432
- if (paused && !state.paused && !state.scheduled) {
2433
- const delaySec = Number.isFinite(project.auto_continue_delay_sec) && project.auto_continue_delay_sec >= 5
2434
- ? project.auto_continue_delay_sec
2435
- : 30;
2436
- console.log(`[loop-guard] ${project.id} paused — auto-continue in ${delaySec}s`);
2437
- state.scheduled = setTimeout(async () => {
2438
- try {
2439
- // Re-check the opt-in at fire time so a checkbox disable
2440
- // mid-wait actually stops the auto-continue.
2441
- const freshCfg = readConfig();
2442
- const fresh = freshCfg.projects?.find((p) => p.id === project.id);
2443
- if (!fresh || !fresh.auto_continue_loop_guard) {
2444
- console.log(`[loop-guard] ${project.id} auto-continue cancelled (opt-in disabled during wait)`);
2445
- } else {
2446
- // Re-check the router's pause state at fire time too. The
2447
- // 10s status poller may not have seen a manual operator
2448
- // /continue yet when the delay window (5–9s) is shorter
2449
- // than the poll interval — without this, a manual resume
2450
- // inside a 5s wait would be followed by a stale auto
2451
- // /continue that clobbers hop_count on an already-running
2452
- // chain (router.continue_routing resets the counter
2453
- // unconditionally). The re-check closes the race.
2454
- let stillPaused = false;
2455
- try {
2456
- const { url: freshBase, token: freshToken } = resolveProjectChattr(project.id);
2457
- if (freshBase) {
2458
- const sr = await fetch(`${freshBase}/api/status`, {
2459
- headers: freshToken ? { "x-session-token": freshToken } : {},
2460
- signal: AbortSignal.timeout(5000),
2461
- });
2462
- if (sr.ok) {
2463
- const sd = await sr.json();
2464
- stillPaused = !!(sd && sd.paused);
2465
- }
2466
- }
2467
- } catch {
2468
- // Status re-check failed — fall back to "don't fire".
2469
- // Stuck pause will still be caught on the next 10s tick.
2470
- }
2471
- if (!stillPaused) {
2472
- console.log(`[loop-guard] ${project.id} auto-continue cancelled (router already resumed)`);
2473
- } else {
2474
- const res = await fetch(`http://127.0.0.1:${PORT}/api/chat?project=${encodeURIComponent(project.id)}`, {
2475
- method: "POST",
2476
- headers: { "Content-Type": "application/json" },
2477
- body: JSON.stringify({ text: "/continue", channel: "general" }),
2478
- });
2479
- if (res.ok) console.log(`[loop-guard] ${project.id} auto-continued`);
2480
- else console.warn(`[loop-guard] ${project.id} auto-continue POST returned ${res.status}`);
2481
- }
2482
- }
2483
- } catch (err) {
2484
- console.warn(`[loop-guard] ${project.id} auto-continue failed: ${err.message || err}`);
2485
- }
2486
- const s2 = _loopGuardPausedState.get(project.id);
2487
- if (s2) s2.scheduled = null;
2488
- }, delaySec * 1000);
2489
- }
2490
- // Transition true → false: clear any pending timer.
2491
- if (!paused && state.paused && state.scheduled) {
2492
- clearTimeout(state.scheduled);
2493
- state.scheduled = null;
2494
- }
2495
- state.paused = paused;
2496
- _loopGuardPausedState.set(project.id, state);
2497
- }
2498
-
2499
- function runLoopGuardPollingTick() {
2500
- try {
2501
- const cfg = readConfig();
2502
- for (const p of (cfg.projects || [])) {
2503
- if (p && p.auto_continue_loop_guard) checkLoopGuardPause(p);
2504
- }
2505
- } catch {
2506
- // config unreadable — next tick will retry
2507
- }
2508
- }
2509
-
2510
- setInterval(runLoopGuardPollingTick, LOOP_GUARD_POLL_INTERVAL_MS);
2511
-
2512
1696
  // --- Start ---
2513
1697
 
2514
- // ---------------------------------------------------------------------------
2515
- // #416: AC health monitor — auto-restart AgentChattr on crash detection.
2516
- // Runs a TCP connect probe every 30s for each project with a "running" AC
2517
- // process. If the port is dead, auto-restarts (reusing the existing restart
2518
- // logic). Rate-limited to one restart per 60s per project; gives up after
2519
- // 3 consecutive failures and surfaces a persistent error.
2520
- // ---------------------------------------------------------------------------
2521
- // #572: restart agents that are running without AC registration after AC
2522
- // recovers from a crash. Scans agentSessions for the given project,
2523
- // finds agents missing acRegistrationName, and stop+respawns them so
2524
- // they get MCP CLI flags at launch time.
2525
- async function restartUnregisteredAgents(projectId) {
2526
- const toRestart = [];
2527
- for (const [key, session] of agentSessions) {
2528
- if (session.projectId !== projectId) continue;
2529
- if (session.acRegistrationName) continue; // already registered
2530
- if (session.state !== "running") continue;
2531
- if (!session.acServerPort || !session.acInjectMode) continue;
2532
- toRestart.push({ key, agentId: session.agentId });
2533
- }
2534
-
2535
- if (toRestart.length === 0) return;
2536
- const samplePort = agentSessions.get(toRestart[0].key)?.acServerPort || "?";
2537
- console.log(`[health] AC recovered on port ${samplePort} — restarting ${toRestart.length} agent(s) for chat integration`);
2538
-
2539
- for (const { key, agentId } of toRestart) {
2540
- try {
2541
- console.log(`[health] Restarting agent ${agentId} for project ${projectId} to gain chat integration`);
2542
- await stopAgentSession(key);
2543
- await spawnAgentPty(projectId, agentId);
2544
- } catch (err) {
2545
- console.error(`[health] Failed to restart agent ${agentId}: ${err.message}`);
2546
- }
2547
- }
2548
- }
2549
-
2550
- const _acHealth = {
2551
- // Per-project: { lastRestart: timestamp, consecutiveFailures: number }
2552
- state: new Map(),
2553
- intervalHandle: null,
2554
- // #581: per-project reset state — prevents duplicate resets per restart event.
2555
- // Values: { status: "scheduled"|"succeeded"|"failed", timestamp: number }
2556
- resetState: new Map(),
2557
- // #579: per-project grace period. Projects whose AC entered "running"
2558
- // within the last 60s are skipped by the health monitor so startup
2559
- // migrations (bridge-migrate, ghost-fix) and fresh spawns can settle.
2560
- // Tracked via `runningSince` in chattrProcesses entries.
2561
- };
2562
-
2563
- function isPortAlive(port) {
2564
- return new Promise((resolve) => {
2565
- const sock = net.createConnection({ port, host: "127.0.0.1" }, () => {
2566
- sock.destroy();
2567
- resolve(true);
2568
- });
2569
- sock.on("error", () => resolve(false));
2570
- sock.setTimeout(2000, () => { sock.destroy(); resolve(false); });
2571
- });
2572
- }
2573
-
2574
- async function acHealthCheck() {
2575
- const cfg = readConfig();
2576
- for (const project of (cfg.projects || [])) {
2577
- const proc = chattrProcesses.get(project.id);
2578
- // Only monitor projects that were explicitly started (state === "running"
2579
- // or had a process). Skip intentionally stopped projects.
2580
- if (!proc || proc.state === "stopped") continue;
2581
- // #579: per-project grace period — skip projects whose AC entered
2582
- // "running" within the last 60s. This lets cmdStart spawns and
2583
- // startup migrations (bridge-migrate, ghost-fix) settle before the
2584
- // monitor acts, regardless of when the project was created.
2585
- if (proc.runningSince && Date.now() - proc.runningSince < 60_000) continue;
2586
-
2587
- const { url } = resolveProjectChattr(project.id);
2588
- const portMatch = url.match(/:(\d+)/);
2589
- const port = portMatch ? parseInt(portMatch[1], 10) : 8300;
2590
-
2591
- const alive = await isPortAlive(port);
2592
- const health = _acHealth.state.get(project.id) || { lastRestart: 0, consecutiveFailures: 0 };
2593
-
2594
- if (alive) {
2595
- // Healthy — reset failure counter
2596
- if (health.consecutiveFailures > 0) {
2597
- console.log(`[health] AC for ${project.id} recovered (port ${port} alive)`);
2598
- // #572: restart agents that are running without chat integration.
2599
- // These are agents where the #565 deferred restart timed out, or
2600
- // agents spawned while AC was down. MCP flags are set at process
2601
- // launch, so a full stop+respawn is required.
2602
- // #581: dedupe — skip if a reset is in-flight or succeeded within 60s.
2603
- // If "scheduled" (in-flight), keep consecutiveFailures=1 so the next
2604
- // healthy tick re-enters this branch and retries if state became "failed".
2605
- const rs = _acHealth.resetState.get(project.id);
2606
- const resetSucceeded = rs && rs.status === "succeeded" && Date.now() - rs.timestamp < 60000;
2607
- const resetInFlight = rs && rs.status === "scheduled";
2608
- if (resetSucceeded) {
2609
- // Already handled — clear failures normally
2610
- } else if (resetInFlight) {
2611
- // In-flight — preserve failures so we retry next tick if it fails
2612
- health.consecutiveFailures = 1;
2613
- _acHealth.state.set(project.id, health);
2614
- continue;
2615
- } else {
2616
- // No recent reset or previous attempt failed — fire one
2617
- _acHealth.resetState.set(project.id, { status: "scheduled", timestamp: Date.now() });
2618
- restartUnregisteredAgents(project.id).then(() => {
2619
- _acHealth.resetState.set(project.id, { status: "succeeded", timestamp: Date.now() });
2620
- }).catch((err) => {
2621
- _acHealth.resetState.set(project.id, { status: "failed", timestamp: Date.now() });
2622
- console.error(`[health] Failed to restart unregistered agents for ${project.id}:`, err.message);
2623
- });
2624
- }
2625
- }
2626
- health.consecutiveFailures = 0;
2627
- _acHealth.state.set(project.id, health);
2628
- continue;
2629
- }
2630
-
2631
- // Port is dead — check rate limits
2632
- if (health.consecutiveFailures >= 3) {
2633
- // Already gave up — don't spam restarts. The error state persists
2634
- // in chattrProcesses for the dashboard to surface.
2635
- continue;
2636
- }
2637
-
2638
- const now = Date.now();
2639
- if (now - health.lastRestart < 60_000) {
2640
- // Too soon since last restart attempt
2641
- continue;
2642
- }
2643
-
2644
- health.consecutiveFailures++;
2645
- health.lastRestart = now;
2646
- _acHealth.state.set(project.id, health);
2647
-
2648
- console.warn(`[health] AC for ${project.id} on port ${port} is down (failure ${health.consecutiveFailures}/3) — auto-restarting`);
2649
-
2650
- // Call the existing restart endpoint internally so we reuse the
2651
- // hardened path (killProcessOnPort, waitForPortFree, snapshot,
2652
- // auto-restore) instead of reimplementing spawn logic inline.
2653
- try {
2654
- const resp = await fetch(`http://127.0.0.1:${PORT}/api/agentchattr/${encodeURIComponent(project.id)}/restart`, {
2655
- method: "POST",
2656
- timeout: 15000,
2657
- });
2658
- if (resp.ok) {
2659
- const data = await resp.json();
2660
- console.log(`[health] AC for ${project.id} auto-restarted (PID: ${data.pid})`);
2661
- // #447: agent reset is now chained inside the restart endpoint
2662
- // itself (fires on a 2s timer), so no separate call needed here.
2663
- } else {
2664
- const body = await resp.text().catch(() => "");
2665
- console.error(`[health] AC auto-restart failed for ${project.id}: ${resp.status} ${body.slice(0, 120)}`);
2666
- chattrProcesses.set(project.id, { process: null, state: "error", error: `Auto-restart failed: ${resp.status}` });
2667
- }
2668
- } catch (err) {
2669
- console.error(`[health] AC auto-restart failed for ${project.id}:`, err.message);
2670
- chattrProcesses.set(project.id, { process: null, state: "error", error: `Auto-restart failed: ${err.message}` });
2671
- }
2672
- }
2673
- }
2674
-
2675
- function startAcHealthMonitor() {
2676
- if (_acHealth.intervalHandle) return;
2677
- _acHealth.intervalHandle = setInterval(acHealthCheck, 30_000);
2678
- console.log("[health] AC health monitor started (30s interval, per-project 60s grace)");
2679
- }
2680
-
2681
1698
  // #705: auto-interrupt agents stuck with no PTY output for 10 minutes.
2682
1699
  const WATCHDOG_TIMEOUT_MS = 10 * 60 * 1000;
2683
1700
  let _watchdogHandle = null;
@@ -2686,6 +1703,8 @@ function watchdogCheck() {
2686
1703
  for (const [key, session] of agentSessions) {
2687
1704
  if (session.state !== "running" || !session.term) continue;
2688
1705
  if (!session.lastOutputAt) continue;
1706
+ // #732: skip file-chat projects — idle is normal, PTY dispatch wakes them
1707
+ if (routes.getProjectChatMode(session.projectId) === "file") continue;
2689
1708
  if (Date.now() - session.lastOutputAt > WATCHDOG_TIMEOUT_MS) {
2690
1709
  console.log(`[watchdog] ${key}: no output for 10m — sending Ctrl+C`);
2691
1710
  safeWrite(session.term, "\x03");
@@ -2703,54 +1722,6 @@ function startWatchdog() {
2703
1722
  // #657: extracted startup migrations so full-reset can re-run them
2704
1723
  function runStartupMigrations(cfg) {
2705
1724
  const projects = (cfg.projects || []).filter((p) => !p.archived);
2706
- const acRestartNeeded = [];
2707
-
2708
- // bridge-migrate
2709
- for (const p of projects) {
2710
- const acPath = projectAgentchattrConfigPath(p.id);
2711
- if (!fs.existsSync(acPath)) continue;
2712
- try {
2713
- const before = fs.readFileSync(acPath, "utf-8");
2714
- const hadOldDc = /^\[agents\.discord-bridge\]\s*$/m.test(before);
2715
- const hadOldTg = /^\[agents\.telegram-bridge\]\s*$/m.test(before);
2716
- const dc = patchAgentchattrConfigForDiscordBridge(before);
2717
- const tg = patchAgentchattrConfigForTelegramBridge(dc.text);
2718
- if (dc.changed || tg.changed) {
2719
- fs.writeFileSync(acPath, tg.text);
2720
- console.log(`[bridge-migrate] ${p.id}: migrated AC config slugs`);
2721
- if (hadOldDc || hadOldTg) {
2722
- if (!acRestartNeeded.includes(p.id)) acRestartNeeded.push(p.id);
2723
- }
2724
- }
2725
- } catch {}
2726
- }
2727
-
2728
- // bridge-refresh
2729
- const DISCORD_BRIDGE_SRC = path.join(__dirname, "..", "bridges", "discord", "discord_bridge.py");
2730
- const DISCORD_BRIDGE_DEST = path.join(os.homedir(), ".quadwork", "agentchattr-discord", "discord_bridge.py");
2731
- if (fs.existsSync(DISCORD_BRIDGE_SRC) && fs.existsSync(path.dirname(DISCORD_BRIDGE_DEST))) {
2732
- try {
2733
- fs.copyFileSync(DISCORD_BRIDGE_SRC, DISCORD_BRIDGE_DEST);
2734
- console.log("[bridge-refresh] refreshed Discord bridge script from package");
2735
- } catch (err) {
2736
- console.warn(`[bridge-refresh] failed to refresh Discord bridge script: ${err.message || err}`);
2737
- }
2738
- }
2739
-
2740
- // bridge slug patches
2741
- const BRIDGE_SLUG_PATCHES = [
2742
- { file: path.join(os.homedir(), ".quadwork", "agentchattr-telegram", "telegram_bridge.py"), old: '"telegram-bridge"', replacement: '"tg"' },
2743
- { file: path.join(os.homedir(), ".quadwork", "agentchattr-discord", "discord_bridge.py"), old: '"discord-bridge"', replacement: '"dc"' },
2744
- ];
2745
- for (const { file, old, replacement } of BRIDGE_SLUG_PATCHES) {
2746
- try {
2747
- if (!fs.existsSync(file)) continue;
2748
- const content = fs.readFileSync(file, "utf-8");
2749
- if (!content.includes(old)) continue;
2750
- fs.writeFileSync(file, content.replaceAll(old, replacement));
2751
- console.log(`[bridge-migrate] patched stale bridge_sender in ${path.basename(file)}`);
2752
- } catch {}
2753
- }
2754
1725
 
2755
1726
  // reseed stale slugs
2756
1727
  const SLUG_FIXES = [
@@ -2786,109 +1757,6 @@ function runStartupMigrations(cfg) {
2786
1757
  }
2787
1758
  }
2788
1759
 
2789
- // ghost-fix + idle-fix
2790
- for (const p of projects) {
2791
- const acDir = resolveProjectChattr(p.id).dir;
2792
- const regPath = path.join(acDir, "registry.py");
2793
- if (fs.existsSync(regPath)) {
2794
- try {
2795
- let reg = fs.readFileSync(regPath, "utf-8");
2796
- if (!reg.includes("force: bool")) {
2797
- reg = reg.replace(
2798
- /def register\(self, base: str, label: str \| None = None\) -> dict \| None:/,
2799
- "def register(self, base: str, label: str | None = None, force: bool = False) -> dict | None:",
2800
- );
2801
- reg = reg.replace(
2802
- " self._expire_reserved()\n\n # Find next free slot",
2803
- " self._expire_reserved()\n\n" +
2804
- " # quadwork#478 + #502: force-replace\n" +
2805
- " if force:\n" +
2806
- " ghosts = [n for n, i in self._instances.items() if i.base == base]\n" +
2807
- " for name in ghosts:\n" +
2808
- " del self._instances[name]\n" +
2809
- " stale_reserved = [rn for rn in self._reserved\n" +
2810
- " if self._parse_name(rn)[0] == base]\n" +
2811
- " for rn in stale_reserved:\n" +
2812
- " del self._reserved[rn]\n\n" +
2813
- " # Find next free slot",
2814
- );
2815
- fs.writeFileSync(regPath, reg);
2816
- console.log(`[ghost-fix] ${p.id}: patched registry.py with force-replace support`);
2817
- } else if (!reg.includes("stale_reserved")) {
2818
- reg = reg.replace(
2819
- /( +)for name in ghosts:\n\1 del self\._instances\[name\]\n\1 self\._reserved\[name\] = time\.time\(\)/,
2820
- "$1for name in ghosts:\n$1 del self._instances[name]\n" +
2821
- "$1stale_reserved = [rn for rn in self._reserved\n" +
2822
- "$1 if self._parse_name(rn)[0] == base]\n" +
2823
- "$1for rn in stale_reserved:\n" +
2824
- "$1 del self._reserved[rn]",
2825
- );
2826
- fs.writeFileSync(regPath, reg);
2827
- console.log(`[ghost-fix] ${p.id}: upgraded registry.py force-replace to clear _reserved (#502)`);
2828
- }
2829
- } catch (err) {
2830
- console.warn(`[ghost-fix] ${p.id}: failed to patch registry.py: ${err.message}`);
2831
- }
2832
- }
2833
- const appPath = path.join(acDir, "app.py");
2834
- if (fs.existsSync(appPath)) {
2835
- try {
2836
- let app = fs.readFileSync(appPath, "utf-8");
2837
- if (!app.includes("force = bool(body.get(\"force\"")) {
2838
- app = app.replace(
2839
- " result = registry.register(base, label)\n",
2840
- " force = bool(body.get(\"force\", False))\n result = registry.register(base, label, force=force)\n",
2841
- );
2842
- fs.writeFileSync(appPath, app);
2843
- console.log(`[ghost-fix] ${p.id}: patched app.py with force-replace support`);
2844
- }
2845
- } catch (err) {
2846
- console.warn(`[ghost-fix] ${p.id}: failed to patch app.py: ${err.message}`);
2847
- }
2848
- }
2849
- if (fs.existsSync(appPath)) {
2850
- try {
2851
- const app = fs.readFileSync(appPath, "utf-8");
2852
- if (app.includes("_CRASH_TIMEOUT = 15")) {
2853
- patchCrashTimeout(acDir);
2854
- console.log(`[idle-fix] ${p.id}: crash timeout patched on disk`);
2855
- acRestartNeeded.push(p.id);
2856
- }
2857
- } catch (err) {
2858
- console.warn(`[idle-fix] ${p.id}: failed to patch app.py crash timeout: ${err.message}`);
2859
- }
2860
- }
2861
- }
2862
-
2863
- // CLI-based agent sections
2864
- for (const p of projects) {
2865
- const acPath = projectAgentchattrConfigPath(p.id);
2866
- if (!fs.existsSync(acPath)) continue;
2867
- try {
2868
- let toml = fs.readFileSync(acPath, "utf-8");
2869
- const cliSections = new Set();
2870
- for (const [, agentCfg] of Object.entries(p.agents || {})) {
2871
- const cmd = agentCfg.command || "claude";
2872
- const cli = cmd.split("/").pop().split(" ")[0];
2873
- cliSections.add(cli);
2874
- }
2875
- let changed = false;
2876
- for (const cli of cliSections) {
2877
- if (!new RegExp(`^\\[agents\\.${cli}\\]`, "m").test(toml)) {
2878
- const injectMode = cli === "codex" ? "proxy_flag" : cli === "gemini" ? "env" : "flag";
2879
- toml += `\n[agents.${cli}]\ncommand = "${cli}"\nlabel = "${cli}"\nmcp_inject = "${injectMode}"\n`;
2880
- changed = true;
2881
- }
2882
- }
2883
- if (changed) {
2884
- fs.writeFileSync(acPath, toml);
2885
- console.log(`[#596] ${p.id}: added CLI-based agent sections to config.toml`);
2886
- }
2887
- } catch (err) {
2888
- console.warn(`[#596] ${p.id}: config.toml migration failed: ${err.message}`);
2889
- }
2890
- }
2891
-
2892
1760
  // #690: seed DESIGN-GUIDE.md into existing agent worktrees
2893
1761
  const designGuideSrc = path.join(__dirname, "..", "templates", "seeds", "DESIGN-GUIDE.md");
2894
1762
  if (fs.existsSync(designGuideSrc)) {
@@ -2909,99 +1777,66 @@ function runStartupMigrations(cfg) {
2909
1777
  }
2910
1778
  }
2911
1779
 
2912
- return acRestartNeeded;
2913
1780
  }
2914
1781
 
2915
1782
  server.listen(PORT, "127.0.0.1", async () => {
2916
1783
  console.log(`QuadWork server listening on http://127.0.0.1:${PORT}`);
2917
1784
  syncTriggersFromConfig();
2918
- // #579: detect AC processes already running (spawned by cmdStart before
2919
- // the server module loaded). Without this, chattrProcesses is empty on
2920
- // boot and the health monitor can't track cmdStart-spawned ACs, while
2921
- // the dashboard's Start button would redundantly kill+respawn them.
2922
1785
  const startupCfg = readConfig();
2923
- for (const p of (startupCfg.projects || [])) {
2924
- const { url: acUrl } = resolveProjectChattr(p.id);
2925
- const acPortMatch = acUrl.match(/:(\d+)/);
2926
- const acPort = acPortMatch ? parseInt(acPortMatch[1], 10) : 8300;
2927
- const alive = await isPortAlive(acPort);
2928
- if (alive && !chattrProcesses.has(p.id)) {
2929
- // AC is already running (e.g. spawned by cmdStart). Record it so
2930
- // the health monitor can track it and the dashboard shows the
2931
- // correct state. process is null because we don't own the child.
2932
- chattrProcesses.set(p.id, { process: null, state: "running", error: null, runningSince: Date.now() });
2933
- console.log(`[startup] ${p.id}: AC already alive on port ${acPort} — tracking`);
1786
+
1787
+ // #719: Migrate AC chat history to JSONL before initializing file-chat.
1788
+ const migrationFailed = new Set(runAcMigration(startupCfg));
1789
+
1790
+ // #722: One-time switchover — set all projects to file-based chat.
1791
+ if (!startupCfg.file_chat_switchover_done) {
1792
+ let switched = false;
1793
+ for (const p of (startupCfg.projects || [])) {
1794
+ if (p.chat_mode !== "file" && !migrationFailed.has(p.id)) {
1795
+ p.chat_mode = "file";
1796
+ switched = true;
1797
+ console.log(`[startup] ${p.id}: switched to file-based chat`);
1798
+ }
2934
1799
  }
1800
+ startupCfg.file_chat_switchover_done = true;
1801
+ writeConfig(startupCfg);
1802
+ if (switched) console.log("[startup] file-chat switchover complete");
2935
1803
  }
2936
- // Sync AgentChattr tokens for all projects on startup and backfill
2937
- // the sender-overflow CSS/JS patch (#402) so already-running AC
2938
- // instances receive the fix without requiring a restart.
2939
- // #448: retry after 5s for projects where AC isn't up yet at boot.
1804
+
1805
+ // Initialize file-chat engine for all projects.
2940
1806
  for (const p of (startupCfg.projects || [])) {
2941
- syncChattrToken(p.id).catch(() => {
2942
- setTimeout(() => syncChattrToken(p.id).catch(() => {}), 5000);
2943
- });
2944
- const { dir: acDir } = resolveProjectChattr(p.id);
2945
- if (acDir) patchAgentchattrCss(acDir);
2946
- }
2947
- const acRestartNeeded = runStartupMigrations(startupCfg);
2948
- startupCfg._acRestartNeeded = acRestartNeeded.length > 0 ? acRestartNeeded : undefined;
2949
- // #629: restart AC for projects where idle-fix patched the on-disk file
2950
- // so the running Python process picks up _CRASH_TIMEOUT = 120.
2951
- // Use port-alive check instead of chattrProcesses — AC may be running
2952
- // from a previous QuadWork instance (tracked with process: null).
2953
- if (startupCfg._acRestartNeeded) {
2954
- for (const projectId of startupCfg._acRestartNeeded) {
2955
- const { url } = resolveProjectChattr(projectId);
2956
- const portMatch = url.match(/:(\d+)/);
2957
- const port = portMatch ? parseInt(portMatch[1], 10) : 8300;
2958
- isPortAlive(port).then((alive) => {
2959
- if (!alive) return;
2960
- console.log(`[idle-fix] ${projectId}: restarting AC (port ${port}) so running process observes _CRASH_TIMEOUT = 120 (#629)`);
2961
- return fetch(`http://127.0.0.1:${PORT}/api/agentchattr/${encodeURIComponent(projectId)}/restart`, {
2962
- method: "POST",
2963
- headers: { "Content-Type": "application/json" },
2964
- body: JSON.stringify({ action: "restart" }),
2965
- });
2966
- }).then((r) => {
2967
- if (r && r.ok) console.log(`[idle-fix] ${projectId}: AC restarted successfully`);
2968
- else if (r) console.warn(`[idle-fix] ${projectId}: AC restart returned ${r.status}`);
2969
- }).catch((err) => {
2970
- console.warn(`[idle-fix] ${projectId}: AC restart failed: ${err.message}`);
2971
- });
1807
+ if (p.chat_mode === "file") {
1808
+ if (migrationFailed.has(p.id)) {
1809
+ console.error(`[startup] ${p.id}: migration failed — skipping file-chat init`);
1810
+ continue;
1811
+ }
1812
+ try {
1813
+ fileChat.initProject(p.id);
1814
+ console.log(`[startup] ${p.id}: file-chat engine initialized`);
1815
+ } catch (err) {
1816
+ console.error(`[startup] FATAL: ${p.id}: ${err.message}`);
1817
+ process.exit(1);
1818
+ }
2972
1819
  }
2973
1820
  }
2974
- // #631 + #632: auto-start Butler if enabled + auto_start
1821
+
1822
+ runStartupMigrations(startupCfg);
1823
+
2975
1824
  if (startupCfg.butler && startupCfg.butler.enabled && startupCfg.butler.auto_start) {
2976
1825
  const result = spawnButlerPty();
2977
1826
  if (result.ok) console.log(`[butler] auto-started (PID: ${result.pid})`);
2978
1827
  else console.warn(`[butler] auto-start failed: ${result.error}`);
2979
1828
  }
2980
- // #416: start the AC health monitor
2981
- startAcHealthMonitor();
2982
- // #705: start the stuck-agent watchdog
2983
1829
  startWatchdog();
2984
1830
  });
2985
1831
 
2986
- /**
2987
- * Send SIGTERM to every AgentChattr child currently tracked by the
2988
- * server. Exported so bin/quadwork.js (`cmdInit` / `cmdStart`) can
2989
- * call it from its own SIGINT handler — AgentChattr children spawned
2990
- * by the dashboard's /api/agentchattr/{id}/start endpoint live in
2991
- * this process's in-memory `chattrProcesses` Map and are otherwise
2992
- * invisible to the CLI. Without this, a Ctrl+C in the foreground
2993
- * quadwork terminal would exit the Node process and orphan every
2994
- * dashboard-started python run.py. See review on quadwork#213.
2995
- */
2996
- function shutdownChattrProcesses() {
2997
- for (const [, proc] of chattrProcesses) {
2998
- if (proc && proc.process) {
2999
- try { proc.process.kill("SIGTERM"); } catch {}
1832
+ function shutdown() {
1833
+ stopButlerPty();
1834
+ const cfg = readConfig();
1835
+ for (const p of (cfg.projects || [])) {
1836
+ if (p.chat_mode === "file") {
1837
+ try { fileChat.shutdownProject(p.id); } catch {}
3000
1838
  }
3001
1839
  }
3002
- chattrProcesses.clear();
3003
- // #631: stop Butler PTY on shutdown
3004
- stopButlerPty();
3005
1840
  }
3006
1841
 
3007
- module.exports = { shutdownChattrProcesses };
1842
+ module.exports = { shutdown };