quadwork 1.19.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -35
- package/bin/quadwork.js +48 -1118
- package/out/404.html +1 -1
- package/out/__next.__PAGE__.txt +3 -3
- package/out/__next._full.txt +14 -14
- package/out/__next._head.txt +4 -4
- package/out/__next._index.txt +8 -8
- package/out/__next._tree.txt +2 -2
- package/out/_next/static/chunks/{030cjkhts487t.js → 079wdniva~de1.js} +1 -1
- package/out/_next/static/chunks/{0n~dq4kpx9xxx.js → 07lhk_q6pmm3r.js} +1 -1
- package/out/_next/static/chunks/0_79hkefw1mo2.js +1 -0
- package/out/_next/static/chunks/{153f.fj8jlvle.js → 0_lyyn..t63bc.js} +1 -1
- package/out/_next/static/chunks/0oxv9vrvc17to.js +2 -0
- package/out/_next/static/chunks/0py7102i226n5.js +1 -0
- package/out/_next/static/chunks/{13fv-yi7.v52g.js → 0q4bm04c1jl_3.js} +1 -1
- package/out/_next/static/chunks/{0_idxioyl0p7h.js → 0sjhy6oe3mbon.js} +1 -1
- package/out/_next/static/chunks/13xk0vgfbrcld.css +2 -0
- package/out/_next/static/chunks/14k3bfe537f9_.js +25 -0
- package/out/_next/static/chunks/{turbopack-0qm-e3ifrz~2u.js → turbopack-0y2u-q0l2m67w.js} +1 -1
- package/out/_not-found/__next._full.txt +13 -13
- package/out/_not-found/__next._head.txt +4 -4
- package/out/_not-found/__next._index.txt +8 -8
- package/out/_not-found/__next._not-found.__PAGE__.txt +2 -2
- package/out/_not-found/__next._not-found.txt +3 -3
- package/out/_not-found/__next._tree.txt +2 -2
- package/out/_not-found.html +1 -1
- package/out/_not-found.txt +13 -13
- package/out/app-shell/__next._full.txt +13 -13
- package/out/app-shell/__next._head.txt +4 -4
- package/out/app-shell/__next._index.txt +8 -8
- package/out/app-shell/__next._tree.txt +2 -2
- package/out/app-shell/__next.app-shell.__PAGE__.txt +2 -2
- package/out/app-shell/__next.app-shell.txt +3 -3
- package/out/app-shell.html +1 -1
- package/out/app-shell.txt +13 -13
- package/out/index.html +1 -1
- package/out/index.txt +14 -14
- package/out/project/_/__next._full.txt +14 -14
- package/out/project/_/__next._head.txt +4 -4
- package/out/project/_/__next._index.txt +8 -8
- package/out/project/_/__next._tree.txt +2 -2
- package/out/project/_/__next.project.$d$id.__PAGE__.txt +3 -3
- package/out/project/_/__next.project.$d$id.txt +3 -3
- package/out/project/_/__next.project.txt +3 -3
- package/out/project/_/queue/__next._full.txt +14 -14
- package/out/project/_/queue/__next._head.txt +4 -4
- package/out/project/_/queue/__next._index.txt +8 -8
- package/out/project/_/queue/__next._tree.txt +2 -2
- package/out/project/_/queue/__next.project.$d$id.queue.__PAGE__.txt +3 -3
- package/out/project/_/queue/__next.project.$d$id.queue.txt +3 -3
- package/out/project/_/queue/__next.project.$d$id.txt +3 -3
- package/out/project/_/queue/__next.project.txt +3 -3
- package/out/project/_/queue.html +1 -1
- package/out/project/_/queue.txt +14 -14
- package/out/project/_.html +1 -1
- package/out/project/_.txt +14 -14
- package/out/settings/__next._full.txt +14 -14
- package/out/settings/__next._head.txt +4 -4
- package/out/settings/__next._index.txt +8 -8
- package/out/settings/__next._tree.txt +2 -2
- package/out/settings/__next.settings.__PAGE__.txt +3 -3
- package/out/settings/__next.settings.txt +3 -3
- package/out/settings.html +1 -1
- package/out/settings.txt +14 -14
- package/out/setup/__next._full.txt +14 -14
- package/out/setup/__next._head.txt +4 -4
- package/out/setup/__next._index.txt +8 -8
- package/out/setup/__next._tree.txt +2 -2
- package/out/setup/__next.setup.__PAGE__.txt +3 -3
- package/out/setup/__next.setup.txt +3 -3
- package/out/setup.html +1 -1
- package/out/setup.txt +14 -14
- package/package.json +4 -2
- package/server/ac-restore.js +128 -0
- package/server/bridges/discord.js +183 -0
- package/server/bridges/telegram.js +210 -0
- package/server/config.js +4 -60
- package/server/file-chat.js +318 -0
- package/server/index.js +173 -1286
- package/server/install-agentchattr.js +3 -284
- package/server/mcp-chat-shim.js +171 -0
- package/server/migrate-ac.js +158 -0
- package/server/pty-dispatcher.js +188 -0
- package/server/routes.js +149 -1397
- package/templates/CLAUDE.md +2 -2
- package/templates/OVERNIGHT-QUEUE.md +1 -1
- package/templates/seeds/butler.CLAUDE.md +30 -62
- package/templates/seeds/dev.AGENTS.md +10 -1
- package/templates/seeds/head.AGENTS.md +3 -3
- package/templates/seeds/re1.AGENTS.md +3 -3
- package/templates/seeds/re2.AGENTS.md +3 -3
- package/bridges/discord/__pycache__/discord_bridge.cpython-314.pyc +0 -0
- package/bridges/discord/discord_bridge.py +0 -666
- package/bridges/discord/requirements.txt +0 -2
- package/out/_next/static/chunks/0_bb~2.5h2ntm.css +0 -2
- package/out/_next/static/chunks/0makcdqkwobp6.js +0 -25
- package/out/_next/static/chunks/0uz5svjlo9dwl.js +0 -1
- package/out/_next/static/chunks/0zahstmgdrpy5.js +0 -1
- package/out/_next/static/chunks/0zfotsowwll1x.js +0 -2
- package/server/__tests__/bridge-auto-stop-guard.test.js +0 -134
- package/server/__tests__/rate-limit-handling.test.js +0 -168
- package/server/__tests__/scrub-secrets.test.js +0 -235
- package/server/__tests__/v1110-security-qa.test.js +0 -312
- package/server/agentchattr-registry.js +0 -188
- package/server/install-agentchattr.patchCrashTimeout.test.js +0 -71
- package/server/queue-watcher.js +0 -171
- package/server/queue-watcher.test.js +0 -64
- package/server/routes.batchProgress.test.js +0 -94
- package/server/routes.chatWsSend.test.js +0 -161
- package/server/routes.discordBridge.test.js +0 -80
- package/server/routes.parseActiveBatch.test.js +0 -88
- package/server/routes.telegramBridge.test.js +0 -241
- package/templates/config.toml +0 -72
- package/templates/wrapper.py +0 -70
- /package/out/_next/static/{K7A3YZrh4sLaRRP1-Lq7v → 479UD5Kit4YvCmtgO25VT}/_buildManifest.js +0 -0
- /package/out/_next/static/{K7A3YZrh4sLaRRP1-Lq7v → 479UD5Kit4YvCmtgO25VT}/_clientMiddlewareManifest.js +0 -0
- /package/out/_next/static/{K7A3YZrh4sLaRRP1-Lq7v → 479UD5Kit4YvCmtgO25VT}/_ssgManifest.js +0 -0
package/server/index.js
CHANGED
|
@@ -6,21 +6,23 @@ const os = require("os");
|
|
|
6
6
|
const { WebSocketServer, WebSocket } = require("ws");
|
|
7
7
|
const pty = require("node-pty");
|
|
8
8
|
const { spawn } = require("child_process");
|
|
9
|
-
const { readConfig, resolveAgentCwd, resolveAgentCommand,
|
|
9
|
+
const { readConfig, resolveAgentCwd, resolveAgentCommand, CONFIG_PATH, ensureSecureDir, writeSecureFile, writeConfig } = require("./config");
|
|
10
10
|
const routes = require("./routes");
|
|
11
|
-
const
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
projectAgentchattrConfigPath,
|
|
15
|
-
} = routes;
|
|
16
|
-
const { waitForAgentChattrReady, registerAgent, registerAgentWithRetry, deregisterAgent, startHeartbeat, stopHeartbeat } = require("./agentchattr-registry");
|
|
17
|
-
const { patchAgentchattrCss, patchCrashTimeout } = require("./install-agentchattr");
|
|
18
|
-
const { startQueueWatcher, stopQueueWatcher } = require("./queue-watcher");
|
|
11
|
+
const fileChat = require("./file-chat");
|
|
12
|
+
const { dispatchToAgentPTY, cleanupSession: cleanupPtyDispatcher } = require("./pty-dispatcher");
|
|
13
|
+
const { runAcMigration } = require("./migrate-ac");
|
|
19
14
|
|
|
20
15
|
const net = require("net");
|
|
21
16
|
const config = readConfig();
|
|
22
17
|
const PORT = config.port || 8400;
|
|
23
18
|
|
|
19
|
+
function emitSystemMessage(projectId, text) {
|
|
20
|
+
try {
|
|
21
|
+
if (routes.getProjectChatMode(projectId) !== "file") return;
|
|
22
|
+
fileChat.appendMessage(projectId, { sender: "system", type: "system", text });
|
|
23
|
+
} catch {}
|
|
24
|
+
}
|
|
25
|
+
|
|
24
26
|
const app = express();
|
|
25
27
|
// #412 / quadwork#279: bump the global JSON body limit to 10mb so
|
|
26
28
|
// POST /api/project-history can accept full chat exports. The
|
|
@@ -33,6 +35,14 @@ app.use(express.json({ limit: "10mb" }));
|
|
|
33
35
|
// --- Mount migrated API routes (from Next.js) ---
|
|
34
36
|
app.use(routes);
|
|
35
37
|
|
|
38
|
+
// #730: wire PTY injection dispatcher into the chat route
|
|
39
|
+
routes.setPtyDispatchCallback((projectId, msg) => {
|
|
40
|
+
dispatchToAgentPTY(projectId, msg, agentSessions, {
|
|
41
|
+
isLoopGuardPaused: fileChat.isLoopGuardPaused,
|
|
42
|
+
safeWrite,
|
|
43
|
+
});
|
|
44
|
+
});
|
|
45
|
+
|
|
36
46
|
const server = http.createServer(app);
|
|
37
47
|
|
|
38
48
|
// --- REST endpoints ---
|
|
@@ -163,9 +173,6 @@ app.get("/api/caffeinate/status", (_req, res) => {
|
|
|
163
173
|
// PTY (term) is the source of truth for "running". WS is optional (attaches to view terminal).
|
|
164
174
|
const agentSessions = new Map();
|
|
165
175
|
|
|
166
|
-
// AgentChattr server processes — per-project (key = projectId)
|
|
167
|
-
const chattrProcesses = new Map();
|
|
168
|
-
|
|
169
176
|
// #631: Butler session — single global PTY (not per-project, no AC integration)
|
|
170
177
|
let butlerSession = { term: null, viewers: new Set(), viewerDims: new Map(), lastDims: null, state: "stopped", error: null, scrollback: Buffer.alloc(0) };
|
|
171
178
|
|
|
@@ -317,6 +324,27 @@ function writeMcpConfigFile(projectId, agentId, mcpHttpPort, token) {
|
|
|
317
324
|
return filePath;
|
|
318
325
|
}
|
|
319
326
|
|
|
327
|
+
function writeFileChatMcpConfig(projectId, agentId, serverPort) {
|
|
328
|
+
const os = require("os");
|
|
329
|
+
const crypto = require("crypto");
|
|
330
|
+
const configDir = path.join(os.homedir(), ".quadwork", projectId);
|
|
331
|
+
ensureSecureDir(configDir);
|
|
332
|
+
const filePath = path.join(configDir, `mcp-${agentId}.json`);
|
|
333
|
+
const shimPath = path.join(__dirname, "mcp-chat-shim.js");
|
|
334
|
+
const token = crypto.randomBytes(16).toString("hex");
|
|
335
|
+
fileChat.registerShimToken(projectId, agentId, token);
|
|
336
|
+
const config = {
|
|
337
|
+
mcpServers: {
|
|
338
|
+
chat: {
|
|
339
|
+
command: "node",
|
|
340
|
+
args: [shimPath, "--project", projectId, "--agent", agentId, "--port", String(serverPort), "--token", token],
|
|
341
|
+
},
|
|
342
|
+
},
|
|
343
|
+
};
|
|
344
|
+
writeSecureFile(filePath, JSON.stringify(config, null, 2));
|
|
345
|
+
return { filePath, token };
|
|
346
|
+
}
|
|
347
|
+
|
|
320
348
|
/**
|
|
321
349
|
* Build extra launch args for an agent (permission flags + MCP injection).
|
|
322
350
|
* Async because Codex proxy_flag mode needs to await proxy startup.
|
|
@@ -324,16 +352,12 @@ function writeMcpConfigFile(projectId, agentId, mcpHttpPort, token) {
|
|
|
324
352
|
async function buildAgentArgs(projectId, agentId) {
|
|
325
353
|
const cfg = readConfig();
|
|
326
354
|
const project = cfg.projects?.find((p) => p.id === projectId);
|
|
327
|
-
if (!project) return { args: []
|
|
355
|
+
if (!project) return { args: [] };
|
|
328
356
|
|
|
329
357
|
const agentCfg = project.agents?.[agentId] || {};
|
|
330
358
|
const command = agentCfg.command || "claude";
|
|
331
|
-
const cliBase = command.split("/").pop().split(" ")[0];
|
|
359
|
+
const cliBase = command.split("/").pop().split(" ")[0];
|
|
332
360
|
const args = [];
|
|
333
|
-
let acRegistrationName = null;
|
|
334
|
-
let acServerPort = null;
|
|
335
|
-
let acRegistrationToken = null;
|
|
336
|
-
let acInjectMode = null;
|
|
337
361
|
|
|
338
362
|
// Permission bypass flags
|
|
339
363
|
if (agentCfg.auto_approve !== false) {
|
|
@@ -367,93 +391,22 @@ async function buildAgentArgs(projectId, agentId) {
|
|
|
367
391
|
}
|
|
368
392
|
}
|
|
369
393
|
|
|
370
|
-
// MCP config injection
|
|
371
|
-
const
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
const
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
// (git clone + venv + pip install) before it can bind a port.
|
|
384
|
-
const acReady = await waitForAgentChattrReady(acServerPort, 30000);
|
|
385
|
-
if (!acReady) {
|
|
386
|
-
console.warn(`[#565] Agent ${agentId}: AC not reachable on port ${acServerPort} after 30s. Spawning without chat integration.`);
|
|
387
|
-
// #565: preserve acServerPort and acInjectMode so deferred
|
|
388
|
-
// recovery in spawnAgentPty can retry registration later.
|
|
389
|
-
return { args, acRegistrationName: null, acServerPort, acRegistrationToken: null, acInjectMode: injectMode, acMcpHttpPort: mcpHttpPort || null };
|
|
390
|
-
}
|
|
391
|
-
// #242: best-effort deregister any stale registration of the
|
|
392
|
-
// canonical name (left over by a crashed previous QuadWork
|
|
393
|
-
// session) so the fresh register lands at slot 1 instead of
|
|
394
|
-
// head-2 / re2-2. We need the previous agent's bearer
|
|
395
|
-
// token because app.py:2123 requires authenticated agent
|
|
396
|
-
// session for family names — load it from disk (persisted
|
|
397
|
-
// across restarts). Failures are non-fatal.
|
|
398
|
-
const stalePersistedToken = readPersistedAgentToken(projectId, agentId);
|
|
399
|
-
if (stalePersistedToken) {
|
|
400
|
-
await deregisterAgent(acServerPort, agentId, stalePersistedToken).catch(() => {});
|
|
401
|
-
clearPersistedAgentToken(projectId, agentId);
|
|
402
|
-
}
|
|
403
|
-
// #478: force-replace so AC expires any ghost slots for this base
|
|
404
|
-
// #565: retry with backoff and degrade gracefully if AC is not ready
|
|
405
|
-
const registration = await registerAgentWithRetry(acServerPort, agentId, agentCfg.display_name || null, { force: true });
|
|
406
|
-
if (!registration) {
|
|
407
|
-
console.warn(`[#565] Agent ${agentId}: AC registration failed after retries (${registerAgent.lastError}). Spawning without chat integration.`);
|
|
408
|
-
} else {
|
|
409
|
-
acRegistrationName = registration.name;
|
|
410
|
-
acRegistrationToken = registration.token;
|
|
411
|
-
writePersistedAgentToken(projectId, agentId, registration.token);
|
|
412
|
-
const mcpConfigPath = writeMcpConfigFile(projectId, agentId, mcpHttpPort, registration.token);
|
|
413
|
-
const flag = agentCfg.mcp_flag || "--mcp-config";
|
|
414
|
-
args.push(flag, mcpConfigPath);
|
|
415
|
-
}
|
|
416
|
-
} else if (injectMode === "proxy_flag") {
|
|
417
|
-
// Codex: register with AgentChattr first (#240) so the proxy
|
|
418
|
-
// injects a real per-agent token, not the global session token.
|
|
419
|
-
// Resolve via resolveProjectChattr so legacy/global-config
|
|
420
|
-
// projects without a per-project agentchattr_url still work.
|
|
421
|
-
const chattrInfo = resolveProjectChattr(projectId);
|
|
422
|
-
acServerPort = Number(new URL(chattrInfo.url).port) || 8300;
|
|
423
|
-
// #565: extend timeout to 30s for first-setup scenario
|
|
424
|
-
const acReady = await waitForAgentChattrReady(acServerPort, 30000);
|
|
425
|
-
if (!acReady) {
|
|
426
|
-
console.warn(`[#565] Agent ${agentId}: AC not reachable on port ${acServerPort} after 30s. Spawning without chat integration.`);
|
|
427
|
-
// #565: preserve acServerPort and acInjectMode so deferred
|
|
428
|
-
// recovery in spawnAgentPty can retry registration later.
|
|
429
|
-
return { args, acRegistrationName: null, acServerPort, acRegistrationToken: null, acInjectMode: injectMode, acMcpHttpPort: mcpHttpPort || null };
|
|
430
|
-
}
|
|
431
|
-
// #242: best-effort deregister stale canonical name first using
|
|
432
|
-
// the persisted bearer token from a previous session.
|
|
433
|
-
const stalePersistedToken = readPersistedAgentToken(projectId, agentId);
|
|
434
|
-
if (stalePersistedToken) {
|
|
435
|
-
await deregisterAgent(acServerPort, agentId, stalePersistedToken).catch(() => {});
|
|
436
|
-
clearPersistedAgentToken(projectId, agentId);
|
|
437
|
-
}
|
|
438
|
-
// #478: force-replace so AC expires any ghost slots for this base
|
|
439
|
-
// #565: retry with backoff and degrade gracefully if AC is not ready
|
|
440
|
-
const registration = await registerAgentWithRetry(acServerPort, agentId, agentCfg.display_name || null, { force: true });
|
|
441
|
-
if (!registration) {
|
|
442
|
-
console.warn(`[#565] Agent ${agentId}: AC registration failed after retries (${registerAgent.lastError}). Spawning without chat integration.`);
|
|
443
|
-
} else {
|
|
444
|
-
acRegistrationName = registration.name;
|
|
445
|
-
acRegistrationToken = registration.token;
|
|
446
|
-
writePersistedAgentToken(projectId, agentId, registration.token);
|
|
447
|
-
const upstreamUrl = `http://127.0.0.1:${mcpHttpPort}`;
|
|
448
|
-
const proxyUrl = await startMcpProxy(projectId, agentId, upstreamUrl, registration.token);
|
|
449
|
-
if (proxyUrl) {
|
|
450
|
-
args.push("-c", `mcp_servers.agentchattr.url="${proxyUrl}"`);
|
|
451
|
-
}
|
|
452
|
-
}
|
|
453
|
-
}
|
|
394
|
+
// MCP config injection — file-chat shim
|
|
395
|
+
const injectMode = agentCfg.mcp_inject || (cliBase === "codex" ? "proxy_flag" : cliBase === "gemini" ? "env" : "flag");
|
|
396
|
+
if (injectMode === "flag") {
|
|
397
|
+
const { filePath: mcpConfigPath } = writeFileChatMcpConfig(projectId, agentId, PORT);
|
|
398
|
+
const mcpFlag = agentCfg.mcp_flag || "--mcp-config";
|
|
399
|
+
args.push(mcpFlag, mcpConfigPath);
|
|
400
|
+
} else if (injectMode === "proxy_flag") {
|
|
401
|
+
const { token: shimToken } = writeFileChatMcpConfig(projectId, agentId, PORT);
|
|
402
|
+
const shimPath = path.join(__dirname, "mcp-chat-shim.js");
|
|
403
|
+
args.push(
|
|
404
|
+
"-c", `mcp_servers.chat.command="node"`,
|
|
405
|
+
"-c", `mcp_servers.chat.args=["${shimPath}","--project","${projectId}","--agent","${agentId}","--port","${PORT}","--token","${shimToken}"]`,
|
|
406
|
+
);
|
|
454
407
|
}
|
|
455
|
-
|
|
456
|
-
return { args
|
|
408
|
+
// env mode (Gemini) handled in buildAgentEnv
|
|
409
|
+
return { args };
|
|
457
410
|
}
|
|
458
411
|
|
|
459
412
|
/**
|
|
@@ -470,18 +423,19 @@ function buildAgentEnv(projectId, agentId) {
|
|
|
470
423
|
const env = {};
|
|
471
424
|
|
|
472
425
|
// Gemini: inject MCP via env var
|
|
473
|
-
if (cliBase === "gemini"
|
|
426
|
+
if (cliBase === "gemini") {
|
|
474
427
|
const os = require("os");
|
|
475
428
|
const configDir = path.join(os.homedir(), ".quadwork", projectId);
|
|
476
429
|
ensureSecureDir(configDir);
|
|
477
430
|
const settingsPath = path.join(configDir, `mcp-${agentId}-settings.json`);
|
|
478
|
-
|
|
431
|
+
|
|
432
|
+
const { token: shimToken } = writeFileChatMcpConfig(projectId, agentId, PORT);
|
|
433
|
+
const shimPath = path.join(__dirname, "mcp-chat-shim.js");
|
|
479
434
|
const settings = {
|
|
480
435
|
mcpServers: {
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
...(project.agentchattr_token ? { headers: { Authorization: `Bearer ${project.agentchattr_token}` } } : {}),
|
|
436
|
+
chat: {
|
|
437
|
+
command: "node",
|
|
438
|
+
args: [shimPath, "--project", projectId, "--agent", agentId, "--port", String(PORT), "--token", shimToken],
|
|
485
439
|
},
|
|
486
440
|
},
|
|
487
441
|
};
|
|
@@ -492,76 +446,8 @@ function buildAgentEnv(projectId, agentId) {
|
|
|
492
446
|
return env;
|
|
493
447
|
}
|
|
494
448
|
|
|
495
|
-
/**
|
|
496
|
-
* #394 / quadwork#253: recover from a heartbeat 409 (AgentChattr was
|
|
497
|
-
* restarted, in-memory registry wiped, our token is now stale). Mirrors
|
|
498
|
-
* wrapper.py:732-741. Re-registers the running agent, swaps the
|
|
499
|
-
* tracked name/token on the live session so the heartbeat interval
|
|
500
|
-
* picks up the new credentials on its next tick, refreshes whichever
|
|
501
|
-
* MCP transport this agent uses (Claude config file vs Codex proxy),
|
|
502
|
-
* and restarts the queue watcher in case the assigned name changed
|
|
503
|
-
* (multi-instance slot bump).
|
|
504
|
-
*
|
|
505
|
-
* Best-effort: any failure here just means the next 5s heartbeat will
|
|
506
|
-
* fail again and we'll re-enter recovery — no tight retry loop because
|
|
507
|
-
* startHeartbeat guards re-entry with `recovering`.
|
|
508
|
-
*/
|
|
509
|
-
async function recoverFrom409(projectId, agentId, session) {
|
|
510
|
-
if (!session.acServerPort) return;
|
|
511
|
-
const cfg = readConfig();
|
|
512
|
-
const project = cfg.projects?.find((p) => p.id === projectId);
|
|
513
|
-
const agentCfg = project?.agents?.[agentId] || {};
|
|
514
|
-
// AC may need a moment to come back up after a restart — wait briefly.
|
|
515
|
-
await waitForAgentChattrReady(session.acServerPort, 10000);
|
|
516
|
-
|
|
517
|
-
// Best-effort cleanup of the stale registration on disk so the
|
|
518
|
-
// fresh register isn't shoved into a slot 2 by leftover state.
|
|
519
|
-
const stale = readPersistedAgentToken(projectId, agentId);
|
|
520
|
-
if (stale) {
|
|
521
|
-
await deregisterAgent(session.acServerPort, agentId, stale).catch(() => {});
|
|
522
|
-
clearPersistedAgentToken(projectId, agentId);
|
|
523
|
-
}
|
|
524
|
-
|
|
525
|
-
// #478: force-replace so AC expires any ghost slots for this base
|
|
526
|
-
const replacement = await registerAgent(session.acServerPort, agentId, agentCfg.display_name || null, { force: true });
|
|
527
|
-
if (!replacement) return;
|
|
528
|
-
|
|
529
|
-
const previousName = session.acRegistrationName;
|
|
530
|
-
session.acRegistrationName = replacement.name;
|
|
531
|
-
session.acRegistrationToken = replacement.token;
|
|
532
|
-
writePersistedAgentToken(projectId, agentId, replacement.token);
|
|
533
|
-
|
|
534
|
-
// Refresh whichever MCP transport this agent uses so subsequent
|
|
535
|
-
// tool calls (and the queue-watcher's `mcp read` injections) hit
|
|
536
|
-
// AC with the new bearer token instead of the now-rejected one.
|
|
537
|
-
if (session.acInjectMode === "flag" && session.acMcpHttpPort) {
|
|
538
|
-
try { writeMcpConfigFile(projectId, agentId, session.acMcpHttpPort, replacement.token); } catch {}
|
|
539
|
-
} else if (session.acInjectMode === "proxy_flag") {
|
|
540
|
-
// Codex is pinned to the original ephemeral proxy URL, so we
|
|
541
|
-
// can't tear the listener down — mutate the token in place.
|
|
542
|
-
try { updateMcpProxyToken(projectId, agentId, replacement.token); } catch {}
|
|
543
|
-
}
|
|
544
|
-
|
|
545
|
-
// If the assigned name changed (e.g. multi-instance slot collision)
|
|
546
|
-
// the queue watcher is now polling the wrong file. Restart it
|
|
547
|
-
// against the new name so chat reaches the right agent.
|
|
548
|
-
if (replacement.name !== previousName && session.term) {
|
|
549
|
-
if (session.queueWatcherHandle) {
|
|
550
|
-
stopQueueWatcher(session.queueWatcherHandle);
|
|
551
|
-
session.queueWatcherHandle = null;
|
|
552
|
-
}
|
|
553
|
-
try {
|
|
554
|
-
const { dir: acDir } = resolveProjectChattr(projectId);
|
|
555
|
-
if (acDir) {
|
|
556
|
-
const dataDir = path.join(acDir, "data");
|
|
557
|
-
session.queueWatcherHandle = startQueueWatcher(dataDir, replacement.name, session.term);
|
|
558
|
-
}
|
|
559
|
-
} catch {}
|
|
560
|
-
}
|
|
561
|
-
}
|
|
562
|
-
|
|
563
449
|
// Helper: spawn a PTY for a project/agent and register in agentSessions
|
|
564
|
-
async function spawnAgentPty(project, agent) {
|
|
450
|
+
async function spawnAgentPty(project, agent, opts = {}) {
|
|
565
451
|
const key = `${project}/${agent}`;
|
|
566
452
|
|
|
567
453
|
const cwd = resolveAgentCwd(project, agent);
|
|
@@ -593,13 +479,7 @@ async function spawnAgentPty(project, agent) {
|
|
|
593
479
|
lastDims: null,
|
|
594
480
|
state: "running",
|
|
595
481
|
error: null,
|
|
596
|
-
|
|
597
|
-
acServerPort: built.acServerPort,
|
|
598
|
-
acRegistrationToken: built.acRegistrationToken,
|
|
599
|
-
acInjectMode: built.acInjectMode,
|
|
600
|
-
acMcpHttpPort: built.acMcpHttpPort,
|
|
601
|
-
acHeartbeatHandle: null,
|
|
602
|
-
queueWatcherHandle: null,
|
|
482
|
+
lastOutputAt: Date.now(),
|
|
603
483
|
// #418: ring buffer of recent PTY output so reconnecting WS
|
|
604
484
|
// clients see the terminal state instead of a blank panel.
|
|
605
485
|
// #538: scrollback is scrubbed of likely secrets before replay.
|
|
@@ -607,11 +487,16 @@ async function spawnAgentPty(project, agent) {
|
|
|
607
487
|
};
|
|
608
488
|
agentSessions.set(key, session);
|
|
609
489
|
|
|
490
|
+
if (!opts.suppressLifecycleMsg) {
|
|
491
|
+
emitSystemMessage(project, `${agent} joined`);
|
|
492
|
+
}
|
|
493
|
+
|
|
610
494
|
// #418: capture PTY output into the scrollback ring buffer (64KB).
|
|
611
495
|
// This runs independently of WS — even when no client is connected,
|
|
612
496
|
// the buffer accumulates so the next connect gets replay.
|
|
613
497
|
const SCROLLBACK_SIZE = 64 * 1024;
|
|
614
498
|
term.onData((data) => {
|
|
499
|
+
session.lastOutputAt = Date.now();
|
|
615
500
|
const chunk = Buffer.from(data);
|
|
616
501
|
session.scrollback = Buffer.concat([session.scrollback, chunk]);
|
|
617
502
|
if (session.scrollback.length > SCROLLBACK_SIZE) {
|
|
@@ -619,72 +504,10 @@ async function spawnAgentPty(project, agent) {
|
|
|
619
504
|
}
|
|
620
505
|
});
|
|
621
506
|
|
|
622
|
-
// #391 / quadwork#250: keep this agent alive in AgentChattr by
|
|
623
|
-
// POSTing /api/heartbeat/{name} every 5s. Without it, AC's 60s
|
|
624
|
-
// crash-detection window deregisters the agent and chat messages
|
|
625
|
-
// never reach it. Mirrors wrapper.py:_heartbeat (lines 715-748).
|
|
626
|
-
if (session.acRegistrationName && session.acServerPort && session.acRegistrationToken) {
|
|
627
|
-
// #394 / quadwork#253: pass getters (not raw values) so the 409
|
|
628
|
-
// recovery path below can swap acRegistrationName/Token in place
|
|
629
|
-
// and the very next heartbeat tick uses the replacement
|
|
630
|
-
// credentials without us having to tear down + restart the
|
|
631
|
-
// interval.
|
|
632
|
-
session.acHeartbeatHandle = startHeartbeat(
|
|
633
|
-
session.acServerPort,
|
|
634
|
-
() => session.acRegistrationName,
|
|
635
|
-
() => session.acRegistrationToken,
|
|
636
|
-
{ onConflict: () => recoverFrom409(project, agent, session) },
|
|
637
|
-
);
|
|
638
|
-
}
|
|
639
|
-
|
|
640
|
-
// #393 / quadwork#251: queue watcher — the actual mechanism by
|
|
641
|
-
// which agents pick up chat. Without this an agent can be
|
|
642
|
-
// registered + heartbeating yet still never respond, because
|
|
643
|
-
// AgentChattr only writes to {data_dir}/{name}_queue.jsonl and
|
|
644
|
-
// expects the agent side to poll + inject `mcp read`.
|
|
645
|
-
if (session.acRegistrationName && session.term) {
|
|
646
|
-
try {
|
|
647
|
-
const { dir: acDir } = resolveProjectChattr(project);
|
|
648
|
-
if (acDir) {
|
|
649
|
-
const dataDir = path.join(acDir, "data");
|
|
650
|
-
session.queueWatcherHandle = startQueueWatcher(
|
|
651
|
-
dataDir,
|
|
652
|
-
session.acRegistrationName,
|
|
653
|
-
session.term,
|
|
654
|
-
);
|
|
655
|
-
}
|
|
656
|
-
} catch {
|
|
657
|
-
// best-effort — failure here just means no chat injection
|
|
658
|
-
}
|
|
659
|
-
}
|
|
660
|
-
|
|
661
|
-
// #565: deferred restart — if the agent spawned without AC
|
|
662
|
-
// registration (AC wasn't ready or registration failed), wait for
|
|
663
|
-
// AC to come up then stop + respawn the agent so it gets the full
|
|
664
|
-
// MCP CLI args (--mcp-config / -c mcp_servers...url) that can only
|
|
665
|
-
// be set at process launch time.
|
|
666
|
-
if (!session.acRegistrationName && session.acServerPort && session.acInjectMode) {
|
|
667
|
-
const deferredRestart = async () => {
|
|
668
|
-
const ready = await waitForAgentChattrReady(session.acServerPort, 60000);
|
|
669
|
-
if (!ready) {
|
|
670
|
-
// #572: log timeout so operators know the health monitor will
|
|
671
|
-
// handle recovery when AC eventually comes up.
|
|
672
|
-
console.log(`[#565] Agent ${agent}: AC not reachable after 60s — health monitor will restart agent when AC recovers.`);
|
|
673
|
-
return;
|
|
674
|
-
}
|
|
675
|
-
// Guard: agent may have been stopped manually while we waited.
|
|
676
|
-
const current = agentSessions.get(key);
|
|
677
|
-
if (!current || !current.term || current.state !== "running") return;
|
|
678
|
-
console.log(`[#565] Agent ${agent}: AC is now reachable — restarting agent to gain chat integration.`);
|
|
679
|
-
await stopAgentSession(key);
|
|
680
|
-
await spawnAgentPty(project, agent);
|
|
681
|
-
};
|
|
682
|
-
deferredRestart().catch(() => {});
|
|
683
|
-
}
|
|
684
|
-
|
|
685
507
|
term.onExit(({ exitCode }) => {
|
|
686
508
|
const current = agentSessions.get(key);
|
|
687
509
|
if (current && current.term === term) {
|
|
510
|
+
cleanupPtyDispatcher(key);
|
|
688
511
|
current.state = "stopped";
|
|
689
512
|
current.error = exitCode ? `exit:${exitCode}` : null;
|
|
690
513
|
current.term = null;
|
|
@@ -692,27 +515,6 @@ async function spawnAgentPty(project, agent) {
|
|
|
692
515
|
if (v.readyState <= 1) v.close(1000, `exited:${exitCode}`);
|
|
693
516
|
}
|
|
694
517
|
current.viewers.clear();
|
|
695
|
-
// #391 / quadwork#250: a crashed PTY must also clear its
|
|
696
|
-
// heartbeat interval (otherwise it leaks and a later /start
|
|
697
|
-
// double-registers) and free the AgentChattr slot (otherwise
|
|
698
|
-
// the agent stays falsely `active` forever and the next
|
|
699
|
-
// register lands at slot 2). Deregister is best-effort.
|
|
700
|
-
if (current.acHeartbeatHandle) {
|
|
701
|
-
stopHeartbeat(current.acHeartbeatHandle);
|
|
702
|
-
current.acHeartbeatHandle = null;
|
|
703
|
-
}
|
|
704
|
-
if (current.queueWatcherHandle) {
|
|
705
|
-
stopQueueWatcher(current.queueWatcherHandle);
|
|
706
|
-
current.queueWatcherHandle = null;
|
|
707
|
-
}
|
|
708
|
-
if (current.acRegistrationName && current.acServerPort) {
|
|
709
|
-
deregisterAgent(current.acServerPort, current.acRegistrationName).catch(() => {});
|
|
710
|
-
if (current.projectId && current.agentId) {
|
|
711
|
-
try { clearPersistedAgentToken(current.projectId, current.agentId); } catch {}
|
|
712
|
-
}
|
|
713
|
-
current.acRegistrationName = null;
|
|
714
|
-
current.acRegistrationToken = null;
|
|
715
|
-
}
|
|
716
518
|
}
|
|
717
519
|
});
|
|
718
520
|
|
|
@@ -723,16 +525,16 @@ async function spawnAgentPty(project, agent) {
|
|
|
723
525
|
}
|
|
724
526
|
}
|
|
725
527
|
|
|
726
|
-
// Helper: stop an agent session — kill PTY, close WS, deregister.
|
|
727
|
-
// Async because deregister must complete before a restart re-registers,
|
|
728
|
-
// otherwise the old slot stays occupied and a fresh register lands at
|
|
729
|
-
// head-2 instead of slot 1 (#241).
|
|
730
528
|
async function stopAgentSession(key) {
|
|
731
529
|
const session = agentSessions.get(key);
|
|
732
530
|
if (!session) {
|
|
733
531
|
agentSessions.set(key, { projectId: null, agentId: null, term: null, viewers: new Set(), viewerDims: new Map(), lastDims: null, state: "stopped", error: null });
|
|
734
532
|
return;
|
|
735
533
|
}
|
|
534
|
+
if (session.projectId && session.agentId && !session._suppressLifecycleMsg) {
|
|
535
|
+
emitSystemMessage(session.projectId, `${session.agentId} left`);
|
|
536
|
+
}
|
|
537
|
+
cleanupPtyDispatcher(key);
|
|
736
538
|
if (session.term) {
|
|
737
539
|
try { session.term.kill(); } catch {}
|
|
738
540
|
session.term = null;
|
|
@@ -743,33 +545,6 @@ async function stopAgentSession(key) {
|
|
|
743
545
|
session.viewers.clear();
|
|
744
546
|
session.state = "stopped";
|
|
745
547
|
session.error = null;
|
|
746
|
-
// Stop heartbeat before deregister so we don't race a final POST
|
|
747
|
-
// against AgentChattr removing the name (#391 / quadwork#250).
|
|
748
|
-
if (session.acHeartbeatHandle) {
|
|
749
|
-
stopHeartbeat(session.acHeartbeatHandle);
|
|
750
|
-
session.acHeartbeatHandle = null;
|
|
751
|
-
}
|
|
752
|
-
// Stop queue watcher (#393 / quadwork#251) — the PTY is gone,
|
|
753
|
-
// injecting into a dead term would throw on the next tick.
|
|
754
|
-
if (session.queueWatcherHandle) {
|
|
755
|
-
stopQueueWatcher(session.queueWatcherHandle);
|
|
756
|
-
session.queueWatcherHandle = null;
|
|
757
|
-
}
|
|
758
|
-
// Best-effort deregister from AgentChattr (#241) so the slot frees
|
|
759
|
-
// and the next register lands at slot 1 instead of head-2.
|
|
760
|
-
if (session.acRegistrationName && session.acServerPort) {
|
|
761
|
-
try {
|
|
762
|
-
await deregisterAgent(session.acServerPort, session.acRegistrationName);
|
|
763
|
-
} catch {
|
|
764
|
-
// best-effort — failures are non-fatal
|
|
765
|
-
}
|
|
766
|
-
if (session.projectId && session.agentId) {
|
|
767
|
-
clearPersistedAgentToken(session.projectId, session.agentId);
|
|
768
|
-
}
|
|
769
|
-
session.acRegistrationName = null;
|
|
770
|
-
session.acRegistrationToken = null;
|
|
771
|
-
}
|
|
772
|
-
// Clean up MCP auth proxy if running
|
|
773
548
|
const [projectId, agentId] = key.split("/");
|
|
774
549
|
if (projectId && agentId) stopMcpProxy(projectId, agentId);
|
|
775
550
|
}
|
|
@@ -779,487 +554,15 @@ app.get("/api/agents", (_req, res) => {
|
|
|
779
554
|
for (const [key, session] of agentSessions) {
|
|
780
555
|
agents[key] = { state: session.state, error: session.error || null };
|
|
781
556
|
}
|
|
782
|
-
for (const [pid, proc] of chattrProcesses) {
|
|
783
|
-
agents[`_agentchattr/${pid}`] = { state: proc.state, error: proc.error };
|
|
784
|
-
}
|
|
785
557
|
res.json(agents);
|
|
786
558
|
});
|
|
787
559
|
|
|
788
|
-
//
|
|
789
|
-
// before any AgentChattr restart. Defense-in-depth against
|
|
790
|
-
// destructive ops like /clear that rewrite AC's JSONL log in place
|
|
791
|
-
// — per #303 the log itself IS persistent across normal restarts,
|
|
792
|
-
// so the snapshot's job is to give the operator a point-in-time
|
|
793
|
-
// rollback if the log gets clobbered, not to prevent history loss
|
|
794
|
-
// on ordinary lifecycle events.
|
|
795
|
-
//
|
|
796
|
-
// Snapshot contents = the same envelope GET /api/project-history
|
|
797
|
-
// returns, so an operator (or a future "restore" button) can feed
|
|
798
|
-
// the file straight into POST /api/project-history for replay.
|
|
799
|
-
const HISTORY_SNAPSHOT_LIMIT = 5;
|
|
800
|
-
|
|
801
|
-
async function snapshotProjectHistory(projectId) {
|
|
802
|
-
try {
|
|
803
|
-
const snapDir = path.join(require("os").homedir(), ".quadwork", projectId, "history-snapshots");
|
|
804
|
-
ensureSecureDir(snapDir);
|
|
805
|
-
const res = await fetch(`http://127.0.0.1:${PORT}/api/project-history?project=${encodeURIComponent(projectId)}`, {
|
|
806
|
-
signal: AbortSignal.timeout(30000),
|
|
807
|
-
});
|
|
808
|
-
if (!res.ok) {
|
|
809
|
-
console.warn(`[snapshot] ${projectId} history fetch returned ${res.status}; skipping snapshot`);
|
|
810
|
-
return false;
|
|
811
|
-
}
|
|
812
|
-
const text = await res.text();
|
|
813
|
-
const stamp = new Date().toISOString().replace(/[:.]/g, "-");
|
|
814
|
-
const outPath = path.join(snapDir, `${stamp}.json`);
|
|
815
|
-
fs.writeFileSync(outPath, text);
|
|
816
|
-
console.log(`[snapshot] ${projectId} → ${outPath}`);
|
|
817
|
-
// Prune to the newest HISTORY_SNAPSHOT_LIMIT files so the
|
|
818
|
-
// directory can't grow unbounded across weeks of restarts.
|
|
819
|
-
try {
|
|
820
|
-
const entries = fs.readdirSync(snapDir)
|
|
821
|
-
.filter((f) => f.endsWith(".json"))
|
|
822
|
-
.map((f) => ({ f, t: fs.statSync(path.join(snapDir, f)).mtimeMs }))
|
|
823
|
-
.sort((a, b) => b.t - a.t);
|
|
824
|
-
for (const old of entries.slice(HISTORY_SNAPSHOT_LIMIT)) {
|
|
825
|
-
try { fs.unlinkSync(path.join(snapDir, old.f)); } catch {}
|
|
826
|
-
}
|
|
827
|
-
} catch {
|
|
828
|
-
// non-fatal — stale snapshots just linger
|
|
829
|
-
}
|
|
830
|
-
return true;
|
|
831
|
-
} catch (err) {
|
|
832
|
-
console.warn(`[snapshot] ${projectId} snapshot failed: ${err.message || err}`);
|
|
833
|
-
return false;
|
|
834
|
-
}
|
|
835
|
-
}
|
|
836
|
-
|
|
837
|
-
// Per-project AgentChattr lifecycle: /api/agentchattr/:project/:action
|
|
838
|
-
// Backward compat: /api/agentchattr/:action uses first project
|
|
839
|
-
async function handleAgentChattr(req, res) {
|
|
840
|
-
let projectId, action;
|
|
841
|
-
if (req.params.action) {
|
|
842
|
-
projectId = req.params.projectOrAction;
|
|
843
|
-
action = req.params.action;
|
|
844
|
-
} else {
|
|
845
|
-
// Backward compat: single-param = action, use first project
|
|
846
|
-
action = req.params.projectOrAction;
|
|
847
|
-
const cfg = readConfig();
|
|
848
|
-
projectId = cfg.projects?.[0]?.id || "_default";
|
|
849
|
-
}
|
|
850
|
-
|
|
851
|
-
const { url: chattrUrl } = resolveProjectChattr(projectId);
|
|
852
|
-
const chattrPort = new URL(chattrUrl).port || "8300";
|
|
853
|
-
|
|
854
|
-
// Find per-project config.toml. Phase 2E / #181: prefer the
|
|
855
|
-
// per-project AgentChattr clone ROOT (where the web/CLI wizards now
|
|
856
|
-
// write it as of #184/#185 — and where run.py actually reads it from).
|
|
857
|
-
// Fall back to the legacy <working_dir>/agentchattr/config.toml for
|
|
858
|
-
// v1 setups that haven't been migrated yet (#188).
|
|
859
|
-
const cfg = readConfig();
|
|
860
|
-
const project = cfg.projects?.find((p) => p.id === projectId);
|
|
861
|
-
const { dir: resolvedAcDir } = resolveProjectChattr(projectId);
|
|
862
|
-
let projectConfigToml = null;
|
|
863
|
-
if (resolvedAcDir && fs.existsSync(path.join(resolvedAcDir, "config.toml"))) {
|
|
864
|
-
projectConfigToml = path.join(resolvedAcDir, "config.toml");
|
|
865
|
-
} else if (project?.working_dir) {
|
|
866
|
-
const legacyToml = path.join(project.working_dir, "agentchattr", "config.toml");
|
|
867
|
-
if (fs.existsSync(legacyToml)) projectConfigToml = legacyToml;
|
|
868
|
-
}
|
|
869
|
-
|
|
870
|
-
function getProc() {
|
|
871
|
-
return chattrProcesses.get(projectId) || { process: null, state: "stopped", error: null };
|
|
872
|
-
}
|
|
873
|
-
function setProc(val) {
|
|
874
|
-
chattrProcesses.set(projectId, val);
|
|
875
|
-
}
|
|
876
|
-
|
|
877
|
-
function regenerateConfigToml() {
|
|
878
|
-
// If project has a config.toml, update the port to match current config
|
|
879
|
-
if (!projectConfigToml || !fs.existsSync(projectConfigToml)) return;
|
|
880
|
-
try {
|
|
881
|
-
let content = fs.readFileSync(projectConfigToml, "utf-8");
|
|
882
|
-
content = content.replace(/^port = \d+/m, `port = ${chattrPort}`);
|
|
883
|
-
writeSecureFile(projectConfigToml, content);
|
|
884
|
-
} catch {}
|
|
885
|
-
}
|
|
886
|
-
|
|
887
|
-
async function spawnChattr() {
|
|
888
|
-
// Sync config.toml port before starting
|
|
889
|
-
regenerateConfigToml();
|
|
890
|
-
|
|
891
|
-
// Use project config.toml if available (isolated data dir + ports), otherwise fall back to --port
|
|
892
|
-
const extraArgs = (projectConfigToml && fs.existsSync(projectConfigToml))
|
|
893
|
-
? []
|
|
894
|
-
: ["--port", chattrPort];
|
|
895
|
-
|
|
896
|
-
// Resolve AgentChattr from its cloned directory
|
|
897
|
-
const { dir: acDir } = resolveProjectChattr(projectId);
|
|
898
|
-
// #394: backfill sender-overflow CSS/JS patch on every spawn so
|
|
899
|
-
// existing installs receive the fix without manual update.
|
|
900
|
-
patchAgentchattrCss(acDir);
|
|
901
|
-
const acSpawn = resolveChattrSpawn(acDir);
|
|
902
|
-
if (!acSpawn) {
|
|
903
|
-
setProc({ process: null, state: "error", error: `AgentChattr not installed. Clone it: git clone https://github.com/bcurts/agentchattr.git ${acDir}` });
|
|
904
|
-
return null;
|
|
905
|
-
}
|
|
906
|
-
|
|
907
|
-
// #569: redirect AC stdout/stderr to a log file so operators can
|
|
908
|
-
// diagnose startup failures. Append mode preserves restart history.
|
|
909
|
-
const acLogDir = path.join(os.homedir(), ".quadwork", projectId);
|
|
910
|
-
try { fs.mkdirSync(acLogDir, { recursive: true, mode: 0o700 }); } catch {}
|
|
911
|
-
const acLogPath = path.join(acLogDir, "agentchattr.log");
|
|
912
|
-
const acLogFd = fs.openSync(acLogPath, "a");
|
|
913
|
-
const child = spawn(acSpawn.command, [...acSpawn.args, ...extraArgs], {
|
|
914
|
-
cwd: acSpawn.cwd,
|
|
915
|
-
env: process.env,
|
|
916
|
-
stdio: ["ignore", acLogFd, acLogFd],
|
|
917
|
-
detached: true,
|
|
918
|
-
});
|
|
919
|
-
|
|
920
|
-
// Close our copy of the log fd — child inherits its own copy.
|
|
921
|
-
fs.closeSync(acLogFd);
|
|
922
|
-
|
|
923
|
-
// If pid is undefined, spawn failed
|
|
924
|
-
if (!child.pid) {
|
|
925
|
-
setProc({ process: null, state: "error", error: "Failed to start AgentChattr — check that Python venv is set up in " + acDir + ". Log: " + acLogPath });
|
|
926
|
-
child.on("error", () => {});
|
|
927
|
-
return null;
|
|
928
|
-
}
|
|
929
|
-
|
|
930
|
-
child.unref();
|
|
931
|
-
child.on("error", (err) => {
|
|
932
|
-
setProc({ process: null, state: "error", error: err.message });
|
|
933
|
-
});
|
|
934
|
-
child.on("exit", (code) => {
|
|
935
|
-
const cur = getProc();
|
|
936
|
-
if (cur.process === child) {
|
|
937
|
-
setProc({ process: null, state: "stopped", error: code ? `exit:${code}` : null });
|
|
938
|
-
}
|
|
939
|
-
});
|
|
940
|
-
// #580: wait for AC to actually bind the port before declaring success.
|
|
941
|
-
// On fast-start installs this resolves in 1-2s; prevents false-down
|
|
942
|
-
// detection on slow starts that triggered ghost agent cascades.
|
|
943
|
-
const ready = await waitForAgentChattrReady(chattrPort, 30000);
|
|
944
|
-
if (ready) {
|
|
945
|
-
setProc({ process: child, state: "running", error: null, runningSince: Date.now() });
|
|
946
|
-
return child;
|
|
947
|
-
} else {
|
|
948
|
-
setProc({ process: child, state: "error", error: "AgentChattr did not become ready within 30s" });
|
|
949
|
-
return null;
|
|
950
|
-
}
|
|
951
|
-
}
|
|
952
|
-
|
|
953
|
-
// #386: Kill any process listening on the AC port. Handles orphaned
|
|
954
|
-
// processes that survive QuadWork restarts (detached + unref'd spawns
|
|
955
|
-
// lose their tracked reference when the Node process recycles).
|
|
956
|
-
function killProcessOnPort(port, signal = "SIGTERM") {
|
|
957
|
-
try {
|
|
958
|
-
const pids = execFileSync("lsof", ["-ti", `TCP:${port}`, "-sTCP:LISTEN"], {
|
|
959
|
-
encoding: "utf-8",
|
|
960
|
-
timeout: 5000,
|
|
961
|
-
stdio: ["pipe", "pipe", "pipe"],
|
|
962
|
-
}).trim();
|
|
963
|
-
if (!pids) return;
|
|
964
|
-
for (const line of pids.split("\n")) {
|
|
965
|
-
const pid = parseInt(line, 10);
|
|
966
|
-
if (pid > 0) {
|
|
967
|
-
try { process.kill(pid, signal); } catch {}
|
|
968
|
-
}
|
|
969
|
-
}
|
|
970
|
-
} catch {
|
|
971
|
-
// lsof exits non-zero when no matching process — expected
|
|
972
|
-
}
|
|
973
|
-
}
|
|
974
|
-
|
|
975
|
-
// #386: Poll until the port is free or timeout expires.
|
|
976
|
-
function waitForPortFree(port, timeoutMs = 3000) {
|
|
977
|
-
const start = Date.now();
|
|
978
|
-
return new Promise((resolve) => {
|
|
979
|
-
function check() {
|
|
980
|
-
try {
|
|
981
|
-
execFileSync("lsof", ["-ti", `TCP:${port}`, "-sTCP:LISTEN"], {
|
|
982
|
-
encoding: "utf-8",
|
|
983
|
-
timeout: 2000,
|
|
984
|
-
stdio: ["pipe", "pipe", "pipe"],
|
|
985
|
-
});
|
|
986
|
-
// Still occupied — retry if within budget
|
|
987
|
-
if (Date.now() - start < timeoutMs) {
|
|
988
|
-
setTimeout(check, 200);
|
|
989
|
-
} else {
|
|
990
|
-
resolve(false);
|
|
991
|
-
}
|
|
992
|
-
} catch {
|
|
993
|
-
// lsof found nothing — port is free
|
|
994
|
-
resolve(true);
|
|
995
|
-
}
|
|
996
|
-
}
|
|
997
|
-
check();
|
|
998
|
-
});
|
|
999
|
-
}
|
|
1000
|
-
|
|
1001
|
-
if (action === "start") {
|
|
1002
|
-
const proc = getProc();
|
|
1003
|
-
if (proc.state === "running" && proc.process) {
|
|
1004
|
-
return res.json({ ok: true, state: "running", message: "Already running" });
|
|
1005
|
-
}
|
|
1006
|
-
// #401: validate AgentChattr is installed BEFORE killing anything on
|
|
1007
|
-
// the port. Without this guard, clicking Start when AC is missing
|
|
1008
|
-
// kills an unrelated process then fails with "not installed".
|
|
1009
|
-
const { dir: acDir } = resolveProjectChattr(projectId);
|
|
1010
|
-
const acSpawn = resolveChattrSpawn(acDir);
|
|
1011
|
-
if (!acSpawn) {
|
|
1012
|
-
const errMsg = `AgentChattr not installed. Clone it: git clone https://github.com/bcurts/agentchattr.git ${acDir}`;
|
|
1013
|
-
setProc({ process: null, state: "error", error: errMsg });
|
|
1014
|
-
return res.status(500).json({ ok: false, state: "error", error: errMsg });
|
|
1015
|
-
}
|
|
1016
|
-
|
|
1017
|
-
// #393: kill any orphaned process on the port before spawning
|
|
1018
|
-
// (same pattern as restart/stop from #386).
|
|
1019
|
-
killProcessOnPort(chattrPort);
|
|
1020
|
-
const portFree = await waitForPortFree(chattrPort, 3000);
|
|
1021
|
-
if (!portFree) {
|
|
1022
|
-
console.warn(`[agentchattr] ${projectId} port ${chattrPort} still occupied after 3s — spawning anyway`);
|
|
1023
|
-
}
|
|
1024
|
-
try {
|
|
1025
|
-
const child = await spawnChattr();
|
|
1026
|
-
if (!child) {
|
|
1027
|
-
const errProc = getProc();
|
|
1028
|
-
return res.status(500).json({ ok: false, state: "error", error: errProc.error || "Failed to start AgentChattr" });
|
|
1029
|
-
}
|
|
1030
|
-
// Sync token after AgentChattr starts (it generates its own)
|
|
1031
|
-
setTimeout(() => syncChattrToken(projectId), 2000);
|
|
1032
|
-
res.json({ ok: true, state: "running", pid: child.pid });
|
|
1033
|
-
} catch (err) {
|
|
1034
|
-
setProc({ process: null, state: "error", error: err.message });
|
|
1035
|
-
res.status(500).json({ ok: false, state: "error", error: err.message });
|
|
1036
|
-
}
|
|
1037
|
-
} else if (action === "stop") {
|
|
1038
|
-
const proc = getProc();
|
|
1039
|
-
if (proc.process) {
|
|
1040
|
-
try { proc.process.kill("SIGTERM"); } catch {}
|
|
1041
|
-
}
|
|
1042
|
-
// #386: also kill any orphaned process holding the port
|
|
1043
|
-
killProcessOnPort(chattrPort);
|
|
1044
|
-
setProc({ process: null, state: "stopped", error: null });
|
|
1045
|
-
res.json({ ok: true, state: "stopped" });
|
|
1046
|
-
} else if (action === "restart") {
|
|
1047
|
-
// #424 / quadwork#304: snapshot history before killing the
|
|
1048
|
-
// process. Best-effort and non-blocking-on-failure so a flaky
|
|
1049
|
-
// snapshot doesn't leave the operator unable to restart AC.
|
|
1050
|
-
await snapshotProjectHistory(projectId).catch(() => {});
|
|
1051
|
-
// #424 / quadwork#304 Phase 3: latch the opt-in BEFORE the
|
|
1052
|
-
// spawn so a restart that itself clears the flag can't starve
|
|
1053
|
-
// the auto-restore. We capture the snapshot filename we just
|
|
1054
|
-
// wrote + the project's auto_restore_after_restart flag and
|
|
1055
|
-
// replay it in the post-spawn tick below if both are set.
|
|
1056
|
-
const preRestartCfg = readConfig();
|
|
1057
|
-
const preRestartProject = preRestartCfg.projects?.find((p) => p.id === projectId);
|
|
1058
|
-
const shouldAutoRestore = !!(preRestartProject && preRestartProject.auto_restore_after_restart);
|
|
1059
|
-
const proc = getProc();
|
|
1060
|
-
if (proc.process) {
|
|
1061
|
-
console.log(`[agentchattr] ${projectId} restart: killing AC (PID: ${proc.process.pid})`);
|
|
1062
|
-
try { proc.process.kill("SIGTERM"); } catch {}
|
|
1063
|
-
}
|
|
1064
|
-
// #386: also kill any orphaned process holding the port (handles
|
|
1065
|
-
// detached processes that survived a QuadWork restart).
|
|
1066
|
-
killProcessOnPort(chattrPort);
|
|
1067
|
-
setProc({ process: null, state: "stopped", error: null });
|
|
1068
|
-
// #582: wait up to 5s for the port to be free, then SIGKILL
|
|
1069
|
-
// any remaining process as a fallback before spawning.
|
|
1070
|
-
let portFree = await waitForPortFree(chattrPort, 5000);
|
|
1071
|
-
if (!portFree) {
|
|
1072
|
-
console.warn(`[agentchattr] ${projectId} port ${chattrPort} still occupied after 5s — sending SIGKILL`);
|
|
1073
|
-
killProcessOnPort(chattrPort, "SIGKILL");
|
|
1074
|
-
portFree = await waitForPortFree(chattrPort, 3000);
|
|
1075
|
-
if (!portFree) {
|
|
1076
|
-
const portErr = `Port ${chattrPort} still occupied — cannot restart`;
|
|
1077
|
-
console.error(`[agentchattr] ${projectId} ${portErr}`);
|
|
1078
|
-
setProc({ process: null, state: "error", error: portErr });
|
|
1079
|
-
return res.status(500).json({ ok: false, state: "error", error: portErr });
|
|
1080
|
-
}
|
|
1081
|
-
}
|
|
1082
|
-
console.log(`[agentchattr] ${projectId} restart: port ${chattrPort} is free, spawning AC`);
|
|
1083
|
-
try {
|
|
1084
|
-
const child = await spawnChattr();
|
|
1085
|
-
if (!child) {
|
|
1086
|
-
const errProc = getProc();
|
|
1087
|
-
console.error(`[agentchattr] ${projectId} restart: spawnChattr failed — ${errProc.error || "unknown error"}`);
|
|
1088
|
-
return res.status(500).json({ ok: false, state: "error", error: errProc.error || "Failed to start AgentChattr" });
|
|
1089
|
-
}
|
|
1090
|
-
console.log(`[agentchattr] ${projectId} restart: AC spawned and ready (PID: ${child.pid})`);
|
|
1091
|
-
// Sync token after AgentChattr restarts
|
|
1092
|
-
setTimeout(() => syncChattrToken(projectId), 2000);
|
|
1093
|
-
// #424 / quadwork#304 Phase 3: optional auto-restore.
|
|
1094
|
-
// Fire the restore 3s after spawn so AC's ws is ready.
|
|
1095
|
-
// Best-effort: never blocks the restart response or
|
|
1096
|
-
// rolls back on error.
|
|
1097
|
-
if (shouldAutoRestore) {
|
|
1098
|
-
setTimeout(async () => {
|
|
1099
|
-
try {
|
|
1100
|
-
const snapDir = path.join(require("os").homedir(), ".quadwork", projectId, "history-snapshots");
|
|
1101
|
-
if (!fs.existsSync(snapDir)) return;
|
|
1102
|
-
const newest = fs.readdirSync(snapDir)
|
|
1103
|
-
.filter((f) => f.endsWith(".json"))
|
|
1104
|
-
.map((f) => ({ f, t: fs.statSync(path.join(snapDir, f)).mtimeMs }))
|
|
1105
|
-
.sort((a, b) => b.t - a.t)[0];
|
|
1106
|
-
if (!newest) return;
|
|
1107
|
-
const r = await fetch(`http://127.0.0.1:${PORT}/api/project-history/restore?project=${encodeURIComponent(projectId)}&name=${encodeURIComponent(newest.f)}`, {
|
|
1108
|
-
method: "POST",
|
|
1109
|
-
});
|
|
1110
|
-
if (r.ok) console.log(`[snapshot] ${projectId} auto-restored ${newest.f}`);
|
|
1111
|
-
else console.warn(`[snapshot] ${projectId} auto-restore returned ${r.status}`);
|
|
1112
|
-
} catch (err) {
|
|
1113
|
-
console.warn(`[snapshot] ${projectId} auto-restore failed: ${err.message || err}`);
|
|
1114
|
-
}
|
|
1115
|
-
}, 3000);
|
|
1116
|
-
}
|
|
1117
|
-
res.json({ ok: true, state: "running", pid: child.pid });
|
|
1118
|
-
// #447: auto-reset all agents after AC restart so they get
|
|
1119
|
-
// fresh MCP tokens. #581: mark reset as scheduled immediately
|
|
1120
|
-
// so the health monitor skips its own reset while ours is in-flight.
|
|
1121
|
-
// #579: also skip if a reset already succeeded within the last 30s.
|
|
1122
|
-
// Multiple restart sources (bridge-migrate, health monitor, dashboard)
|
|
1123
|
-
// can fire in rapid succession — only the first should trigger a reset.
|
|
1124
|
-
const existingReset = _acHealth.resetState.get(projectId);
|
|
1125
|
-
const resetRecentlyDone = existingReset &&
|
|
1126
|
-
(existingReset.status === "succeeded" || existingReset.status === "scheduled") &&
|
|
1127
|
-
Date.now() - existingReset.timestamp < 30_000;
|
|
1128
|
-
if (resetRecentlyDone) {
|
|
1129
|
-
console.log(`[agentchattr] ${projectId} skipping auto-reset — one already ${existingReset.status} ${Math.round((Date.now() - existingReset.timestamp) / 1000)}s ago`);
|
|
1130
|
-
} else {
|
|
1131
|
-
_acHealth.resetState.set(projectId, { status: "scheduled", timestamp: Date.now() });
|
|
1132
|
-
}
|
|
1133
|
-
if (!resetRecentlyDone) setTimeout(async () => {
|
|
1134
|
-
try {
|
|
1135
|
-
const resetResp = await fetch(`http://127.0.0.1:${PORT}/api/agents/${encodeURIComponent(projectId)}/reset`, {
|
|
1136
|
-
method: "POST",
|
|
1137
|
-
});
|
|
1138
|
-
if (resetResp.ok) {
|
|
1139
|
-
const resetData = await resetResp.json();
|
|
1140
|
-
_acHealth.resetState.set(projectId, { status: "succeeded", timestamp: Date.now() });
|
|
1141
|
-
console.log(`[agentchattr] ${projectId} auto-reset ${resetData.restarted} agent(s) after AC restart`);
|
|
1142
|
-
} else {
|
|
1143
|
-
_acHealth.resetState.set(projectId, { status: "failed", timestamp: Date.now() });
|
|
1144
|
-
console.warn(`[agentchattr] ${projectId} agent reset after AC restart returned ${resetResp.status}`);
|
|
1145
|
-
}
|
|
1146
|
-
} catch (err) {
|
|
1147
|
-
_acHealth.resetState.set(projectId, { status: "failed", timestamp: Date.now() });
|
|
1148
|
-
console.warn(`[agentchattr] ${projectId} agent reset after AC restart failed: ${err.message || err}`);
|
|
1149
|
-
}
|
|
1150
|
-
}, 2000);
|
|
1151
|
-
} catch (err) {
|
|
1152
|
-
setProc({ process: null, state: "error", error: err.message });
|
|
1153
|
-
res.status(500).json({ ok: false, state: "error", error: err.message });
|
|
1154
|
-
}
|
|
1155
|
-
} else if (action === "update") {
|
|
1156
|
-
// Update AgentChattr: stop → git pull → pip install → restart
|
|
1157
|
-
const { dir: acDir } = resolveProjectChattr(projectId);
|
|
1158
|
-
if (!acDir || !fs.existsSync(path.join(acDir, "run.py"))) {
|
|
1159
|
-
return res.status(400).json({ ok: false, error: "AgentChattr not installed at " + (acDir || "unknown") });
|
|
1160
|
-
}
|
|
1161
|
-
try {
|
|
1162
|
-
// Stop running process before pulling. Snapshot first so a
|
|
1163
|
-
// botched git pull can still be rolled back from disk.
|
|
1164
|
-
// #424 / quadwork#304: best-effort.
|
|
1165
|
-
await snapshotProjectHistory(projectId).catch(() => {});
|
|
1166
|
-
// Latch the auto-restore opt-in BEFORE stop, same as the
|
|
1167
|
-
// explicit restart branch above — a config mutation during
|
|
1168
|
-
// the git pull shouldn't starve the replay.
|
|
1169
|
-
const updateCfgPre = readConfig();
|
|
1170
|
-
const updateProjectPre = updateCfgPre.projects?.find((p) => p.id === projectId);
|
|
1171
|
-
const updateShouldAutoRestore = !!(updateProjectPre && updateProjectPre.auto_restore_after_restart);
|
|
1172
|
-
const proc = getProc();
|
|
1173
|
-
const wasRunning = proc.process && proc.state === "running";
|
|
1174
|
-
if (wasRunning) {
|
|
1175
|
-
try { proc.process.kill("SIGTERM"); } catch {}
|
|
1176
|
-
}
|
|
1177
|
-
// #386: kill orphaned processes on the port too
|
|
1178
|
-
killProcessOnPort(chattrPort);
|
|
1179
|
-
if (wasRunning) {
|
|
1180
|
-
setProc({ process: null, state: "stopped", error: null });
|
|
1181
|
-
// Wait for the port to be released before pulling/restarting
|
|
1182
|
-
await waitForPortFree(chattrPort, 3000);
|
|
1183
|
-
}
|
|
1184
|
-
|
|
1185
|
-
const pullResult = execFileSync("git", ["pull"], { cwd: acDir, encoding: "utf-8", timeout: 30000, stdio: "pipe" }).trim();
|
|
1186
|
-
// #388: re-apply sender-overflow CSS patch after git pull
|
|
1187
|
-
patchAgentchattrCss(acDir);
|
|
1188
|
-
// #629: re-apply crash timeout patch after git pull (pull may revert app.py)
|
|
1189
|
-
patchCrashTimeout(acDir);
|
|
1190
|
-
const venvPython = path.join(acDir, ".venv", "bin", "python");
|
|
1191
|
-
let pipResult = "";
|
|
1192
|
-
const reqFile = path.join(acDir, "requirements.txt");
|
|
1193
|
-
if (fs.existsSync(venvPython) && fs.existsSync(reqFile)) {
|
|
1194
|
-
pipResult = execFileSync(venvPython, ["-m", "pip", "install", "-r", "requirements.txt"], { cwd: acDir, encoding: "utf-8", timeout: 120000, stdio: "pipe" }).trim();
|
|
1195
|
-
}
|
|
1196
|
-
|
|
1197
|
-
// Restart if it was running before the update
|
|
1198
|
-
let restarted = false;
|
|
1199
|
-
if (wasRunning) {
|
|
1200
|
-
const child = await spawnChattr();
|
|
1201
|
-
restarted = !!child;
|
|
1202
|
-
if (child) {
|
|
1203
|
-
setTimeout(() => syncChattrToken(projectId).catch(() => {}), 2000);
|
|
1204
|
-
// #424 / quadwork#304 Phase 3: auto-restore after an
|
|
1205
|
-
// update-triggered restart too (t2a re-review). Same
|
|
1206
|
-
//3s wait + newest-snapshot-by-mtime path as the explicit
|
|
1207
|
-
// restart branch, using the pre-stop latched opt-in.
|
|
1208
|
-
if (updateShouldAutoRestore) {
|
|
1209
|
-
setTimeout(async () => {
|
|
1210
|
-
try {
|
|
1211
|
-
const snapDir = path.join(require("os").homedir(), ".quadwork", projectId, "history-snapshots");
|
|
1212
|
-
if (!fs.existsSync(snapDir)) return;
|
|
1213
|
-
const newest = fs.readdirSync(snapDir)
|
|
1214
|
-
.filter((f) => f.endsWith(".json"))
|
|
1215
|
-
.map((f) => ({ f, t: fs.statSync(path.join(snapDir, f)).mtimeMs }))
|
|
1216
|
-
.sort((a, b) => b.t - a.t)[0];
|
|
1217
|
-
if (!newest) return;
|
|
1218
|
-
const r = await fetch(`http://127.0.0.1:${PORT}/api/project-history/restore?project=${encodeURIComponent(projectId)}&name=${encodeURIComponent(newest.f)}`, {
|
|
1219
|
-
method: "POST",
|
|
1220
|
-
});
|
|
1221
|
-
if (r.ok) console.log(`[snapshot] ${projectId} auto-restored ${newest.f} after update`);
|
|
1222
|
-
else console.warn(`[snapshot] ${projectId} post-update auto-restore returned ${r.status}`);
|
|
1223
|
-
} catch (err) {
|
|
1224
|
-
console.warn(`[snapshot] ${projectId} post-update auto-restore failed: ${err.message || err}`);
|
|
1225
|
-
}
|
|
1226
|
-
}, 3000);
|
|
1227
|
-
}
|
|
1228
|
-
}
|
|
1229
|
-
}
|
|
560
|
+
// Per-project AgentChattr lifecycle (removed in #723 — AC stack deleted)
|
|
1230
561
|
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
}
|
|
1235
|
-
} else {
|
|
1236
|
-
res.status(400).json({ error: "Unknown action" });
|
|
1237
|
-
}
|
|
562
|
+
// Stub endpoints — return 410 Gone so dashboard code degrades gracefully
|
|
563
|
+
async function handleAgentChattr(_req, res) {
|
|
564
|
+
return res.status(410).json({ ok: false, error: "AgentChattr removed in Phase 3" });
|
|
1238
565
|
}
|
|
1239
|
-
app.post("/api/agentchattr/:projectOrAction/:action", handleAgentChattr);
|
|
1240
|
-
app.post("/api/agentchattr/:projectOrAction", handleAgentChattr);
|
|
1241
|
-
|
|
1242
|
-
// --- Reset agents: deregister all registered slots ---
|
|
1243
|
-
// AgentChattr doesn't expose staleness metadata, so this clears all slots.
|
|
1244
|
-
// Agents' wrapper heartbeat will auto-re-register with clean names.
|
|
1245
|
-
|
|
1246
|
-
// #416: AC health status endpoint — returns the health monitor state
|
|
1247
|
-
// for a project so the dashboard can surface auto-restart events.
|
|
1248
|
-
app.get("/api/agentchattr/:project/health", (req, res) => {
|
|
1249
|
-
const projectId = req.params.project;
|
|
1250
|
-
const proc = chattrProcesses.get(projectId);
|
|
1251
|
-
const health = _acHealth.state.get(projectId) || { lastRestart: 0, consecutiveFailures: 0 };
|
|
1252
|
-
res.json({
|
|
1253
|
-
state: proc?.state || "unknown",
|
|
1254
|
-
error: proc?.error || null,
|
|
1255
|
-
autoRestart: {
|
|
1256
|
-
lastRestart: health.lastRestart || null,
|
|
1257
|
-
consecutiveFailures: health.consecutiveFailures,
|
|
1258
|
-
gaveUp: health.consecutiveFailures >= 3,
|
|
1259
|
-
},
|
|
1260
|
-
});
|
|
1261
|
-
});
|
|
1262
|
-
|
|
1263
566
|
app.post("/api/agents/:project/reset", async (req, res) => {
|
|
1264
567
|
const projectId = req.params.project;
|
|
1265
568
|
|
|
@@ -1291,6 +594,8 @@ app.post("/api/agents/:project/reset", async (req, res) => {
|
|
|
1291
594
|
|
|
1292
595
|
// Stop all agents first (handles deregistration best-effort)
|
|
1293
596
|
for (const agentId of allAgentIds) {
|
|
597
|
+
const s = agentSessions.get(`${projectId}/${agentId}`);
|
|
598
|
+
if (s) s._suppressLifecycleMsg = true;
|
|
1294
599
|
await stopAgentSession(`${projectId}/${agentId}`);
|
|
1295
600
|
}
|
|
1296
601
|
|
|
@@ -1298,8 +603,9 @@ app.post("/api/agents/:project/reset", async (req, res) => {
|
|
|
1298
603
|
let restarted = 0;
|
|
1299
604
|
const errors = [];
|
|
1300
605
|
for (const agentId of allAgentIds) {
|
|
1301
|
-
const result = await spawnAgentPty(projectId, agentId);
|
|
606
|
+
const result = await spawnAgentPty(projectId, agentId, { suppressLifecycleMsg: true });
|
|
1302
607
|
if (result.ok) {
|
|
608
|
+
emitSystemMessage(projectId, `${agentId} restarted`);
|
|
1303
609
|
restarted++;
|
|
1304
610
|
} else {
|
|
1305
611
|
errors.push(`${agentId}: ${result.error}`);
|
|
@@ -1317,7 +623,7 @@ app.post("/api/agents/:project/reset", async (req, res) => {
|
|
|
1317
623
|
}
|
|
1318
624
|
});
|
|
1319
625
|
|
|
1320
|
-
// --- Full Reset: restart all
|
|
626
|
+
// --- Full Reset: restart all agents across all projects (#657) ---
|
|
1321
627
|
|
|
1322
628
|
app.post("/api/full-reset", async (_req, res) => {
|
|
1323
629
|
const start = Date.now();
|
|
@@ -1326,42 +632,21 @@ app.post("/api/full-reset", async (_req, res) => {
|
|
|
1326
632
|
const cfg = readConfig();
|
|
1327
633
|
const projects = (cfg.projects || []).filter((p) => !p.archived);
|
|
1328
634
|
|
|
1329
|
-
// 1. Stop all agent sessions
|
|
1330
635
|
console.log("[full-reset] stopping all agent sessions...");
|
|
1331
636
|
const sessionKeys = [...agentSessions.keys()];
|
|
1332
637
|
for (const key of sessionKeys) {
|
|
1333
638
|
await stopAgentSession(key);
|
|
1334
639
|
}
|
|
1335
640
|
|
|
1336
|
-
// 2. Stop Butler if running
|
|
1337
641
|
console.log("[full-reset] stopping Butler...");
|
|
1338
642
|
stopButlerPty();
|
|
1339
643
|
|
|
1340
|
-
// 3. Re-run startup migrations
|
|
1341
644
|
console.log("[full-reset] running startup migrations...");
|
|
1342
645
|
runStartupMigrations(cfg);
|
|
1343
646
|
|
|
1344
|
-
// 4. Restart each project's AC + agents
|
|
1345
647
|
let totalAgents = 0;
|
|
1346
648
|
const errors = [];
|
|
1347
649
|
for (const project of projects) {
|
|
1348
|
-
console.log(`[full-reset] restarting AC for ${project.id}...`);
|
|
1349
|
-
// Pre-mark reset as scheduled so AC restart's auto-reset timer is suppressed
|
|
1350
|
-
_acHealth.resetState.set(project.id, { status: "scheduled", timestamp: Date.now() });
|
|
1351
|
-
try {
|
|
1352
|
-
const acResp = await fetch(`http://127.0.0.1:${PORT}/api/agentchattr/${encodeURIComponent(project.id)}/restart`, {
|
|
1353
|
-
method: "POST",
|
|
1354
|
-
});
|
|
1355
|
-
if (!acResp.ok) {
|
|
1356
|
-
const errData = await acResp.json().catch(() => ({}));
|
|
1357
|
-
errors.push(`${project.id}: AC restart failed — ${errData.error || acResp.status}`);
|
|
1358
|
-
continue;
|
|
1359
|
-
}
|
|
1360
|
-
} catch (err) {
|
|
1361
|
-
errors.push(`${project.id}: AC — ${err.message}`);
|
|
1362
|
-
continue;
|
|
1363
|
-
}
|
|
1364
|
-
// Explicitly reset agents and await result
|
|
1365
650
|
try {
|
|
1366
651
|
const resetResp = await fetch(`http://127.0.0.1:${PORT}/api/agents/${encodeURIComponent(project.id)}/reset`, {
|
|
1367
652
|
method: "POST",
|
|
@@ -1377,7 +662,6 @@ app.post("/api/full-reset", async (_req, res) => {
|
|
|
1377
662
|
}
|
|
1378
663
|
}
|
|
1379
664
|
|
|
1380
|
-
// 5. Restart Butler if enabled
|
|
1381
665
|
if (cfg.butler?.enabled) {
|
|
1382
666
|
console.log("[full-reset] restarting Butler...");
|
|
1383
667
|
const result = spawnButlerPty();
|
|
@@ -1435,16 +719,45 @@ app.post("/api/agents/:project/:agent/restart", async (req, res) => {
|
|
|
1435
719
|
|
|
1436
720
|
// #241: must await deregister before respawn so the slot frees and
|
|
1437
721
|
// the fresh register lands at slot 1 instead of head-2.
|
|
722
|
+
const existing = agentSessions.get(key);
|
|
723
|
+
if (existing) existing._suppressLifecycleMsg = true;
|
|
1438
724
|
await stopAgentSession(key);
|
|
1439
725
|
|
|
1440
|
-
const result = await spawnAgentPty(project, agent);
|
|
726
|
+
const result = await spawnAgentPty(project, agent, { suppressLifecycleMsg: true });
|
|
1441
727
|
if (result.ok) {
|
|
728
|
+
emitSystemMessage(project, `${agent} restarted`);
|
|
1442
729
|
res.json({ ok: true, state: "running", pid: result.pid });
|
|
1443
730
|
} else {
|
|
1444
731
|
res.status(500).json({ ok: false, state: "error", error: result.error });
|
|
1445
732
|
}
|
|
1446
733
|
});
|
|
1447
734
|
|
|
735
|
+
// --- #706: Manual interrupt — send Ctrl+C to agent PTY ---
|
|
736
|
+
|
|
737
|
+
// POST /api/agents/:project/:agent/interrupt
// Sends a single Ctrl+C ("\x03") to one agent's PTY on operator request.
// Responds { ok: true } once the byte has been handed to safeWrite, or
// { ok: false, error: "Agent not running" } when there is nothing to interrupt.
app.post("/api/agents/:project/:agent/interrupt", (req, res) => {
  const { project, agent } = req.params;
  const key = `${project}/${agent}`;
  const session = agentSessions.get(key);
  // Use the same liveness test as interrupt-all and the watchdog: a session
  // that is no longer in the "running" state must not be reported as
  // interrupted just because it still holds a terminal reference.
  if (!session || session.state !== "running" || !session.term) {
    return res.json({ ok: false, error: "Agent not running" });
  }
  safeWrite(session.term, "\x03");
  console.log(`[interrupt] ${key}: operator sent Ctrl+C`);
  res.json({ ok: true });
});
|
|
747
|
+
|
|
748
|
+
// POST /api/agents/:project/interrupt-all
// Sends Ctrl+C ("\x03") to every running agent PTY whose session key starts
// with "<project>/" and reports how many sessions were written to.
app.post("/api/agents/:project/interrupt-all", (req, res) => {
  const projectId = req.params.project;
  const keyPrefix = `${projectId}/`;
  let interrupted = 0;

  for (const [sessionKey, agentSession] of agentSessions) {
    // Only sessions of this project that are running and have a terminal.
    if (sessionKey.startsWith(keyPrefix) && agentSession.state === "running" && agentSession.term) {
      safeWrite(agentSession.term, "\x03");
      interrupted += 1;
    }
  }

  console.log(`[interrupt] ${projectId}: operator sent Ctrl+C to ${interrupted} agent(s)`);
  res.json({ ok: true, interrupted });
});
|
|
760
|
+
|
|
1448
761
|
// --- Sessions tracking (for /api/projects dashboard) ---
|
|
1449
762
|
|
|
1450
763
|
// Expose agentSessions to migrated routes
|
|
@@ -2380,327 +1693,35 @@ setInterval(autoStopPollingTick, AUTO_STOP_POLL_INTERVAL_MS);
|
|
|
2380
1693
|
// delay is tens of seconds. Skipping projects without the opt-in
|
|
2381
1694
|
// keeps the poller cheap for single-project setups.
|
|
2382
1695
|
|
|
2383
|
-
const _loopGuardPausedState = new Map(); // projectId -> { paused: bool, scheduled: Timeout? }
|
|
2384
|
-
const LOOP_GUARD_POLL_INTERVAL_MS = 10000;
|
|
2385
|
-
|
|
2386
|
-
async function checkLoopGuardPause(project) {
|
|
2387
|
-
if (!project || !project.auto_continue_loop_guard) return;
|
|
2388
|
-
const { url: base, token: sessionToken } = resolveProjectChattr(project.id);
|
|
2389
|
-
if (!base) return;
|
|
2390
|
-
let paused = false;
|
|
2391
|
-
try {
|
|
2392
|
-
const r = await fetch(`${base}/api/status`, {
|
|
2393
|
-
headers: sessionToken ? { "x-session-token": sessionToken } : {},
|
|
2394
|
-
signal: AbortSignal.timeout(5000),
|
|
2395
|
-
});
|
|
2396
|
-
if (!r.ok) return;
|
|
2397
|
-
const data = await r.json();
|
|
2398
|
-
paused = !!(data && data.paused);
|
|
2399
|
-
} catch {
|
|
2400
|
-
return;
|
|
2401
|
-
}
|
|
2402
|
-
const state = _loopGuardPausedState.get(project.id) || { paused: false, scheduled: null };
|
|
2403
|
-
// Transition false → true: schedule an auto-continue after the delay.
|
|
2404
|
-
if (paused && !state.paused && !state.scheduled) {
|
|
2405
|
-
const delaySec = Number.isFinite(project.auto_continue_delay_sec) && project.auto_continue_delay_sec >= 5
|
|
2406
|
-
? project.auto_continue_delay_sec
|
|
2407
|
-
: 30;
|
|
2408
|
-
console.log(`[loop-guard] ${project.id} paused — auto-continue in ${delaySec}s`);
|
|
2409
|
-
state.scheduled = setTimeout(async () => {
|
|
2410
|
-
try {
|
|
2411
|
-
// Re-check the opt-in at fire time so a checkbox disable
|
|
2412
|
-
// mid-wait actually stops the auto-continue.
|
|
2413
|
-
const freshCfg = readConfig();
|
|
2414
|
-
const fresh = freshCfg.projects?.find((p) => p.id === project.id);
|
|
2415
|
-
if (!fresh || !fresh.auto_continue_loop_guard) {
|
|
2416
|
-
console.log(`[loop-guard] ${project.id} auto-continue cancelled (opt-in disabled during wait)`);
|
|
2417
|
-
} else {
|
|
2418
|
-
// Re-check the router's pause state at fire time too. The
|
|
2419
|
-
// 10s status poller may not have seen a manual operator
|
|
2420
|
-
// /continue yet when the delay window (5–9s) is shorter
|
|
2421
|
-
// than the poll interval — without this, a manual resume
|
|
2422
|
-
// inside a 5s wait would be followed by a stale auto
|
|
2423
|
-
// /continue that clobbers hop_count on an already-running
|
|
2424
|
-
// chain (router.continue_routing resets the counter
|
|
2425
|
-
// unconditionally). The re-check closes the race.
|
|
2426
|
-
let stillPaused = false;
|
|
2427
|
-
try {
|
|
2428
|
-
const { url: freshBase, token: freshToken } = resolveProjectChattr(project.id);
|
|
2429
|
-
if (freshBase) {
|
|
2430
|
-
const sr = await fetch(`${freshBase}/api/status`, {
|
|
2431
|
-
headers: freshToken ? { "x-session-token": freshToken } : {},
|
|
2432
|
-
signal: AbortSignal.timeout(5000),
|
|
2433
|
-
});
|
|
2434
|
-
if (sr.ok) {
|
|
2435
|
-
const sd = await sr.json();
|
|
2436
|
-
stillPaused = !!(sd && sd.paused);
|
|
2437
|
-
}
|
|
2438
|
-
}
|
|
2439
|
-
} catch {
|
|
2440
|
-
// Status re-check failed — fall back to "don't fire".
|
|
2441
|
-
// Stuck pause will still be caught on the next 10s tick.
|
|
2442
|
-
}
|
|
2443
|
-
if (!stillPaused) {
|
|
2444
|
-
console.log(`[loop-guard] ${project.id} auto-continue cancelled (router already resumed)`);
|
|
2445
|
-
} else {
|
|
2446
|
-
const res = await fetch(`http://127.0.0.1:${PORT}/api/chat?project=${encodeURIComponent(project.id)}`, {
|
|
2447
|
-
method: "POST",
|
|
2448
|
-
headers: { "Content-Type": "application/json" },
|
|
2449
|
-
body: JSON.stringify({ text: "/continue", channel: "general" }),
|
|
2450
|
-
});
|
|
2451
|
-
if (res.ok) console.log(`[loop-guard] ${project.id} auto-continued`);
|
|
2452
|
-
else console.warn(`[loop-guard] ${project.id} auto-continue POST returned ${res.status}`);
|
|
2453
|
-
}
|
|
2454
|
-
}
|
|
2455
|
-
} catch (err) {
|
|
2456
|
-
console.warn(`[loop-guard] ${project.id} auto-continue failed: ${err.message || err}`);
|
|
2457
|
-
}
|
|
2458
|
-
const s2 = _loopGuardPausedState.get(project.id);
|
|
2459
|
-
if (s2) s2.scheduled = null;
|
|
2460
|
-
}, delaySec * 1000);
|
|
2461
|
-
}
|
|
2462
|
-
// Transition true → false: clear any pending timer.
|
|
2463
|
-
if (!paused && state.paused && state.scheduled) {
|
|
2464
|
-
clearTimeout(state.scheduled);
|
|
2465
|
-
state.scheduled = null;
|
|
2466
|
-
}
|
|
2467
|
-
state.paused = paused;
|
|
2468
|
-
_loopGuardPausedState.set(project.id, state);
|
|
2469
|
-
}
|
|
2470
|
-
|
|
2471
|
-
function runLoopGuardPollingTick() {
|
|
2472
|
-
try {
|
|
2473
|
-
const cfg = readConfig();
|
|
2474
|
-
for (const p of (cfg.projects || [])) {
|
|
2475
|
-
if (p && p.auto_continue_loop_guard) checkLoopGuardPause(p);
|
|
2476
|
-
}
|
|
2477
|
-
} catch {
|
|
2478
|
-
// config unreadable — next tick will retry
|
|
2479
|
-
}
|
|
2480
|
-
}
|
|
2481
|
-
|
|
2482
|
-
setInterval(runLoopGuardPollingTick, LOOP_GUARD_POLL_INTERVAL_MS);
|
|
2483
|
-
|
|
2484
1696
|
// --- Start ---
|
|
2485
1697
|
|
|
2486
|
-
//
|
|
2487
|
-
|
|
2488
|
-
|
|
2489
|
-
// process. If the port is dead, auto-restarts (reusing the existing restart
|
|
2490
|
-
// logic). Rate-limited to one restart per 60s per project; gives up after
|
|
2491
|
-
// 3 consecutive failures and surfaces a persistent error.
|
|
2492
|
-
// ---------------------------------------------------------------------------
|
|
2493
|
-
// #572: restart agents that are running without AC registration after AC
|
|
2494
|
-
// recovers from a crash. Scans agentSessions for the given project,
|
|
2495
|
-
// finds agents missing acRegistrationName, and stop+respawns them so
|
|
2496
|
-
// they get MCP CLI flags at launch time.
|
|
2497
|
-
async function restartUnregisteredAgents(projectId) {
|
|
2498
|
-
const toRestart = [];
|
|
2499
|
-
for (const [key, session] of agentSessions) {
|
|
2500
|
-
if (session.projectId !== projectId) continue;
|
|
2501
|
-
if (session.acRegistrationName) continue; // already registered
|
|
2502
|
-
if (session.state !== "running") continue;
|
|
2503
|
-
if (!session.acServerPort || !session.acInjectMode) continue;
|
|
2504
|
-
toRestart.push({ key, agentId: session.agentId });
|
|
2505
|
-
}
|
|
2506
|
-
|
|
2507
|
-
if (toRestart.length === 0) return;
|
|
2508
|
-
const samplePort = agentSessions.get(toRestart[0].key)?.acServerPort || "?";
|
|
2509
|
-
console.log(`[health] AC recovered on port ${samplePort} — restarting ${toRestart.length} agent(s) for chat integration`);
|
|
2510
|
-
|
|
2511
|
-
for (const { key, agentId } of toRestart) {
|
|
2512
|
-
try {
|
|
2513
|
-
console.log(`[health] Restarting agent ${agentId} for project ${projectId} to gain chat integration`);
|
|
2514
|
-
await stopAgentSession(key);
|
|
2515
|
-
await spawnAgentPty(projectId, agentId);
|
|
2516
|
-
} catch (err) {
|
|
2517
|
-
console.error(`[health] Failed to restart agent ${agentId}: ${err.message}`);
|
|
2518
|
-
}
|
|
2519
|
-
}
|
|
2520
|
-
}
|
|
1698
|
+
// #705: auto-interrupt agents stuck with no PTY output for 10 minutes.
|
|
1699
|
+
const WATCHDOG_TIMEOUT_MS = 10 * 60 * 1000;
|
|
1700
|
+
let _watchdogHandle = null;
|
|
2521
1701
|
|
|
2522
|
-
|
|
2523
|
-
|
|
2524
|
-
|
|
2525
|
-
|
|
2526
|
-
|
|
2527
|
-
|
|
2528
|
-
|
|
2529
|
-
|
|
2530
|
-
|
|
2531
|
-
|
|
2532
|
-
// Tracked via `runningSince` in chattrProcesses entries.
|
|
2533
|
-
};
|
|
2534
|
-
|
|
2535
|
-
function isPortAlive(port) {
|
|
2536
|
-
return new Promise((resolve) => {
|
|
2537
|
-
const sock = net.createConnection({ port, host: "127.0.0.1" }, () => {
|
|
2538
|
-
sock.destroy();
|
|
2539
|
-
resolve(true);
|
|
2540
|
-
});
|
|
2541
|
-
sock.on("error", () => resolve(false));
|
|
2542
|
-
sock.setTimeout(2000, () => { sock.destroy(); resolve(false); });
|
|
2543
|
-
});
|
|
2544
|
-
}
|
|
2545
|
-
|
|
2546
|
-
async function acHealthCheck() {
|
|
2547
|
-
const cfg = readConfig();
|
|
2548
|
-
for (const project of (cfg.projects || [])) {
|
|
2549
|
-
const proc = chattrProcesses.get(project.id);
|
|
2550
|
-
// Only monitor projects that were explicitly started (state === "running"
|
|
2551
|
-
// or had a process). Skip intentionally stopped projects.
|
|
2552
|
-
if (!proc || proc.state === "stopped") continue;
|
|
2553
|
-
// #579: per-project grace period — skip projects whose AC entered
|
|
2554
|
-
// "running" within the last 60s. This lets cmdStart spawns and
|
|
2555
|
-
// startup migrations (bridge-migrate, ghost-fix) settle before the
|
|
2556
|
-
// monitor acts, regardless of when the project was created.
|
|
2557
|
-
if (proc.runningSince && Date.now() - proc.runningSince < 60_000) continue;
|
|
2558
|
-
|
|
2559
|
-
const { url } = resolveProjectChattr(project.id);
|
|
2560
|
-
const portMatch = url.match(/:(\d+)/);
|
|
2561
|
-
const port = portMatch ? parseInt(portMatch[1], 10) : 8300;
|
|
2562
|
-
|
|
2563
|
-
const alive = await isPortAlive(port);
|
|
2564
|
-
const health = _acHealth.state.get(project.id) || { lastRestart: 0, consecutiveFailures: 0 };
|
|
2565
|
-
|
|
2566
|
-
if (alive) {
|
|
2567
|
-
// Healthy — reset failure counter
|
|
2568
|
-
if (health.consecutiveFailures > 0) {
|
|
2569
|
-
console.log(`[health] AC for ${project.id} recovered (port ${port} alive)`);
|
|
2570
|
-
// #572: restart agents that are running without chat integration.
|
|
2571
|
-
// These are agents where the #565 deferred restart timed out, or
|
|
2572
|
-
// agents spawned while AC was down. MCP flags are set at process
|
|
2573
|
-
// launch, so a full stop+respawn is required.
|
|
2574
|
-
// #581: dedupe — skip if a reset is in-flight or succeeded within 60s.
|
|
2575
|
-
// If "scheduled" (in-flight), keep consecutiveFailures=1 so the next
|
|
2576
|
-
// healthy tick re-enters this branch and retries if state became "failed".
|
|
2577
|
-
const rs = _acHealth.resetState.get(project.id);
|
|
2578
|
-
const resetSucceeded = rs && rs.status === "succeeded" && Date.now() - rs.timestamp < 60000;
|
|
2579
|
-
const resetInFlight = rs && rs.status === "scheduled";
|
|
2580
|
-
if (resetSucceeded) {
|
|
2581
|
-
// Already handled — clear failures normally
|
|
2582
|
-
} else if (resetInFlight) {
|
|
2583
|
-
// In-flight — preserve failures so we retry next tick if it fails
|
|
2584
|
-
health.consecutiveFailures = 1;
|
|
2585
|
-
_acHealth.state.set(project.id, health);
|
|
2586
|
-
continue;
|
|
2587
|
-
} else {
|
|
2588
|
-
// No recent reset or previous attempt failed — fire one
|
|
2589
|
-
_acHealth.resetState.set(project.id, { status: "scheduled", timestamp: Date.now() });
|
|
2590
|
-
restartUnregisteredAgents(project.id).then(() => {
|
|
2591
|
-
_acHealth.resetState.set(project.id, { status: "succeeded", timestamp: Date.now() });
|
|
2592
|
-
}).catch((err) => {
|
|
2593
|
-
_acHealth.resetState.set(project.id, { status: "failed", timestamp: Date.now() });
|
|
2594
|
-
console.error(`[health] Failed to restart unregistered agents for ${project.id}:`, err.message);
|
|
2595
|
-
});
|
|
2596
|
-
}
|
|
2597
|
-
}
|
|
2598
|
-
health.consecutiveFailures = 0;
|
|
2599
|
-
_acHealth.state.set(project.id, health);
|
|
2600
|
-
continue;
|
|
2601
|
-
}
|
|
2602
|
-
|
|
2603
|
-
// Port is dead — check rate limits
|
|
2604
|
-
if (health.consecutiveFailures >= 3) {
|
|
2605
|
-
// Already gave up — don't spam restarts. The error state persists
|
|
2606
|
-
// in chattrProcesses for the dashboard to surface.
|
|
2607
|
-
continue;
|
|
2608
|
-
}
|
|
2609
|
-
|
|
2610
|
-
const now = Date.now();
|
|
2611
|
-
if (now - health.lastRestart < 60_000) {
|
|
2612
|
-
// Too soon since last restart attempt
|
|
2613
|
-
continue;
|
|
2614
|
-
}
|
|
2615
|
-
|
|
2616
|
-
health.consecutiveFailures++;
|
|
2617
|
-
health.lastRestart = now;
|
|
2618
|
-
_acHealth.state.set(project.id, health);
|
|
2619
|
-
|
|
2620
|
-
console.warn(`[health] AC for ${project.id} on port ${port} is down (failure ${health.consecutiveFailures}/3) — auto-restarting`);
|
|
2621
|
-
|
|
2622
|
-
// Call the existing restart endpoint internally so we reuse the
|
|
2623
|
-
// hardened path (killProcessOnPort, waitForPortFree, snapshot,
|
|
2624
|
-
// auto-restore) instead of reimplementing spawn logic inline.
|
|
2625
|
-
try {
|
|
2626
|
-
const resp = await fetch(`http://127.0.0.1:${PORT}/api/agentchattr/${encodeURIComponent(project.id)}/restart`, {
|
|
2627
|
-
method: "POST",
|
|
2628
|
-
timeout: 15000,
|
|
2629
|
-
});
|
|
2630
|
-
if (resp.ok) {
|
|
2631
|
-
const data = await resp.json();
|
|
2632
|
-
console.log(`[health] AC for ${project.id} auto-restarted (PID: ${data.pid})`);
|
|
2633
|
-
// #447: agent reset is now chained inside the restart endpoint
|
|
2634
|
-
// itself (fires on a 2s timer), so no separate call needed here.
|
|
2635
|
-
} else {
|
|
2636
|
-
const body = await resp.text().catch(() => "");
|
|
2637
|
-
console.error(`[health] AC auto-restart failed for ${project.id}: ${resp.status} ${body.slice(0, 120)}`);
|
|
2638
|
-
chattrProcesses.set(project.id, { process: null, state: "error", error: `Auto-restart failed: ${resp.status}` });
|
|
2639
|
-
}
|
|
2640
|
-
} catch (err) {
|
|
2641
|
-
console.error(`[health] AC auto-restart failed for ${project.id}:`, err.message);
|
|
2642
|
-
chattrProcesses.set(project.id, { process: null, state: "error", error: `Auto-restart failed: ${err.message}` });
|
|
1702
|
+
// One watchdog pass over every agent session: a running session with a
// terminal whose last recorded output is older than WATCHDOG_TIMEOUT_MS is
// sent Ctrl+C ("\x03"). Its lastOutputAt is then reset to the current time.
function watchdogCheck() {
  for (const [sessionKey, agent] of agentSessions) {
    const hasLiveTerminal = agent.state === "running" && agent.term;
    // Skip sessions that are not live or have never recorded any output.
    if (!hasLiveTerminal || !agent.lastOutputAt) continue;
    // #732: skip file-chat projects — idle is normal, PTY dispatch wakes them
    if (routes.getProjectChatMode(agent.projectId) === "file") continue;

    const silentForMs = Date.now() - agent.lastOutputAt;
    if (silentForMs > WATCHDOG_TIMEOUT_MS) {
      console.log(`[watchdog] ${sessionKey}: no output for 10m — sending Ctrl+C`);
      safeWrite(agent.term, "\x03");
      agent.lastOutputAt = Date.now();
    }
  }
}
|
|
2646
1715
|
|
|
2647
|
-
function
|
|
2648
|
-
if (
|
|
2649
|
-
|
|
2650
|
-
console.log("[
|
|
1716
|
+
// Starts the stuck-agent watchdog timer (watchdogCheck every 60 seconds).
// Does nothing when a timer handle is already stored, so repeated calls
// never create a second interval.
function startWatchdog() {
  if (!_watchdogHandle) {
    _watchdogHandle = setInterval(watchdogCheck, 60_000);
    console.log("[watchdog] stuck-agent watchdog started (60s interval, 10m threshold)");
  }
}
|
|
2652
1721
|
|
|
2653
1722
|
// #657: extracted startup migrations so full-reset can re-run them
|
|
2654
1723
|
function runStartupMigrations(cfg) {
|
|
2655
1724
|
const projects = (cfg.projects || []).filter((p) => !p.archived);
|
|
2656
|
-
const acRestartNeeded = [];
|
|
2657
|
-
|
|
2658
|
-
// bridge-migrate
|
|
2659
|
-
for (const p of projects) {
|
|
2660
|
-
const acPath = projectAgentchattrConfigPath(p.id);
|
|
2661
|
-
if (!fs.existsSync(acPath)) continue;
|
|
2662
|
-
try {
|
|
2663
|
-
const before = fs.readFileSync(acPath, "utf-8");
|
|
2664
|
-
const hadOldDc = /^\[agents\.discord-bridge\]\s*$/m.test(before);
|
|
2665
|
-
const hadOldTg = /^\[agents\.telegram-bridge\]\s*$/m.test(before);
|
|
2666
|
-
const dc = patchAgentchattrConfigForDiscordBridge(before);
|
|
2667
|
-
const tg = patchAgentchattrConfigForTelegramBridge(dc.text);
|
|
2668
|
-
if (dc.changed || tg.changed) {
|
|
2669
|
-
fs.writeFileSync(acPath, tg.text);
|
|
2670
|
-
console.log(`[bridge-migrate] ${p.id}: migrated AC config slugs`);
|
|
2671
|
-
if (hadOldDc || hadOldTg) {
|
|
2672
|
-
if (!acRestartNeeded.includes(p.id)) acRestartNeeded.push(p.id);
|
|
2673
|
-
}
|
|
2674
|
-
}
|
|
2675
|
-
} catch {}
|
|
2676
|
-
}
|
|
2677
|
-
|
|
2678
|
-
// bridge-refresh
|
|
2679
|
-
const DISCORD_BRIDGE_SRC = path.join(__dirname, "..", "bridges", "discord", "discord_bridge.py");
|
|
2680
|
-
const DISCORD_BRIDGE_DEST = path.join(os.homedir(), ".quadwork", "agentchattr-discord", "discord_bridge.py");
|
|
2681
|
-
if (fs.existsSync(DISCORD_BRIDGE_SRC) && fs.existsSync(path.dirname(DISCORD_BRIDGE_DEST))) {
|
|
2682
|
-
try {
|
|
2683
|
-
fs.copyFileSync(DISCORD_BRIDGE_SRC, DISCORD_BRIDGE_DEST);
|
|
2684
|
-
console.log("[bridge-refresh] refreshed Discord bridge script from package");
|
|
2685
|
-
} catch (err) {
|
|
2686
|
-
console.warn(`[bridge-refresh] failed to refresh Discord bridge script: ${err.message || err}`);
|
|
2687
|
-
}
|
|
2688
|
-
}
|
|
2689
|
-
|
|
2690
|
-
// bridge slug patches
|
|
2691
|
-
const BRIDGE_SLUG_PATCHES = [
|
|
2692
|
-
{ file: path.join(os.homedir(), ".quadwork", "agentchattr-telegram", "telegram_bridge.py"), old: '"telegram-bridge"', replacement: '"tg"' },
|
|
2693
|
-
{ file: path.join(os.homedir(), ".quadwork", "agentchattr-discord", "discord_bridge.py"), old: '"discord-bridge"', replacement: '"dc"' },
|
|
2694
|
-
];
|
|
2695
|
-
for (const { file, old, replacement } of BRIDGE_SLUG_PATCHES) {
|
|
2696
|
-
try {
|
|
2697
|
-
if (!fs.existsSync(file)) continue;
|
|
2698
|
-
const content = fs.readFileSync(file, "utf-8");
|
|
2699
|
-
if (!content.includes(old)) continue;
|
|
2700
|
-
fs.writeFileSync(file, content.replaceAll(old, replacement));
|
|
2701
|
-
console.log(`[bridge-migrate] patched stale bridge_sender in ${path.basename(file)}`);
|
|
2702
|
-
} catch {}
|
|
2703
|
-
}
|
|
2704
1725
|
|
|
2705
1726
|
// reseed stale slugs
|
|
2706
1727
|
const SLUG_FIXES = [
|
|
@@ -2736,109 +1757,6 @@ function runStartupMigrations(cfg) {
|
|
|
2736
1757
|
}
|
|
2737
1758
|
}
|
|
2738
1759
|
|
|
2739
|
-
// ghost-fix + idle-fix
|
|
2740
|
-
for (const p of projects) {
|
|
2741
|
-
const acDir = resolveProjectChattr(p.id).dir;
|
|
2742
|
-
const regPath = path.join(acDir, "registry.py");
|
|
2743
|
-
if (fs.existsSync(regPath)) {
|
|
2744
|
-
try {
|
|
2745
|
-
let reg = fs.readFileSync(regPath, "utf-8");
|
|
2746
|
-
if (!reg.includes("force: bool")) {
|
|
2747
|
-
reg = reg.replace(
|
|
2748
|
-
/def register\(self, base: str, label: str \| None = None\) -> dict \| None:/,
|
|
2749
|
-
"def register(self, base: str, label: str | None = None, force: bool = False) -> dict | None:",
|
|
2750
|
-
);
|
|
2751
|
-
reg = reg.replace(
|
|
2752
|
-
" self._expire_reserved()\n\n # Find next free slot",
|
|
2753
|
-
" self._expire_reserved()\n\n" +
|
|
2754
|
-
" # quadwork#478 + #502: force-replace\n" +
|
|
2755
|
-
" if force:\n" +
|
|
2756
|
-
" ghosts = [n for n, i in self._instances.items() if i.base == base]\n" +
|
|
2757
|
-
" for name in ghosts:\n" +
|
|
2758
|
-
" del self._instances[name]\n" +
|
|
2759
|
-
" stale_reserved = [rn for rn in self._reserved\n" +
|
|
2760
|
-
" if self._parse_name(rn)[0] == base]\n" +
|
|
2761
|
-
" for rn in stale_reserved:\n" +
|
|
2762
|
-
" del self._reserved[rn]\n\n" +
|
|
2763
|
-
" # Find next free slot",
|
|
2764
|
-
);
|
|
2765
|
-
fs.writeFileSync(regPath, reg);
|
|
2766
|
-
console.log(`[ghost-fix] ${p.id}: patched registry.py with force-replace support`);
|
|
2767
|
-
} else if (!reg.includes("stale_reserved")) {
|
|
2768
|
-
reg = reg.replace(
|
|
2769
|
-
/( +)for name in ghosts:\n\1 del self\._instances\[name\]\n\1 self\._reserved\[name\] = time\.time\(\)/,
|
|
2770
|
-
"$1for name in ghosts:\n$1 del self._instances[name]\n" +
|
|
2771
|
-
"$1stale_reserved = [rn for rn in self._reserved\n" +
|
|
2772
|
-
"$1 if self._parse_name(rn)[0] == base]\n" +
|
|
2773
|
-
"$1for rn in stale_reserved:\n" +
|
|
2774
|
-
"$1 del self._reserved[rn]",
|
|
2775
|
-
);
|
|
2776
|
-
fs.writeFileSync(regPath, reg);
|
|
2777
|
-
console.log(`[ghost-fix] ${p.id}: upgraded registry.py force-replace to clear _reserved (#502)`);
|
|
2778
|
-
}
|
|
2779
|
-
} catch (err) {
|
|
2780
|
-
console.warn(`[ghost-fix] ${p.id}: failed to patch registry.py: ${err.message}`);
|
|
2781
|
-
}
|
|
2782
|
-
}
|
|
2783
|
-
const appPath = path.join(acDir, "app.py");
|
|
2784
|
-
if (fs.existsSync(appPath)) {
|
|
2785
|
-
try {
|
|
2786
|
-
let app = fs.readFileSync(appPath, "utf-8");
|
|
2787
|
-
if (!app.includes("force = bool(body.get(\"force\"")) {
|
|
2788
|
-
app = app.replace(
|
|
2789
|
-
" result = registry.register(base, label)\n",
|
|
2790
|
-
" force = bool(body.get(\"force\", False))\n result = registry.register(base, label, force=force)\n",
|
|
2791
|
-
);
|
|
2792
|
-
fs.writeFileSync(appPath, app);
|
|
2793
|
-
console.log(`[ghost-fix] ${p.id}: patched app.py with force-replace support`);
|
|
2794
|
-
}
|
|
2795
|
-
} catch (err) {
|
|
2796
|
-
console.warn(`[ghost-fix] ${p.id}: failed to patch app.py: ${err.message}`);
|
|
2797
|
-
}
|
|
2798
|
-
}
|
|
2799
|
-
if (fs.existsSync(appPath)) {
|
|
2800
|
-
try {
|
|
2801
|
-
const app = fs.readFileSync(appPath, "utf-8");
|
|
2802
|
-
if (app.includes("_CRASH_TIMEOUT = 15")) {
|
|
2803
|
-
patchCrashTimeout(acDir);
|
|
2804
|
-
console.log(`[idle-fix] ${p.id}: crash timeout patched on disk`);
|
|
2805
|
-
acRestartNeeded.push(p.id);
|
|
2806
|
-
}
|
|
2807
|
-
} catch (err) {
|
|
2808
|
-
console.warn(`[idle-fix] ${p.id}: failed to patch app.py crash timeout: ${err.message}`);
|
|
2809
|
-
}
|
|
2810
|
-
}
|
|
2811
|
-
}
|
|
2812
|
-
|
|
2813
|
-
// CLI-based agent sections
|
|
2814
|
-
for (const p of projects) {
|
|
2815
|
-
const acPath = projectAgentchattrConfigPath(p.id);
|
|
2816
|
-
if (!fs.existsSync(acPath)) continue;
|
|
2817
|
-
try {
|
|
2818
|
-
let toml = fs.readFileSync(acPath, "utf-8");
|
|
2819
|
-
const cliSections = new Set();
|
|
2820
|
-
for (const [, agentCfg] of Object.entries(p.agents || {})) {
|
|
2821
|
-
const cmd = agentCfg.command || "claude";
|
|
2822
|
-
const cli = cmd.split("/").pop().split(" ")[0];
|
|
2823
|
-
cliSections.add(cli);
|
|
2824
|
-
}
|
|
2825
|
-
let changed = false;
|
|
2826
|
-
for (const cli of cliSections) {
|
|
2827
|
-
if (!new RegExp(`^\\[agents\\.${cli}\\]`, "m").test(toml)) {
|
|
2828
|
-
const injectMode = cli === "codex" ? "proxy_flag" : cli === "gemini" ? "env" : "flag";
|
|
2829
|
-
toml += `\n[agents.${cli}]\ncommand = "${cli}"\nlabel = "${cli}"\nmcp_inject = "${injectMode}"\n`;
|
|
2830
|
-
changed = true;
|
|
2831
|
-
}
|
|
2832
|
-
}
|
|
2833
|
-
if (changed) {
|
|
2834
|
-
fs.writeFileSync(acPath, toml);
|
|
2835
|
-
console.log(`[#596] ${p.id}: added CLI-based agent sections to config.toml`);
|
|
2836
|
-
}
|
|
2837
|
-
} catch (err) {
|
|
2838
|
-
console.warn(`[#596] ${p.id}: config.toml migration failed: ${err.message}`);
|
|
2839
|
-
}
|
|
2840
|
-
}
|
|
2841
|
-
|
|
2842
1760
|
// #690: seed DESIGN-GUIDE.md into existing agent worktrees
|
|
2843
1761
|
const designGuideSrc = path.join(__dirname, "..", "templates", "seeds", "DESIGN-GUIDE.md");
|
|
2844
1762
|
if (fs.existsSync(designGuideSrc)) {
|
|
@@ -2859,97 +1777,66 @@ function runStartupMigrations(cfg) {
|
|
|
2859
1777
|
}
|
|
2860
1778
|
}
|
|
2861
1779
|
|
|
2862
|
-
return acRestartNeeded;
|
|
2863
1780
|
}
|
|
2864
1781
|
|
|
2865
1782
|
server.listen(PORT, "127.0.0.1", async () => {
|
|
2866
1783
|
console.log(`QuadWork server listening on http://127.0.0.1:${PORT}`);
|
|
2867
1784
|
syncTriggersFromConfig();
|
|
2868
|
-
// #579: detect AC processes already running (spawned by cmdStart before
|
|
2869
|
-
// the server module loaded). Without this, chattrProcesses is empty on
|
|
2870
|
-
// boot and the health monitor can't track cmdStart-spawned ACs, while
|
|
2871
|
-
// the dashboard's Start button would redundantly kill+respawn them.
|
|
2872
1785
|
const startupCfg = readConfig();
|
|
2873
|
-
|
|
2874
|
-
|
|
2875
|
-
|
|
2876
|
-
|
|
2877
|
-
|
|
2878
|
-
|
|
2879
|
-
|
|
2880
|
-
|
|
2881
|
-
|
|
2882
|
-
|
|
2883
|
-
|
|
1786
|
+
|
|
1787
|
+
// #719: Migrate AC chat history to JSONL before initializing file-chat.
|
|
1788
|
+
const migrationFailed = new Set(runAcMigration(startupCfg));
|
|
1789
|
+
|
|
1790
|
+
// #722: One-time switchover — set all projects to file-based chat.
|
|
1791
|
+
if (!startupCfg.file_chat_switchover_done) {
|
|
1792
|
+
let switched = false;
|
|
1793
|
+
for (const p of (startupCfg.projects || [])) {
|
|
1794
|
+
if (p.chat_mode !== "file" && !migrationFailed.has(p.id)) {
|
|
1795
|
+
p.chat_mode = "file";
|
|
1796
|
+
switched = true;
|
|
1797
|
+
console.log(`[startup] ${p.id}: switched to file-based chat`);
|
|
1798
|
+
}
|
|
2884
1799
|
}
|
|
1800
|
+
startupCfg.file_chat_switchover_done = true;
|
|
1801
|
+
writeConfig(startupCfg);
|
|
1802
|
+
if (switched) console.log("[startup] file-chat switchover complete");
|
|
2885
1803
|
}
|
|
2886
|
-
|
|
2887
|
-
//
|
|
2888
|
-
// instances receive the fix without requiring a restart.
|
|
2889
|
-
// #448: retry after 5s for projects where AC isn't up yet at boot.
|
|
1804
|
+
|
|
1805
|
+
// Initialize file-chat engine for all projects.
|
|
2890
1806
|
for (const p of (startupCfg.projects || [])) {
|
|
2891
|
-
|
|
2892
|
-
|
|
2893
|
-
|
|
2894
|
-
|
|
2895
|
-
|
|
2896
|
-
|
|
2897
|
-
|
|
2898
|
-
|
|
2899
|
-
|
|
2900
|
-
|
|
2901
|
-
|
|
2902
|
-
|
|
2903
|
-
if (startupCfg._acRestartNeeded) {
|
|
2904
|
-
for (const projectId of startupCfg._acRestartNeeded) {
|
|
2905
|
-
const { url } = resolveProjectChattr(projectId);
|
|
2906
|
-
const portMatch = url.match(/:(\d+)/);
|
|
2907
|
-
const port = portMatch ? parseInt(portMatch[1], 10) : 8300;
|
|
2908
|
-
isPortAlive(port).then((alive) => {
|
|
2909
|
-
if (!alive) return;
|
|
2910
|
-
console.log(`[idle-fix] ${projectId}: restarting AC (port ${port}) so running process observes _CRASH_TIMEOUT = 120 (#629)`);
|
|
2911
|
-
return fetch(`http://127.0.0.1:${PORT}/api/agentchattr/${encodeURIComponent(projectId)}/restart`, {
|
|
2912
|
-
method: "POST",
|
|
2913
|
-
headers: { "Content-Type": "application/json" },
|
|
2914
|
-
body: JSON.stringify({ action: "restart" }),
|
|
2915
|
-
});
|
|
2916
|
-
}).then((r) => {
|
|
2917
|
-
if (r && r.ok) console.log(`[idle-fix] ${projectId}: AC restarted successfully`);
|
|
2918
|
-
else if (r) console.warn(`[idle-fix] ${projectId}: AC restart returned ${r.status}`);
|
|
2919
|
-
}).catch((err) => {
|
|
2920
|
-
console.warn(`[idle-fix] ${projectId}: AC restart failed: ${err.message}`);
|
|
2921
|
-
});
|
|
1807
|
+
if (p.chat_mode === "file") {
|
|
1808
|
+
if (migrationFailed.has(p.id)) {
|
|
1809
|
+
console.error(`[startup] ${p.id}: migration failed — skipping file-chat init`);
|
|
1810
|
+
continue;
|
|
1811
|
+
}
|
|
1812
|
+
try {
|
|
1813
|
+
fileChat.initProject(p.id);
|
|
1814
|
+
console.log(`[startup] ${p.id}: file-chat engine initialized`);
|
|
1815
|
+
} catch (err) {
|
|
1816
|
+
console.error(`[startup] FATAL: ${p.id}: ${err.message}`);
|
|
1817
|
+
process.exit(1);
|
|
1818
|
+
}
|
|
2922
1819
|
}
|
|
2923
1820
|
}
|
|
2924
|
-
|
|
1821
|
+
|
|
1822
|
+
runStartupMigrations(startupCfg);
|
|
1823
|
+
|
|
2925
1824
|
if (startupCfg.butler && startupCfg.butler.enabled && startupCfg.butler.auto_start) {
|
|
2926
1825
|
const result = spawnButlerPty();
|
|
2927
1826
|
if (result.ok) console.log(`[butler] auto-started (PID: ${result.pid})`);
|
|
2928
1827
|
else console.warn(`[butler] auto-start failed: ${result.error}`);
|
|
2929
1828
|
}
|
|
2930
|
-
|
|
2931
|
-
startAcHealthMonitor();
|
|
1829
|
+
startWatchdog();
|
|
2932
1830
|
});
|
|
2933
1831
|
|
|
2934
|
-
|
|
2935
|
-
|
|
2936
|
-
|
|
2937
|
-
|
|
2938
|
-
|
|
2939
|
-
|
|
2940
|
-
* invisible to the CLI. Without this, a Ctrl+C in the foreground
|
|
2941
|
-
* quadwork terminal would exit the Node process and orphan every
|
|
2942
|
-
* dashboard-started python run.py. See review on quadwork#213.
|
|
2943
|
-
*/
|
|
2944
|
-
function shutdownChattrProcesses() {
|
|
2945
|
-
for (const [, proc] of chattrProcesses) {
|
|
2946
|
-
if (proc && proc.process) {
|
|
2947
|
-
try { proc.process.kill("SIGTERM"); } catch {}
|
|
1832
|
+
function shutdown() {
|
|
1833
|
+
stopButlerPty();
|
|
1834
|
+
const cfg = readConfig();
|
|
1835
|
+
for (const p of (cfg.projects || [])) {
|
|
1836
|
+
if (p.chat_mode === "file") {
|
|
1837
|
+
try { fileChat.shutdownProject(p.id); } catch {}
|
|
2948
1838
|
}
|
|
2949
1839
|
}
|
|
2950
|
-
chattrProcesses.clear();
|
|
2951
|
-
// #631: stop Butler PTY on shutdown
|
|
2952
|
-
stopButlerPty();
|
|
2953
1840
|
}
|
|
2954
1841
|
|
|
2955
|
-
module.exports = {
|
|
1842
|
+
module.exports = { shutdown };
|