quadwork 1.19.3 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +19 -35
- package/bin/quadwork.js +48 -1118
- package/out/404.html +1 -1
- package/out/__next.__PAGE__.txt +3 -3
- package/out/__next._full.txt +14 -14
- package/out/__next._head.txt +4 -4
- package/out/__next._index.txt +8 -8
- package/out/__next._tree.txt +2 -2
- package/out/_next/static/chunks/{030cjkhts487t.js → 079wdniva~de1.js} +1 -1
- package/out/_next/static/chunks/{0n~dq4kpx9xxx.js → 07lhk_q6pmm3r.js} +1 -1
- package/out/_next/static/chunks/0_79hkefw1mo2.js +1 -0
- package/out/_next/static/chunks/{08tog0xc~.es_.js → 0_lyyn..t63bc.js} +1 -1
- package/out/_next/static/chunks/0oxv9vrvc17to.js +2 -0
- package/out/_next/static/chunks/0py7102i226n5.js +1 -0
- package/out/_next/static/chunks/{13fv-yi7.v52g.js → 0q4bm04c1jl_3.js} +1 -1
- package/out/_next/static/chunks/{0_idxioyl0p7h.js → 0sjhy6oe3mbon.js} +1 -1
- package/out/_next/static/chunks/13xk0vgfbrcld.css +2 -0
- package/out/_next/static/chunks/14k3bfe537f9_.js +25 -0
- package/out/_next/static/chunks/{turbopack-0qm-e3ifrz~2u.js → turbopack-0y2u-q0l2m67w.js} +1 -1
- package/out/_not-found/__next._full.txt +13 -13
- package/out/_not-found/__next._head.txt +4 -4
- package/out/_not-found/__next._index.txt +8 -8
- package/out/_not-found/__next._not-found.__PAGE__.txt +2 -2
- package/out/_not-found/__next._not-found.txt +3 -3
- package/out/_not-found/__next._tree.txt +2 -2
- package/out/_not-found.html +1 -1
- package/out/_not-found.txt +13 -13
- package/out/app-shell/__next._full.txt +13 -13
- package/out/app-shell/__next._head.txt +4 -4
- package/out/app-shell/__next._index.txt +8 -8
- package/out/app-shell/__next._tree.txt +2 -2
- package/out/app-shell/__next.app-shell.__PAGE__.txt +2 -2
- package/out/app-shell/__next.app-shell.txt +3 -3
- package/out/app-shell.html +1 -1
- package/out/app-shell.txt +13 -13
- package/out/index.html +1 -1
- package/out/index.txt +14 -14
- package/out/project/_/__next._full.txt +14 -14
- package/out/project/_/__next._head.txt +4 -4
- package/out/project/_/__next._index.txt +8 -8
- package/out/project/_/__next._tree.txt +2 -2
- package/out/project/_/__next.project.$d$id.__PAGE__.txt +3 -3
- package/out/project/_/__next.project.$d$id.txt +3 -3
- package/out/project/_/__next.project.txt +3 -3
- package/out/project/_/queue/__next._full.txt +14 -14
- package/out/project/_/queue/__next._head.txt +4 -4
- package/out/project/_/queue/__next._index.txt +8 -8
- package/out/project/_/queue/__next._tree.txt +2 -2
- package/out/project/_/queue/__next.project.$d$id.queue.__PAGE__.txt +3 -3
- package/out/project/_/queue/__next.project.$d$id.queue.txt +3 -3
- package/out/project/_/queue/__next.project.$d$id.txt +3 -3
- package/out/project/_/queue/__next.project.txt +3 -3
- package/out/project/_/queue.html +1 -1
- package/out/project/_/queue.txt +14 -14
- package/out/project/_.html +1 -1
- package/out/project/_.txt +14 -14
- package/out/settings/__next._full.txt +14 -14
- package/out/settings/__next._head.txt +4 -4
- package/out/settings/__next._index.txt +8 -8
- package/out/settings/__next._tree.txt +2 -2
- package/out/settings/__next.settings.__PAGE__.txt +3 -3
- package/out/settings/__next.settings.txt +3 -3
- package/out/settings.html +1 -1
- package/out/settings.txt +14 -14
- package/out/setup/__next._full.txt +14 -14
- package/out/setup/__next._head.txt +4 -4
- package/out/setup/__next._index.txt +8 -8
- package/out/setup/__next._tree.txt +2 -2
- package/out/setup/__next.setup.__PAGE__.txt +3 -3
- package/out/setup/__next.setup.txt +3 -3
- package/out/setup.html +1 -1
- package/out/setup.txt +14 -14
- package/package.json +4 -2
- package/server/ac-restore.js +128 -0
- package/server/bridges/discord.js +183 -0
- package/server/bridges/telegram.js +210 -0
- package/server/config.js +4 -60
- package/server/file-chat.js +318 -0
- package/server/index.js +129 -1294
- package/server/install-agentchattr.js +3 -284
- package/server/mcp-chat-shim.js +171 -0
- package/server/migrate-ac.js +158 -0
- package/server/pty-dispatcher.js +188 -0
- package/server/routes.js +149 -1397
- package/templates/CLAUDE.md +2 -2
- package/templates/OVERNIGHT-QUEUE.md +1 -1
- package/templates/seeds/butler.CLAUDE.md +30 -62
- package/templates/seeds/dev.AGENTS.md +10 -1
- package/templates/seeds/head.AGENTS.md +3 -3
- package/templates/seeds/re1.AGENTS.md +3 -3
- package/templates/seeds/re2.AGENTS.md +3 -3
- package/bridges/discord/__pycache__/discord_bridge.cpython-314.pyc +0 -0
- package/bridges/discord/discord_bridge.py +0 -666
- package/bridges/discord/requirements.txt +0 -2
- package/out/_next/static/chunks/08kw.2kplxa.6.css +0 -2
- package/out/_next/static/chunks/0_nm7se0m3twm.js +0 -25
- package/out/_next/static/chunks/0uz5svjlo9dwl.js +0 -1
- package/out/_next/static/chunks/0zahstmgdrpy5.js +0 -1
- package/out/_next/static/chunks/0zfotsowwll1x.js +0 -2
- package/server/__tests__/bridge-auto-stop-guard.test.js +0 -134
- package/server/__tests__/rate-limit-handling.test.js +0 -168
- package/server/__tests__/scrub-secrets.test.js +0 -235
- package/server/__tests__/v1110-security-qa.test.js +0 -312
- package/server/agentchattr-registry.js +0 -188
- package/server/install-agentchattr.patchCrashTimeout.test.js +0 -71
- package/server/queue-watcher.js +0 -171
- package/server/queue-watcher.test.js +0 -64
- package/server/routes.batchProgress.test.js +0 -94
- package/server/routes.chatWsSend.test.js +0 -161
- package/server/routes.discordBridge.test.js +0 -80
- package/server/routes.parseActiveBatch.test.js +0 -88
- package/server/routes.telegramBridge.test.js +0 -241
- package/templates/config.toml +0 -72
- package/templates/wrapper.py +0 -70
- /package/out/_next/static/{D66Um4H226QD5y4w5xTKq → 479UD5Kit4YvCmtgO25VT}/_buildManifest.js +0 -0
- /package/out/_next/static/{D66Um4H226QD5y4w5xTKq → 479UD5Kit4YvCmtgO25VT}/_clientMiddlewareManifest.js +0 -0
- /package/out/_next/static/{D66Um4H226QD5y4w5xTKq → 479UD5Kit4YvCmtgO25VT}/_ssgManifest.js +0 -0
package/server/index.js
CHANGED
|
@@ -6,21 +6,23 @@ const os = require("os");
|
|
|
6
6
|
const { WebSocketServer, WebSocket } = require("ws");
|
|
7
7
|
const pty = require("node-pty");
|
|
8
8
|
const { spawn } = require("child_process");
|
|
9
|
-
const { readConfig, resolveAgentCwd, resolveAgentCommand,
|
|
9
|
+
const { readConfig, resolveAgentCwd, resolveAgentCommand, CONFIG_PATH, ensureSecureDir, writeSecureFile, writeConfig } = require("./config");
|
|
10
10
|
const routes = require("./routes");
|
|
11
|
-
const
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
projectAgentchattrConfigPath,
|
|
15
|
-
} = routes;
|
|
16
|
-
const { waitForAgentChattrReady, registerAgent, registerAgentWithRetry, deregisterAgent, startHeartbeat, stopHeartbeat } = require("./agentchattr-registry");
|
|
17
|
-
const { patchAgentchattrCss, patchCrashTimeout } = require("./install-agentchattr");
|
|
18
|
-
const { startQueueWatcher, stopQueueWatcher } = require("./queue-watcher");
|
|
11
|
+
const fileChat = require("./file-chat");
|
|
12
|
+
const { dispatchToAgentPTY, cleanupSession: cleanupPtyDispatcher } = require("./pty-dispatcher");
|
|
13
|
+
const { runAcMigration } = require("./migrate-ac");
|
|
19
14
|
|
|
20
15
|
const net = require("net");
|
|
21
16
|
const config = readConfig();
|
|
22
17
|
const PORT = config.port || 8400;
|
|
23
18
|
|
|
19
|
+
function emitSystemMessage(projectId, text) {
|
|
20
|
+
try {
|
|
21
|
+
if (routes.getProjectChatMode(projectId) !== "file") return;
|
|
22
|
+
fileChat.appendMessage(projectId, { sender: "system", type: "system", text });
|
|
23
|
+
} catch {}
|
|
24
|
+
}
|
|
25
|
+
|
|
24
26
|
const app = express();
|
|
25
27
|
// #412 / quadwork#279: bump the global JSON body limit to 10mb so
|
|
26
28
|
// POST /api/project-history can accept full chat exports. The
|
|
@@ -33,6 +35,14 @@ app.use(express.json({ limit: "10mb" }));
|
|
|
33
35
|
// --- Mount migrated API routes (from Next.js) ---
|
|
34
36
|
app.use(routes);
|
|
35
37
|
|
|
38
|
+
// #730: wire PTY injection dispatcher into the chat route
|
|
39
|
+
routes.setPtyDispatchCallback((projectId, msg) => {
|
|
40
|
+
dispatchToAgentPTY(projectId, msg, agentSessions, {
|
|
41
|
+
isLoopGuardPaused: fileChat.isLoopGuardPaused,
|
|
42
|
+
safeWrite,
|
|
43
|
+
});
|
|
44
|
+
});
|
|
45
|
+
|
|
36
46
|
const server = http.createServer(app);
|
|
37
47
|
|
|
38
48
|
// --- REST endpoints ---
|
|
@@ -163,9 +173,6 @@ app.get("/api/caffeinate/status", (_req, res) => {
|
|
|
163
173
|
// PTY (term) is the source of truth for "running". WS is optional (attaches to view terminal).
|
|
164
174
|
const agentSessions = new Map();
|
|
165
175
|
|
|
166
|
-
// AgentChattr server processes — per-project (key = projectId)
|
|
167
|
-
const chattrProcesses = new Map();
|
|
168
|
-
|
|
169
176
|
// #631: Butler session — single global PTY (not per-project, no AC integration)
|
|
170
177
|
let butlerSession = { term: null, viewers: new Set(), viewerDims: new Map(), lastDims: null, state: "stopped", error: null, scrollback: Buffer.alloc(0) };
|
|
171
178
|
|
|
@@ -317,6 +324,27 @@ function writeMcpConfigFile(projectId, agentId, mcpHttpPort, token) {
|
|
|
317
324
|
return filePath;
|
|
318
325
|
}
|
|
319
326
|
|
|
327
|
+
function writeFileChatMcpConfig(projectId, agentId, serverPort) {
|
|
328
|
+
const os = require("os");
|
|
329
|
+
const crypto = require("crypto");
|
|
330
|
+
const configDir = path.join(os.homedir(), ".quadwork", projectId);
|
|
331
|
+
ensureSecureDir(configDir);
|
|
332
|
+
const filePath = path.join(configDir, `mcp-${agentId}.json`);
|
|
333
|
+
const shimPath = path.join(__dirname, "mcp-chat-shim.js");
|
|
334
|
+
const token = crypto.randomBytes(16).toString("hex");
|
|
335
|
+
fileChat.registerShimToken(projectId, agentId, token);
|
|
336
|
+
const config = {
|
|
337
|
+
mcpServers: {
|
|
338
|
+
chat: {
|
|
339
|
+
command: "node",
|
|
340
|
+
args: [shimPath, "--project", projectId, "--agent", agentId, "--port", String(serverPort), "--token", token],
|
|
341
|
+
},
|
|
342
|
+
},
|
|
343
|
+
};
|
|
344
|
+
writeSecureFile(filePath, JSON.stringify(config, null, 2));
|
|
345
|
+
return { filePath, token };
|
|
346
|
+
}
|
|
347
|
+
|
|
320
348
|
/**
|
|
321
349
|
* Build extra launch args for an agent (permission flags + MCP injection).
|
|
322
350
|
* Async because Codex proxy_flag mode needs to await proxy startup.
|
|
@@ -324,16 +352,12 @@ function writeMcpConfigFile(projectId, agentId, mcpHttpPort, token) {
|
|
|
324
352
|
async function buildAgentArgs(projectId, agentId) {
|
|
325
353
|
const cfg = readConfig();
|
|
326
354
|
const project = cfg.projects?.find((p) => p.id === projectId);
|
|
327
|
-
if (!project) return { args: []
|
|
355
|
+
if (!project) return { args: [] };
|
|
328
356
|
|
|
329
357
|
const agentCfg = project.agents?.[agentId] || {};
|
|
330
358
|
const command = agentCfg.command || "claude";
|
|
331
|
-
const cliBase = command.split("/").pop().split(" ")[0];
|
|
359
|
+
const cliBase = command.split("/").pop().split(" ")[0];
|
|
332
360
|
const args = [];
|
|
333
|
-
let acRegistrationName = null;
|
|
334
|
-
let acServerPort = null;
|
|
335
|
-
let acRegistrationToken = null;
|
|
336
|
-
let acInjectMode = null;
|
|
337
361
|
|
|
338
362
|
// Permission bypass flags
|
|
339
363
|
if (agentCfg.auto_approve !== false) {
|
|
@@ -367,93 +391,22 @@ async function buildAgentArgs(projectId, agentId) {
|
|
|
367
391
|
}
|
|
368
392
|
}
|
|
369
393
|
|
|
370
|
-
// MCP config injection
|
|
371
|
-
const
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
const
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
// (git clone + venv + pip install) before it can bind a port.
|
|
384
|
-
const acReady = await waitForAgentChattrReady(acServerPort, 30000);
|
|
385
|
-
if (!acReady) {
|
|
386
|
-
console.warn(`[#565] Agent ${agentId}: AC not reachable on port ${acServerPort} after 30s. Spawning without chat integration.`);
|
|
387
|
-
// #565: preserve acServerPort and acInjectMode so deferred
|
|
388
|
-
// recovery in spawnAgentPty can retry registration later.
|
|
389
|
-
return { args, acRegistrationName: null, acServerPort, acRegistrationToken: null, acInjectMode: injectMode, acMcpHttpPort: mcpHttpPort || null };
|
|
390
|
-
}
|
|
391
|
-
// #242: best-effort deregister any stale registration of the
|
|
392
|
-
// canonical name (left over by a crashed previous QuadWork
|
|
393
|
-
// session) so the fresh register lands at slot 1 instead of
|
|
394
|
-
// head-2 / re2-2. We need the previous agent's bearer
|
|
395
|
-
// token because app.py:2123 requires authenticated agent
|
|
396
|
-
// session for family names — load it from disk (persisted
|
|
397
|
-
// across restarts). Failures are non-fatal.
|
|
398
|
-
const stalePersistedToken = readPersistedAgentToken(projectId, agentId);
|
|
399
|
-
if (stalePersistedToken) {
|
|
400
|
-
await deregisterAgent(acServerPort, agentId, stalePersistedToken).catch(() => {});
|
|
401
|
-
clearPersistedAgentToken(projectId, agentId);
|
|
402
|
-
}
|
|
403
|
-
// #478: force-replace so AC expires any ghost slots for this base
|
|
404
|
-
// #565: retry with backoff and degrade gracefully if AC is not ready
|
|
405
|
-
const registration = await registerAgentWithRetry(acServerPort, agentId, agentCfg.display_name || null, { force: true });
|
|
406
|
-
if (!registration) {
|
|
407
|
-
console.warn(`[#565] Agent ${agentId}: AC registration failed after retries (${registerAgent.lastError}). Spawning without chat integration.`);
|
|
408
|
-
} else {
|
|
409
|
-
acRegistrationName = registration.name;
|
|
410
|
-
acRegistrationToken = registration.token;
|
|
411
|
-
writePersistedAgentToken(projectId, agentId, registration.token);
|
|
412
|
-
const mcpConfigPath = writeMcpConfigFile(projectId, agentId, mcpHttpPort, registration.token);
|
|
413
|
-
const flag = agentCfg.mcp_flag || "--mcp-config";
|
|
414
|
-
args.push(flag, mcpConfigPath);
|
|
415
|
-
}
|
|
416
|
-
} else if (injectMode === "proxy_flag") {
|
|
417
|
-
// Codex: register with AgentChattr first (#240) so the proxy
|
|
418
|
-
// injects a real per-agent token, not the global session token.
|
|
419
|
-
// Resolve via resolveProjectChattr so legacy/global-config
|
|
420
|
-
// projects without a per-project agentchattr_url still work.
|
|
421
|
-
const chattrInfo = resolveProjectChattr(projectId);
|
|
422
|
-
acServerPort = Number(new URL(chattrInfo.url).port) || 8300;
|
|
423
|
-
// #565: extend timeout to 30s for first-setup scenario
|
|
424
|
-
const acReady = await waitForAgentChattrReady(acServerPort, 30000);
|
|
425
|
-
if (!acReady) {
|
|
426
|
-
console.warn(`[#565] Agent ${agentId}: AC not reachable on port ${acServerPort} after 30s. Spawning without chat integration.`);
|
|
427
|
-
// #565: preserve acServerPort and acInjectMode so deferred
|
|
428
|
-
// recovery in spawnAgentPty can retry registration later.
|
|
429
|
-
return { args, acRegistrationName: null, acServerPort, acRegistrationToken: null, acInjectMode: injectMode, acMcpHttpPort: mcpHttpPort || null };
|
|
430
|
-
}
|
|
431
|
-
// #242: best-effort deregister stale canonical name first using
|
|
432
|
-
// the persisted bearer token from a previous session.
|
|
433
|
-
const stalePersistedToken = readPersistedAgentToken(projectId, agentId);
|
|
434
|
-
if (stalePersistedToken) {
|
|
435
|
-
await deregisterAgent(acServerPort, agentId, stalePersistedToken).catch(() => {});
|
|
436
|
-
clearPersistedAgentToken(projectId, agentId);
|
|
437
|
-
}
|
|
438
|
-
// #478: force-replace so AC expires any ghost slots for this base
|
|
439
|
-
// #565: retry with backoff and degrade gracefully if AC is not ready
|
|
440
|
-
const registration = await registerAgentWithRetry(acServerPort, agentId, agentCfg.display_name || null, { force: true });
|
|
441
|
-
if (!registration) {
|
|
442
|
-
console.warn(`[#565] Agent ${agentId}: AC registration failed after retries (${registerAgent.lastError}). Spawning without chat integration.`);
|
|
443
|
-
} else {
|
|
444
|
-
acRegistrationName = registration.name;
|
|
445
|
-
acRegistrationToken = registration.token;
|
|
446
|
-
writePersistedAgentToken(projectId, agentId, registration.token);
|
|
447
|
-
const upstreamUrl = `http://127.0.0.1:${mcpHttpPort}`;
|
|
448
|
-
const proxyUrl = await startMcpProxy(projectId, agentId, upstreamUrl, registration.token);
|
|
449
|
-
if (proxyUrl) {
|
|
450
|
-
args.push("-c", `mcp_servers.agentchattr.url="${proxyUrl}"`);
|
|
451
|
-
}
|
|
452
|
-
}
|
|
453
|
-
}
|
|
394
|
+
// MCP config injection — file-chat shim
|
|
395
|
+
const injectMode = agentCfg.mcp_inject || (cliBase === "codex" ? "proxy_flag" : cliBase === "gemini" ? "env" : "flag");
|
|
396
|
+
if (injectMode === "flag") {
|
|
397
|
+
const { filePath: mcpConfigPath } = writeFileChatMcpConfig(projectId, agentId, PORT);
|
|
398
|
+
const mcpFlag = agentCfg.mcp_flag || "--mcp-config";
|
|
399
|
+
args.push(mcpFlag, mcpConfigPath);
|
|
400
|
+
} else if (injectMode === "proxy_flag") {
|
|
401
|
+
const { token: shimToken } = writeFileChatMcpConfig(projectId, agentId, PORT);
|
|
402
|
+
const shimPath = path.join(__dirname, "mcp-chat-shim.js");
|
|
403
|
+
args.push(
|
|
404
|
+
"-c", `mcp_servers.chat.command="node"`,
|
|
405
|
+
"-c", `mcp_servers.chat.args=["${shimPath}","--project","${projectId}","--agent","${agentId}","--port","${PORT}","--token","${shimToken}"]`,
|
|
406
|
+
);
|
|
454
407
|
}
|
|
455
|
-
|
|
456
|
-
return { args
|
|
408
|
+
// env mode (Gemini) handled in buildAgentEnv
|
|
409
|
+
return { args };
|
|
457
410
|
}
|
|
458
411
|
|
|
459
412
|
/**
|
|
@@ -470,18 +423,19 @@ function buildAgentEnv(projectId, agentId) {
|
|
|
470
423
|
const env = {};
|
|
471
424
|
|
|
472
425
|
// Gemini: inject MCP via env var
|
|
473
|
-
if (cliBase === "gemini"
|
|
426
|
+
if (cliBase === "gemini") {
|
|
474
427
|
const os = require("os");
|
|
475
428
|
const configDir = path.join(os.homedir(), ".quadwork", projectId);
|
|
476
429
|
ensureSecureDir(configDir);
|
|
477
430
|
const settingsPath = path.join(configDir, `mcp-${agentId}-settings.json`);
|
|
478
|
-
|
|
431
|
+
|
|
432
|
+
const { token: shimToken } = writeFileChatMcpConfig(projectId, agentId, PORT);
|
|
433
|
+
const shimPath = path.join(__dirname, "mcp-chat-shim.js");
|
|
479
434
|
const settings = {
|
|
480
435
|
mcpServers: {
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
...(project.agentchattr_token ? { headers: { Authorization: `Bearer ${project.agentchattr_token}` } } : {}),
|
|
436
|
+
chat: {
|
|
437
|
+
command: "node",
|
|
438
|
+
args: [shimPath, "--project", projectId, "--agent", agentId, "--port", String(PORT), "--token", shimToken],
|
|
485
439
|
},
|
|
486
440
|
},
|
|
487
441
|
};
|
|
@@ -492,76 +446,8 @@ function buildAgentEnv(projectId, agentId) {
|
|
|
492
446
|
return env;
|
|
493
447
|
}
|
|
494
448
|
|
|
495
|
-
/**
|
|
496
|
-
* #394 / quadwork#253: recover from a heartbeat 409 (AgentChattr was
|
|
497
|
-
* restarted, in-memory registry wiped, our token is now stale). Mirrors
|
|
498
|
-
* wrapper.py:732-741. Re-registers the running agent, swaps the
|
|
499
|
-
* tracked name/token on the live session so the heartbeat interval
|
|
500
|
-
* picks up the new credentials on its next tick, refreshes whichever
|
|
501
|
-
* MCP transport this agent uses (Claude config file vs Codex proxy),
|
|
502
|
-
* and restarts the queue watcher in case the assigned name changed
|
|
503
|
-
* (multi-instance slot bump).
|
|
504
|
-
*
|
|
505
|
-
* Best-effort: any failure here just means the next 5s heartbeat will
|
|
506
|
-
* fail again and we'll re-enter recovery — no tight retry loop because
|
|
507
|
-
* startHeartbeat guards re-entry with `recovering`.
|
|
508
|
-
*/
|
|
509
|
-
async function recoverFrom409(projectId, agentId, session) {
|
|
510
|
-
if (!session.acServerPort) return;
|
|
511
|
-
const cfg = readConfig();
|
|
512
|
-
const project = cfg.projects?.find((p) => p.id === projectId);
|
|
513
|
-
const agentCfg = project?.agents?.[agentId] || {};
|
|
514
|
-
// AC may need a moment to come back up after a restart — wait briefly.
|
|
515
|
-
await waitForAgentChattrReady(session.acServerPort, 10000);
|
|
516
|
-
|
|
517
|
-
// Best-effort cleanup of the stale registration on disk so the
|
|
518
|
-
// fresh register isn't shoved into a slot 2 by leftover state.
|
|
519
|
-
const stale = readPersistedAgentToken(projectId, agentId);
|
|
520
|
-
if (stale) {
|
|
521
|
-
await deregisterAgent(session.acServerPort, agentId, stale).catch(() => {});
|
|
522
|
-
clearPersistedAgentToken(projectId, agentId);
|
|
523
|
-
}
|
|
524
|
-
|
|
525
|
-
// #478: force-replace so AC expires any ghost slots for this base
|
|
526
|
-
const replacement = await registerAgent(session.acServerPort, agentId, agentCfg.display_name || null, { force: true });
|
|
527
|
-
if (!replacement) return;
|
|
528
|
-
|
|
529
|
-
const previousName = session.acRegistrationName;
|
|
530
|
-
session.acRegistrationName = replacement.name;
|
|
531
|
-
session.acRegistrationToken = replacement.token;
|
|
532
|
-
writePersistedAgentToken(projectId, agentId, replacement.token);
|
|
533
|
-
|
|
534
|
-
// Refresh whichever MCP transport this agent uses so subsequent
|
|
535
|
-
// tool calls (and the queue-watcher's `mcp read` injections) hit
|
|
536
|
-
// AC with the new bearer token instead of the now-rejected one.
|
|
537
|
-
if (session.acInjectMode === "flag" && session.acMcpHttpPort) {
|
|
538
|
-
try { writeMcpConfigFile(projectId, agentId, session.acMcpHttpPort, replacement.token); } catch {}
|
|
539
|
-
} else if (session.acInjectMode === "proxy_flag") {
|
|
540
|
-
// Codex is pinned to the original ephemeral proxy URL, so we
|
|
541
|
-
// can't tear the listener down — mutate the token in place.
|
|
542
|
-
try { updateMcpProxyToken(projectId, agentId, replacement.token); } catch {}
|
|
543
|
-
}
|
|
544
|
-
|
|
545
|
-
// If the assigned name changed (e.g. multi-instance slot collision)
|
|
546
|
-
// the queue watcher is now polling the wrong file. Restart it
|
|
547
|
-
// against the new name so chat reaches the right agent.
|
|
548
|
-
if (replacement.name !== previousName && session.term) {
|
|
549
|
-
if (session.queueWatcherHandle) {
|
|
550
|
-
stopQueueWatcher(session.queueWatcherHandle);
|
|
551
|
-
session.queueWatcherHandle = null;
|
|
552
|
-
}
|
|
553
|
-
try {
|
|
554
|
-
const { dir: acDir } = resolveProjectChattr(projectId);
|
|
555
|
-
if (acDir) {
|
|
556
|
-
const dataDir = path.join(acDir, "data");
|
|
557
|
-
session.queueWatcherHandle = startQueueWatcher(dataDir, replacement.name, session.term);
|
|
558
|
-
}
|
|
559
|
-
} catch {}
|
|
560
|
-
}
|
|
561
|
-
}
|
|
562
|
-
|
|
563
449
|
// Helper: spawn a PTY for a project/agent and register in agentSessions
|
|
564
|
-
async function spawnAgentPty(project, agent) {
|
|
450
|
+
async function spawnAgentPty(project, agent, opts = {}) {
|
|
565
451
|
const key = `${project}/${agent}`;
|
|
566
452
|
|
|
567
453
|
const cwd = resolveAgentCwd(project, agent);
|
|
@@ -593,13 +479,6 @@ async function spawnAgentPty(project, agent) {
|
|
|
593
479
|
lastDims: null,
|
|
594
480
|
state: "running",
|
|
595
481
|
error: null,
|
|
596
|
-
acRegistrationName: built.acRegistrationName,
|
|
597
|
-
acServerPort: built.acServerPort,
|
|
598
|
-
acRegistrationToken: built.acRegistrationToken,
|
|
599
|
-
acInjectMode: built.acInjectMode,
|
|
600
|
-
acMcpHttpPort: built.acMcpHttpPort,
|
|
601
|
-
acHeartbeatHandle: null,
|
|
602
|
-
queueWatcherHandle: null,
|
|
603
482
|
lastOutputAt: Date.now(),
|
|
604
483
|
// #418: ring buffer of recent PTY output so reconnecting WS
|
|
605
484
|
// clients see the terminal state instead of a blank panel.
|
|
@@ -608,6 +487,10 @@ async function spawnAgentPty(project, agent) {
|
|
|
608
487
|
};
|
|
609
488
|
agentSessions.set(key, session);
|
|
610
489
|
|
|
490
|
+
if (!opts.suppressLifecycleMsg) {
|
|
491
|
+
emitSystemMessage(project, `${agent} joined`);
|
|
492
|
+
}
|
|
493
|
+
|
|
611
494
|
// #418: capture PTY output into the scrollback ring buffer (64KB).
|
|
612
495
|
// This runs independently of WS — even when no client is connected,
|
|
613
496
|
// the buffer accumulates so the next connect gets replay.
|
|
@@ -621,72 +504,10 @@ async function spawnAgentPty(project, agent) {
|
|
|
621
504
|
}
|
|
622
505
|
});
|
|
623
506
|
|
|
624
|
-
// #391 / quadwork#250: keep this agent alive in AgentChattr by
|
|
625
|
-
// POSTing /api/heartbeat/{name} every 5s. Without it, AC's 60s
|
|
626
|
-
// crash-detection window deregisters the agent and chat messages
|
|
627
|
-
// never reach it. Mirrors wrapper.py:_heartbeat (lines 715-748).
|
|
628
|
-
if (session.acRegistrationName && session.acServerPort && session.acRegistrationToken) {
|
|
629
|
-
// #394 / quadwork#253: pass getters (not raw values) so the 409
|
|
630
|
-
// recovery path below can swap acRegistrationName/Token in place
|
|
631
|
-
// and the very next heartbeat tick uses the replacement
|
|
632
|
-
// credentials without us having to tear down + restart the
|
|
633
|
-
// interval.
|
|
634
|
-
session.acHeartbeatHandle = startHeartbeat(
|
|
635
|
-
session.acServerPort,
|
|
636
|
-
() => session.acRegistrationName,
|
|
637
|
-
() => session.acRegistrationToken,
|
|
638
|
-
{ onConflict: () => recoverFrom409(project, agent, session) },
|
|
639
|
-
);
|
|
640
|
-
}
|
|
641
|
-
|
|
642
|
-
// #393 / quadwork#251: queue watcher — the actual mechanism by
|
|
643
|
-
// which agents pick up chat. Without this an agent can be
|
|
644
|
-
// registered + heartbeating yet still never respond, because
|
|
645
|
-
// AgentChattr only writes to {data_dir}/{name}_queue.jsonl and
|
|
646
|
-
// expects the agent side to poll + inject `mcp read`.
|
|
647
|
-
if (session.acRegistrationName && session.term) {
|
|
648
|
-
try {
|
|
649
|
-
const { dir: acDir } = resolveProjectChattr(project);
|
|
650
|
-
if (acDir) {
|
|
651
|
-
const dataDir = path.join(acDir, "data");
|
|
652
|
-
session.queueWatcherHandle = startQueueWatcher(
|
|
653
|
-
dataDir,
|
|
654
|
-
session.acRegistrationName,
|
|
655
|
-
session.term,
|
|
656
|
-
);
|
|
657
|
-
}
|
|
658
|
-
} catch {
|
|
659
|
-
// best-effort — failure here just means no chat injection
|
|
660
|
-
}
|
|
661
|
-
}
|
|
662
|
-
|
|
663
|
-
// #565: deferred restart — if the agent spawned without AC
|
|
664
|
-
// registration (AC wasn't ready or registration failed), wait for
|
|
665
|
-
// AC to come up then stop + respawn the agent so it gets the full
|
|
666
|
-
// MCP CLI args (--mcp-config / -c mcp_servers...url) that can only
|
|
667
|
-
// be set at process launch time.
|
|
668
|
-
if (!session.acRegistrationName && session.acServerPort && session.acInjectMode) {
|
|
669
|
-
const deferredRestart = async () => {
|
|
670
|
-
const ready = await waitForAgentChattrReady(session.acServerPort, 60000);
|
|
671
|
-
if (!ready) {
|
|
672
|
-
// #572: log timeout so operators know the health monitor will
|
|
673
|
-
// handle recovery when AC eventually comes up.
|
|
674
|
-
console.log(`[#565] Agent ${agent}: AC not reachable after 60s — health monitor will restart agent when AC recovers.`);
|
|
675
|
-
return;
|
|
676
|
-
}
|
|
677
|
-
// Guard: agent may have been stopped manually while we waited.
|
|
678
|
-
const current = agentSessions.get(key);
|
|
679
|
-
if (!current || !current.term || current.state !== "running") return;
|
|
680
|
-
console.log(`[#565] Agent ${agent}: AC is now reachable — restarting agent to gain chat integration.`);
|
|
681
|
-
await stopAgentSession(key);
|
|
682
|
-
await spawnAgentPty(project, agent);
|
|
683
|
-
};
|
|
684
|
-
deferredRestart().catch(() => {});
|
|
685
|
-
}
|
|
686
|
-
|
|
687
507
|
term.onExit(({ exitCode }) => {
|
|
688
508
|
const current = agentSessions.get(key);
|
|
689
509
|
if (current && current.term === term) {
|
|
510
|
+
cleanupPtyDispatcher(key);
|
|
690
511
|
current.state = "stopped";
|
|
691
512
|
current.error = exitCode ? `exit:${exitCode}` : null;
|
|
692
513
|
current.term = null;
|
|
@@ -694,27 +515,6 @@ async function spawnAgentPty(project, agent) {
|
|
|
694
515
|
if (v.readyState <= 1) v.close(1000, `exited:${exitCode}`);
|
|
695
516
|
}
|
|
696
517
|
current.viewers.clear();
|
|
697
|
-
// #391 / quadwork#250: a crashed PTY must also clear its
|
|
698
|
-
// heartbeat interval (otherwise it leaks and a later /start
|
|
699
|
-
// double-registers) and free the AgentChattr slot (otherwise
|
|
700
|
-
// the agent stays falsely `active` forever and the next
|
|
701
|
-
// register lands at slot 2). Deregister is best-effort.
|
|
702
|
-
if (current.acHeartbeatHandle) {
|
|
703
|
-
stopHeartbeat(current.acHeartbeatHandle);
|
|
704
|
-
current.acHeartbeatHandle = null;
|
|
705
|
-
}
|
|
706
|
-
if (current.queueWatcherHandle) {
|
|
707
|
-
stopQueueWatcher(current.queueWatcherHandle);
|
|
708
|
-
current.queueWatcherHandle = null;
|
|
709
|
-
}
|
|
710
|
-
if (current.acRegistrationName && current.acServerPort) {
|
|
711
|
-
deregisterAgent(current.acServerPort, current.acRegistrationName).catch(() => {});
|
|
712
|
-
if (current.projectId && current.agentId) {
|
|
713
|
-
try { clearPersistedAgentToken(current.projectId, current.agentId); } catch {}
|
|
714
|
-
}
|
|
715
|
-
current.acRegistrationName = null;
|
|
716
|
-
current.acRegistrationToken = null;
|
|
717
|
-
}
|
|
718
518
|
}
|
|
719
519
|
});
|
|
720
520
|
|
|
@@ -725,16 +525,16 @@ async function spawnAgentPty(project, agent) {
|
|
|
725
525
|
}
|
|
726
526
|
}
|
|
727
527
|
|
|
728
|
-
// Helper: stop an agent session — kill PTY, close WS, deregister.
|
|
729
|
-
// Async because deregister must complete before a restart re-registers,
|
|
730
|
-
// otherwise the old slot stays occupied and a fresh register lands at
|
|
731
|
-
// head-2 instead of slot 1 (#241).
|
|
732
528
|
async function stopAgentSession(key) {
|
|
733
529
|
const session = agentSessions.get(key);
|
|
734
530
|
if (!session) {
|
|
735
531
|
agentSessions.set(key, { projectId: null, agentId: null, term: null, viewers: new Set(), viewerDims: new Map(), lastDims: null, state: "stopped", error: null });
|
|
736
532
|
return;
|
|
737
533
|
}
|
|
534
|
+
if (session.projectId && session.agentId && !session._suppressLifecycleMsg) {
|
|
535
|
+
emitSystemMessage(session.projectId, `${session.agentId} left`);
|
|
536
|
+
}
|
|
537
|
+
cleanupPtyDispatcher(key);
|
|
738
538
|
if (session.term) {
|
|
739
539
|
try { session.term.kill(); } catch {}
|
|
740
540
|
session.term = null;
|
|
@@ -745,33 +545,6 @@ async function stopAgentSession(key) {
|
|
|
745
545
|
session.viewers.clear();
|
|
746
546
|
session.state = "stopped";
|
|
747
547
|
session.error = null;
|
|
748
|
-
// Stop heartbeat before deregister so we don't race a final POST
|
|
749
|
-
// against AgentChattr removing the name (#391 / quadwork#250).
|
|
750
|
-
if (session.acHeartbeatHandle) {
|
|
751
|
-
stopHeartbeat(session.acHeartbeatHandle);
|
|
752
|
-
session.acHeartbeatHandle = null;
|
|
753
|
-
}
|
|
754
|
-
// Stop queue watcher (#393 / quadwork#251) — the PTY is gone,
|
|
755
|
-
// injecting into a dead term would throw on the next tick.
|
|
756
|
-
if (session.queueWatcherHandle) {
|
|
757
|
-
stopQueueWatcher(session.queueWatcherHandle);
|
|
758
|
-
session.queueWatcherHandle = null;
|
|
759
|
-
}
|
|
760
|
-
// Best-effort deregister from AgentChattr (#241) so the slot frees
|
|
761
|
-
// and the next register lands at slot 1 instead of head-2.
|
|
762
|
-
if (session.acRegistrationName && session.acServerPort) {
|
|
763
|
-
try {
|
|
764
|
-
await deregisterAgent(session.acServerPort, session.acRegistrationName);
|
|
765
|
-
} catch {
|
|
766
|
-
// best-effort — failures are non-fatal
|
|
767
|
-
}
|
|
768
|
-
if (session.projectId && session.agentId) {
|
|
769
|
-
clearPersistedAgentToken(session.projectId, session.agentId);
|
|
770
|
-
}
|
|
771
|
-
session.acRegistrationName = null;
|
|
772
|
-
session.acRegistrationToken = null;
|
|
773
|
-
}
|
|
774
|
-
// Clean up MCP auth proxy if running
|
|
775
548
|
const [projectId, agentId] = key.split("/");
|
|
776
549
|
if (projectId && agentId) stopMcpProxy(projectId, agentId);
|
|
777
550
|
}
|
|
@@ -781,487 +554,15 @@ app.get("/api/agents", (_req, res) => {
|
|
|
781
554
|
for (const [key, session] of agentSessions) {
|
|
782
555
|
agents[key] = { state: session.state, error: session.error || null };
|
|
783
556
|
}
|
|
784
|
-
for (const [pid, proc] of chattrProcesses) {
|
|
785
|
-
agents[`_agentchattr/${pid}`] = { state: proc.state, error: proc.error };
|
|
786
|
-
}
|
|
787
557
|
res.json(agents);
|
|
788
558
|
});
|
|
789
559
|
|
|
790
|
-
//
|
|
791
|
-
// before any AgentChattr restart. Defense-in-depth against
|
|
792
|
-
// destructive ops like /clear that rewrite AC's JSONL log in place
|
|
793
|
-
// — per #303 the log itself IS persistent across normal restarts,
|
|
794
|
-
// so the snapshot's job is to give the operator a point-in-time
|
|
795
|
-
// rollback if the log gets clobbered, not to prevent history loss
|
|
796
|
-
// on ordinary lifecycle events.
|
|
797
|
-
//
|
|
798
|
-
// Snapshot contents = the same envelope GET /api/project-history
|
|
799
|
-
// returns, so an operator (or a future "restore" button) can feed
|
|
800
|
-
// the file straight into POST /api/project-history for replay.
|
|
801
|
-
const HISTORY_SNAPSHOT_LIMIT = 5;
|
|
802
|
-
|
|
803
|
-
async function snapshotProjectHistory(projectId) {
|
|
804
|
-
try {
|
|
805
|
-
const snapDir = path.join(require("os").homedir(), ".quadwork", projectId, "history-snapshots");
|
|
806
|
-
ensureSecureDir(snapDir);
|
|
807
|
-
const res = await fetch(`http://127.0.0.1:${PORT}/api/project-history?project=${encodeURIComponent(projectId)}`, {
|
|
808
|
-
signal: AbortSignal.timeout(30000),
|
|
809
|
-
});
|
|
810
|
-
if (!res.ok) {
|
|
811
|
-
console.warn(`[snapshot] ${projectId} history fetch returned ${res.status}; skipping snapshot`);
|
|
812
|
-
return false;
|
|
813
|
-
}
|
|
814
|
-
const text = await res.text();
|
|
815
|
-
const stamp = new Date().toISOString().replace(/[:.]/g, "-");
|
|
816
|
-
const outPath = path.join(snapDir, `${stamp}.json`);
|
|
817
|
-
fs.writeFileSync(outPath, text);
|
|
818
|
-
console.log(`[snapshot] ${projectId} → ${outPath}`);
|
|
819
|
-
// Prune to the newest HISTORY_SNAPSHOT_LIMIT files so the
|
|
820
|
-
// directory can't grow unbounded across weeks of restarts.
|
|
821
|
-
try {
|
|
822
|
-
const entries = fs.readdirSync(snapDir)
|
|
823
|
-
.filter((f) => f.endsWith(".json"))
|
|
824
|
-
.map((f) => ({ f, t: fs.statSync(path.join(snapDir, f)).mtimeMs }))
|
|
825
|
-
.sort((a, b) => b.t - a.t);
|
|
826
|
-
for (const old of entries.slice(HISTORY_SNAPSHOT_LIMIT)) {
|
|
827
|
-
try { fs.unlinkSync(path.join(snapDir, old.f)); } catch {}
|
|
828
|
-
}
|
|
829
|
-
} catch {
|
|
830
|
-
// non-fatal — stale snapshots just linger
|
|
831
|
-
}
|
|
832
|
-
return true;
|
|
833
|
-
} catch (err) {
|
|
834
|
-
console.warn(`[snapshot] ${projectId} snapshot failed: ${err.message || err}`);
|
|
835
|
-
return false;
|
|
836
|
-
}
|
|
837
|
-
}
|
|
838
|
-
|
|
839
|
-
// Per-project AgentChattr lifecycle: /api/agentchattr/:project/:action
|
|
840
|
-
// Backward compat: /api/agentchattr/:action uses first project
|
|
841
|
-
async function handleAgentChattr(req, res) {
|
|
842
|
-
let projectId, action;
|
|
843
|
-
if (req.params.action) {
|
|
844
|
-
projectId = req.params.projectOrAction;
|
|
845
|
-
action = req.params.action;
|
|
846
|
-
} else {
|
|
847
|
-
// Backward compat: single-param = action, use first project
|
|
848
|
-
action = req.params.projectOrAction;
|
|
849
|
-
const cfg = readConfig();
|
|
850
|
-
projectId = cfg.projects?.[0]?.id || "_default";
|
|
851
|
-
}
|
|
852
|
-
|
|
853
|
-
const { url: chattrUrl } = resolveProjectChattr(projectId);
|
|
854
|
-
const chattrPort = new URL(chattrUrl).port || "8300";
|
|
855
|
-
|
|
856
|
-
// Find per-project config.toml. Phase 2E / #181: prefer the
|
|
857
|
-
// per-project AgentChattr clone ROOT (where the web/CLI wizards now
|
|
858
|
-
// write it as of #184/#185 — and where run.py actually reads it from).
|
|
859
|
-
// Fall back to the legacy <working_dir>/agentchattr/config.toml for
|
|
860
|
-
// v1 setups that haven't been migrated yet (#188).
|
|
861
|
-
const cfg = readConfig();
|
|
862
|
-
const project = cfg.projects?.find((p) => p.id === projectId);
|
|
863
|
-
const { dir: resolvedAcDir } = resolveProjectChattr(projectId);
|
|
864
|
-
let projectConfigToml = null;
|
|
865
|
-
if (resolvedAcDir && fs.existsSync(path.join(resolvedAcDir, "config.toml"))) {
|
|
866
|
-
projectConfigToml = path.join(resolvedAcDir, "config.toml");
|
|
867
|
-
} else if (project?.working_dir) {
|
|
868
|
-
const legacyToml = path.join(project.working_dir, "agentchattr", "config.toml");
|
|
869
|
-
if (fs.existsSync(legacyToml)) projectConfigToml = legacyToml;
|
|
870
|
-
}
|
|
871
|
-
|
|
872
|
-
function getProc() {
|
|
873
|
-
return chattrProcesses.get(projectId) || { process: null, state: "stopped", error: null };
|
|
874
|
-
}
|
|
875
|
-
function setProc(val) {
|
|
876
|
-
chattrProcesses.set(projectId, val);
|
|
877
|
-
}
|
|
878
|
-
|
|
879
|
-
function regenerateConfigToml() {
|
|
880
|
-
// If project has a config.toml, update the port to match current config
|
|
881
|
-
if (!projectConfigToml || !fs.existsSync(projectConfigToml)) return;
|
|
882
|
-
try {
|
|
883
|
-
let content = fs.readFileSync(projectConfigToml, "utf-8");
|
|
884
|
-
content = content.replace(/^port = \d+/m, `port = ${chattrPort}`);
|
|
885
|
-
writeSecureFile(projectConfigToml, content);
|
|
886
|
-
} catch {}
|
|
887
|
-
}
|
|
888
|
-
|
|
889
|
-
async function spawnChattr() {
|
|
890
|
-
// Sync config.toml port before starting
|
|
891
|
-
regenerateConfigToml();
|
|
892
|
-
|
|
893
|
-
// Use project config.toml if available (isolated data dir + ports), otherwise fall back to --port
|
|
894
|
-
const extraArgs = (projectConfigToml && fs.existsSync(projectConfigToml))
|
|
895
|
-
? []
|
|
896
|
-
: ["--port", chattrPort];
|
|
897
|
-
|
|
898
|
-
// Resolve AgentChattr from its cloned directory
|
|
899
|
-
const { dir: acDir } = resolveProjectChattr(projectId);
|
|
900
|
-
// #394: backfill sender-overflow CSS/JS patch on every spawn so
|
|
901
|
-
// existing installs receive the fix without manual update.
|
|
902
|
-
patchAgentchattrCss(acDir);
|
|
903
|
-
const acSpawn = resolveChattrSpawn(acDir);
|
|
904
|
-
if (!acSpawn) {
|
|
905
|
-
setProc({ process: null, state: "error", error: `AgentChattr not installed. Clone it: git clone https://github.com/bcurts/agentchattr.git ${acDir}` });
|
|
906
|
-
return null;
|
|
907
|
-
}
|
|
908
|
-
|
|
909
|
-
// #569: redirect AC stdout/stderr to a log file so operators can
|
|
910
|
-
// diagnose startup failures. Append mode preserves restart history.
|
|
911
|
-
const acLogDir = path.join(os.homedir(), ".quadwork", projectId);
|
|
912
|
-
try { fs.mkdirSync(acLogDir, { recursive: true, mode: 0o700 }); } catch {}
|
|
913
|
-
const acLogPath = path.join(acLogDir, "agentchattr.log");
|
|
914
|
-
const acLogFd = fs.openSync(acLogPath, "a");
|
|
915
|
-
const child = spawn(acSpawn.command, [...acSpawn.args, ...extraArgs], {
|
|
916
|
-
cwd: acSpawn.cwd,
|
|
917
|
-
env: process.env,
|
|
918
|
-
stdio: ["ignore", acLogFd, acLogFd],
|
|
919
|
-
detached: true,
|
|
920
|
-
});
|
|
921
|
-
|
|
922
|
-
// Close our copy of the log fd — child inherits its own copy.
|
|
923
|
-
fs.closeSync(acLogFd);
|
|
924
|
-
|
|
925
|
-
// If pid is undefined, spawn failed
|
|
926
|
-
if (!child.pid) {
|
|
927
|
-
setProc({ process: null, state: "error", error: "Failed to start AgentChattr — check that Python venv is set up in " + acDir + ". Log: " + acLogPath });
|
|
928
|
-
child.on("error", () => {});
|
|
929
|
-
return null;
|
|
930
|
-
}
|
|
931
|
-
|
|
932
|
-
child.unref();
|
|
933
|
-
child.on("error", (err) => {
|
|
934
|
-
setProc({ process: null, state: "error", error: err.message });
|
|
935
|
-
});
|
|
936
|
-
child.on("exit", (code) => {
|
|
937
|
-
const cur = getProc();
|
|
938
|
-
if (cur.process === child) {
|
|
939
|
-
setProc({ process: null, state: "stopped", error: code ? `exit:${code}` : null });
|
|
940
|
-
}
|
|
941
|
-
});
|
|
942
|
-
// #580: wait for AC to actually bind the port before declaring success.
|
|
943
|
-
// On fast-start installs this resolves in 1-2s; prevents false-down
|
|
944
|
-
// detection on slow starts that triggered ghost agent cascades.
|
|
945
|
-
const ready = await waitForAgentChattrReady(chattrPort, 30000);
|
|
946
|
-
if (ready) {
|
|
947
|
-
setProc({ process: child, state: "running", error: null, runningSince: Date.now() });
|
|
948
|
-
return child;
|
|
949
|
-
} else {
|
|
950
|
-
setProc({ process: child, state: "error", error: "AgentChattr did not become ready within 30s" });
|
|
951
|
-
return null;
|
|
952
|
-
}
|
|
953
|
-
}
|
|
954
|
-
|
|
955
|
-
// #386: Kill any process listening on the AC port. Handles orphaned
|
|
956
|
-
// processes that survive QuadWork restarts (detached + unref'd spawns
|
|
957
|
-
// lose their tracked reference when the Node process recycles).
|
|
958
|
-
function killProcessOnPort(port, signal = "SIGTERM") {
|
|
959
|
-
try {
|
|
960
|
-
const pids = execFileSync("lsof", ["-ti", `TCP:${port}`, "-sTCP:LISTEN"], {
|
|
961
|
-
encoding: "utf-8",
|
|
962
|
-
timeout: 5000,
|
|
963
|
-
stdio: ["pipe", "pipe", "pipe"],
|
|
964
|
-
}).trim();
|
|
965
|
-
if (!pids) return;
|
|
966
|
-
for (const line of pids.split("\n")) {
|
|
967
|
-
const pid = parseInt(line, 10);
|
|
968
|
-
if (pid > 0) {
|
|
969
|
-
try { process.kill(pid, signal); } catch {}
|
|
970
|
-
}
|
|
971
|
-
}
|
|
972
|
-
} catch {
|
|
973
|
-
// lsof exits non-zero when no matching process — expected
|
|
974
|
-
}
|
|
975
|
-
}
|
|
976
|
-
|
|
977
|
-
// #386: Poll until the port is free or timeout expires.
|
|
978
|
-
function waitForPortFree(port, timeoutMs = 3000) {
|
|
979
|
-
const start = Date.now();
|
|
980
|
-
return new Promise((resolve) => {
|
|
981
|
-
function check() {
|
|
982
|
-
try {
|
|
983
|
-
execFileSync("lsof", ["-ti", `TCP:${port}`, "-sTCP:LISTEN"], {
|
|
984
|
-
encoding: "utf-8",
|
|
985
|
-
timeout: 2000,
|
|
986
|
-
stdio: ["pipe", "pipe", "pipe"],
|
|
987
|
-
});
|
|
988
|
-
// Still occupied — retry if within budget
|
|
989
|
-
if (Date.now() - start < timeoutMs) {
|
|
990
|
-
setTimeout(check, 200);
|
|
991
|
-
} else {
|
|
992
|
-
resolve(false);
|
|
993
|
-
}
|
|
994
|
-
} catch {
|
|
995
|
-
// lsof found nothing — port is free
|
|
996
|
-
resolve(true);
|
|
997
|
-
}
|
|
998
|
-
}
|
|
999
|
-
check();
|
|
1000
|
-
});
|
|
1001
|
-
}
|
|
1002
|
-
|
|
1003
|
-
if (action === "start") {
|
|
1004
|
-
const proc = getProc();
|
|
1005
|
-
if (proc.state === "running" && proc.process) {
|
|
1006
|
-
return res.json({ ok: true, state: "running", message: "Already running" });
|
|
1007
|
-
}
|
|
1008
|
-
// #401: validate AgentChattr is installed BEFORE killing anything on
|
|
1009
|
-
// the port. Without this guard, clicking Start when AC is missing
|
|
1010
|
-
// kills an unrelated process then fails with "not installed".
|
|
1011
|
-
const { dir: acDir } = resolveProjectChattr(projectId);
|
|
1012
|
-
const acSpawn = resolveChattrSpawn(acDir);
|
|
1013
|
-
if (!acSpawn) {
|
|
1014
|
-
const errMsg = `AgentChattr not installed. Clone it: git clone https://github.com/bcurts/agentchattr.git ${acDir}`;
|
|
1015
|
-
setProc({ process: null, state: "error", error: errMsg });
|
|
1016
|
-
return res.status(500).json({ ok: false, state: "error", error: errMsg });
|
|
1017
|
-
}
|
|
1018
|
-
|
|
1019
|
-
// #393: kill any orphaned process on the port before spawning
|
|
1020
|
-
// (same pattern as restart/stop from #386).
|
|
1021
|
-
killProcessOnPort(chattrPort);
|
|
1022
|
-
const portFree = await waitForPortFree(chattrPort, 3000);
|
|
1023
|
-
if (!portFree) {
|
|
1024
|
-
console.warn(`[agentchattr] ${projectId} port ${chattrPort} still occupied after 3s — spawning anyway`);
|
|
1025
|
-
}
|
|
1026
|
-
try {
|
|
1027
|
-
const child = await spawnChattr();
|
|
1028
|
-
if (!child) {
|
|
1029
|
-
const errProc = getProc();
|
|
1030
|
-
return res.status(500).json({ ok: false, state: "error", error: errProc.error || "Failed to start AgentChattr" });
|
|
1031
|
-
}
|
|
1032
|
-
// Sync token after AgentChattr starts (it generates its own)
|
|
1033
|
-
setTimeout(() => syncChattrToken(projectId), 2000);
|
|
1034
|
-
res.json({ ok: true, state: "running", pid: child.pid });
|
|
1035
|
-
} catch (err) {
|
|
1036
|
-
setProc({ process: null, state: "error", error: err.message });
|
|
1037
|
-
res.status(500).json({ ok: false, state: "error", error: err.message });
|
|
1038
|
-
}
|
|
1039
|
-
} else if (action === "stop") {
|
|
1040
|
-
const proc = getProc();
|
|
1041
|
-
if (proc.process) {
|
|
1042
|
-
try { proc.process.kill("SIGTERM"); } catch {}
|
|
1043
|
-
}
|
|
1044
|
-
// #386: also kill any orphaned process holding the port
|
|
1045
|
-
killProcessOnPort(chattrPort);
|
|
1046
|
-
setProc({ process: null, state: "stopped", error: null });
|
|
1047
|
-
res.json({ ok: true, state: "stopped" });
|
|
1048
|
-
} else if (action === "restart") {
|
|
1049
|
-
// #424 / quadwork#304: snapshot history before killing the
|
|
1050
|
-
// process. Best-effort and non-blocking-on-failure so a flaky
|
|
1051
|
-
// snapshot doesn't leave the operator unable to restart AC.
|
|
1052
|
-
await snapshotProjectHistory(projectId).catch(() => {});
|
|
1053
|
-
// #424 / quadwork#304 Phase 3: latch the opt-in BEFORE the
|
|
1054
|
-
// spawn so a restart that itself clears the flag can't starve
|
|
1055
|
-
// the auto-restore. We capture the snapshot filename we just
|
|
1056
|
-
// wrote + the project's auto_restore_after_restart flag and
|
|
1057
|
-
// replay it in the post-spawn tick below if both are set.
|
|
1058
|
-
const preRestartCfg = readConfig();
|
|
1059
|
-
const preRestartProject = preRestartCfg.projects?.find((p) => p.id === projectId);
|
|
1060
|
-
const shouldAutoRestore = !!(preRestartProject && preRestartProject.auto_restore_after_restart);
|
|
1061
|
-
const proc = getProc();
|
|
1062
|
-
if (proc.process) {
|
|
1063
|
-
console.log(`[agentchattr] ${projectId} restart: killing AC (PID: ${proc.process.pid})`);
|
|
1064
|
-
try { proc.process.kill("SIGTERM"); } catch {}
|
|
1065
|
-
}
|
|
1066
|
-
// #386: also kill any orphaned process holding the port (handles
|
|
1067
|
-
// detached processes that survived a QuadWork restart).
|
|
1068
|
-
killProcessOnPort(chattrPort);
|
|
1069
|
-
setProc({ process: null, state: "stopped", error: null });
|
|
1070
|
-
// #582: wait up to 5s for the port to be free, then SIGKILL
|
|
1071
|
-
// any remaining process as a fallback before spawning.
|
|
1072
|
-
let portFree = await waitForPortFree(chattrPort, 5000);
|
|
1073
|
-
if (!portFree) {
|
|
1074
|
-
console.warn(`[agentchattr] ${projectId} port ${chattrPort} still occupied after 5s — sending SIGKILL`);
|
|
1075
|
-
killProcessOnPort(chattrPort, "SIGKILL");
|
|
1076
|
-
portFree = await waitForPortFree(chattrPort, 3000);
|
|
1077
|
-
if (!portFree) {
|
|
1078
|
-
const portErr = `Port ${chattrPort} still occupied — cannot restart`;
|
|
1079
|
-
console.error(`[agentchattr] ${projectId} ${portErr}`);
|
|
1080
|
-
setProc({ process: null, state: "error", error: portErr });
|
|
1081
|
-
return res.status(500).json({ ok: false, state: "error", error: portErr });
|
|
1082
|
-
}
|
|
1083
|
-
}
|
|
1084
|
-
console.log(`[agentchattr] ${projectId} restart: port ${chattrPort} is free, spawning AC`);
|
|
1085
|
-
try {
|
|
1086
|
-
const child = await spawnChattr();
|
|
1087
|
-
if (!child) {
|
|
1088
|
-
const errProc = getProc();
|
|
1089
|
-
console.error(`[agentchattr] ${projectId} restart: spawnChattr failed — ${errProc.error || "unknown error"}`);
|
|
1090
|
-
return res.status(500).json({ ok: false, state: "error", error: errProc.error || "Failed to start AgentChattr" });
|
|
1091
|
-
}
|
|
1092
|
-
console.log(`[agentchattr] ${projectId} restart: AC spawned and ready (PID: ${child.pid})`);
|
|
1093
|
-
// Sync token after AgentChattr restarts
|
|
1094
|
-
setTimeout(() => syncChattrToken(projectId), 2000);
|
|
1095
|
-
// #424 / quadwork#304 Phase 3: optional auto-restore.
|
|
1096
|
-
// Fire the restore 3s after spawn so AC's ws is ready.
|
|
1097
|
-
// Best-effort: never blocks the restart response or
|
|
1098
|
-
// rolls back on error.
|
|
1099
|
-
if (shouldAutoRestore) {
|
|
1100
|
-
setTimeout(async () => {
|
|
1101
|
-
try {
|
|
1102
|
-
const snapDir = path.join(require("os").homedir(), ".quadwork", projectId, "history-snapshots");
|
|
1103
|
-
if (!fs.existsSync(snapDir)) return;
|
|
1104
|
-
const newest = fs.readdirSync(snapDir)
|
|
1105
|
-
.filter((f) => f.endsWith(".json"))
|
|
1106
|
-
.map((f) => ({ f, t: fs.statSync(path.join(snapDir, f)).mtimeMs }))
|
|
1107
|
-
.sort((a, b) => b.t - a.t)[0];
|
|
1108
|
-
if (!newest) return;
|
|
1109
|
-
const r = await fetch(`http://127.0.0.1:${PORT}/api/project-history/restore?project=${encodeURIComponent(projectId)}&name=${encodeURIComponent(newest.f)}`, {
|
|
1110
|
-
method: "POST",
|
|
1111
|
-
});
|
|
1112
|
-
if (r.ok) console.log(`[snapshot] ${projectId} auto-restored ${newest.f}`);
|
|
1113
|
-
else console.warn(`[snapshot] ${projectId} auto-restore returned ${r.status}`);
|
|
1114
|
-
} catch (err) {
|
|
1115
|
-
console.warn(`[snapshot] ${projectId} auto-restore failed: ${err.message || err}`);
|
|
1116
|
-
}
|
|
1117
|
-
}, 3000);
|
|
1118
|
-
}
|
|
1119
|
-
res.json({ ok: true, state: "running", pid: child.pid });
|
|
1120
|
-
// #447: auto-reset all agents after AC restart so they get
|
|
1121
|
-
// fresh MCP tokens. #581: mark reset as scheduled immediately
|
|
1122
|
-
// so the health monitor skips its own reset while ours is in-flight.
|
|
1123
|
-
// #579: also skip if a reset already succeeded within the last 30s.
|
|
1124
|
-
// Multiple restart sources (bridge-migrate, health monitor, dashboard)
|
|
1125
|
-
// can fire in rapid succession — only the first should trigger a reset.
|
|
1126
|
-
const existingReset = _acHealth.resetState.get(projectId);
|
|
1127
|
-
const resetRecentlyDone = existingReset &&
|
|
1128
|
-
(existingReset.status === "succeeded" || existingReset.status === "scheduled") &&
|
|
1129
|
-
Date.now() - existingReset.timestamp < 30_000;
|
|
1130
|
-
if (resetRecentlyDone) {
|
|
1131
|
-
console.log(`[agentchattr] ${projectId} skipping auto-reset — one already ${existingReset.status} ${Math.round((Date.now() - existingReset.timestamp) / 1000)}s ago`);
|
|
1132
|
-
} else {
|
|
1133
|
-
_acHealth.resetState.set(projectId, { status: "scheduled", timestamp: Date.now() });
|
|
1134
|
-
}
|
|
1135
|
-
if (!resetRecentlyDone) setTimeout(async () => {
|
|
1136
|
-
try {
|
|
1137
|
-
const resetResp = await fetch(`http://127.0.0.1:${PORT}/api/agents/${encodeURIComponent(projectId)}/reset`, {
|
|
1138
|
-
method: "POST",
|
|
1139
|
-
});
|
|
1140
|
-
if (resetResp.ok) {
|
|
1141
|
-
const resetData = await resetResp.json();
|
|
1142
|
-
_acHealth.resetState.set(projectId, { status: "succeeded", timestamp: Date.now() });
|
|
1143
|
-
console.log(`[agentchattr] ${projectId} auto-reset ${resetData.restarted} agent(s) after AC restart`);
|
|
1144
|
-
} else {
|
|
1145
|
-
_acHealth.resetState.set(projectId, { status: "failed", timestamp: Date.now() });
|
|
1146
|
-
console.warn(`[agentchattr] ${projectId} agent reset after AC restart returned ${resetResp.status}`);
|
|
1147
|
-
}
|
|
1148
|
-
} catch (err) {
|
|
1149
|
-
_acHealth.resetState.set(projectId, { status: "failed", timestamp: Date.now() });
|
|
1150
|
-
console.warn(`[agentchattr] ${projectId} agent reset after AC restart failed: ${err.message || err}`);
|
|
1151
|
-
}
|
|
1152
|
-
}, 2000);
|
|
1153
|
-
} catch (err) {
|
|
1154
|
-
setProc({ process: null, state: "error", error: err.message });
|
|
1155
|
-
res.status(500).json({ ok: false, state: "error", error: err.message });
|
|
1156
|
-
}
|
|
1157
|
-
} else if (action === "update") {
|
|
1158
|
-
// Update AgentChattr: stop → git pull → pip install → restart
|
|
1159
|
-
const { dir: acDir } = resolveProjectChattr(projectId);
|
|
1160
|
-
if (!acDir || !fs.existsSync(path.join(acDir, "run.py"))) {
|
|
1161
|
-
return res.status(400).json({ ok: false, error: "AgentChattr not installed at " + (acDir || "unknown") });
|
|
1162
|
-
}
|
|
1163
|
-
try {
|
|
1164
|
-
// Stop running process before pulling. Snapshot first so a
|
|
1165
|
-
// botched git pull can still be rolled back from disk.
|
|
1166
|
-
// #424 / quadwork#304: best-effort.
|
|
1167
|
-
await snapshotProjectHistory(projectId).catch(() => {});
|
|
1168
|
-
// Latch the auto-restore opt-in BEFORE stop, same as the
|
|
1169
|
-
// explicit restart branch above — a config mutation during
|
|
1170
|
-
// the git pull shouldn't starve the replay.
|
|
1171
|
-
const updateCfgPre = readConfig();
|
|
1172
|
-
const updateProjectPre = updateCfgPre.projects?.find((p) => p.id === projectId);
|
|
1173
|
-
const updateShouldAutoRestore = !!(updateProjectPre && updateProjectPre.auto_restore_after_restart);
|
|
1174
|
-
const proc = getProc();
|
|
1175
|
-
const wasRunning = proc.process && proc.state === "running";
|
|
1176
|
-
if (wasRunning) {
|
|
1177
|
-
try { proc.process.kill("SIGTERM"); } catch {}
|
|
1178
|
-
}
|
|
1179
|
-
// #386: kill orphaned processes on the port too
|
|
1180
|
-
killProcessOnPort(chattrPort);
|
|
1181
|
-
if (wasRunning) {
|
|
1182
|
-
setProc({ process: null, state: "stopped", error: null });
|
|
1183
|
-
// Wait for the port to be released before pulling/restarting
|
|
1184
|
-
await waitForPortFree(chattrPort, 3000);
|
|
1185
|
-
}
|
|
1186
|
-
|
|
1187
|
-
const pullResult = execFileSync("git", ["pull"], { cwd: acDir, encoding: "utf-8", timeout: 30000, stdio: "pipe" }).trim();
|
|
1188
|
-
// #388: re-apply sender-overflow CSS patch after git pull
|
|
1189
|
-
patchAgentchattrCss(acDir);
|
|
1190
|
-
// #629: re-apply crash timeout patch after git pull (pull may revert app.py)
|
|
1191
|
-
patchCrashTimeout(acDir);
|
|
1192
|
-
const venvPython = path.join(acDir, ".venv", "bin", "python");
|
|
1193
|
-
let pipResult = "";
|
|
1194
|
-
const reqFile = path.join(acDir, "requirements.txt");
|
|
1195
|
-
if (fs.existsSync(venvPython) && fs.existsSync(reqFile)) {
|
|
1196
|
-
pipResult = execFileSync(venvPython, ["-m", "pip", "install", "-r", "requirements.txt"], { cwd: acDir, encoding: "utf-8", timeout: 120000, stdio: "pipe" }).trim();
|
|
1197
|
-
}
|
|
1198
|
-
|
|
1199
|
-
// Restart if it was running before the update
|
|
1200
|
-
let restarted = false;
|
|
1201
|
-
if (wasRunning) {
|
|
1202
|
-
const child = await spawnChattr();
|
|
1203
|
-
restarted = !!child;
|
|
1204
|
-
if (child) {
|
|
1205
|
-
setTimeout(() => syncChattrToken(projectId).catch(() => {}), 2000);
|
|
1206
|
-
// #424 / quadwork#304 Phase 3: auto-restore after an
|
|
1207
|
-
// update-triggered restart too (t2a re-review). Same
|
|
1208
|
-
//3s wait + newest-snapshot-by-mtime path as the explicit
|
|
1209
|
-
// restart branch, using the pre-stop latched opt-in.
|
|
1210
|
-
if (updateShouldAutoRestore) {
|
|
1211
|
-
setTimeout(async () => {
|
|
1212
|
-
try {
|
|
1213
|
-
const snapDir = path.join(require("os").homedir(), ".quadwork", projectId, "history-snapshots");
|
|
1214
|
-
if (!fs.existsSync(snapDir)) return;
|
|
1215
|
-
const newest = fs.readdirSync(snapDir)
|
|
1216
|
-
.filter((f) => f.endsWith(".json"))
|
|
1217
|
-
.map((f) => ({ f, t: fs.statSync(path.join(snapDir, f)).mtimeMs }))
|
|
1218
|
-
.sort((a, b) => b.t - a.t)[0];
|
|
1219
|
-
if (!newest) return;
|
|
1220
|
-
const r = await fetch(`http://127.0.0.1:${PORT}/api/project-history/restore?project=${encodeURIComponent(projectId)}&name=${encodeURIComponent(newest.f)}`, {
|
|
1221
|
-
method: "POST",
|
|
1222
|
-
});
|
|
1223
|
-
if (r.ok) console.log(`[snapshot] ${projectId} auto-restored ${newest.f} after update`);
|
|
1224
|
-
else console.warn(`[snapshot] ${projectId} post-update auto-restore returned ${r.status}`);
|
|
1225
|
-
} catch (err) {
|
|
1226
|
-
console.warn(`[snapshot] ${projectId} post-update auto-restore failed: ${err.message || err}`);
|
|
1227
|
-
}
|
|
1228
|
-
}, 3000);
|
|
1229
|
-
}
|
|
1230
|
-
}
|
|
1231
|
-
}
|
|
560
|
+
// Per-project AgentChattr lifecycle (removed in #723 — AC stack deleted)
|
|
1232
561
|
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
}
|
|
1237
|
-
} else {
|
|
1238
|
-
res.status(400).json({ error: "Unknown action" });
|
|
1239
|
-
}
|
|
562
|
+
// Stub endpoints — return 410 Gone so dashboard code degrades gracefully
|
|
563
|
+
async function handleAgentChattr(_req, res) {
|
|
564
|
+
return res.status(410).json({ ok: false, error: "AgentChattr removed in Phase 3" });
|
|
1240
565
|
}
|
|
1241
|
-
app.post("/api/agentchattr/:projectOrAction/:action", handleAgentChattr);
|
|
1242
|
-
app.post("/api/agentchattr/:projectOrAction", handleAgentChattr);
|
|
1243
|
-
|
|
1244
|
-
// --- Reset agents: deregister all registered slots ---
|
|
1245
|
-
// AgentChattr doesn't expose staleness metadata, so this clears all slots.
|
|
1246
|
-
// Agents' wrapper heartbeat will auto-re-register with clean names.
|
|
1247
|
-
|
|
1248
|
-
// #416: AC health status endpoint — returns the health monitor state
|
|
1249
|
-
// for a project so the dashboard can surface auto-restart events.
|
|
1250
|
-
app.get("/api/agentchattr/:project/health", (req, res) => {
|
|
1251
|
-
const projectId = req.params.project;
|
|
1252
|
-
const proc = chattrProcesses.get(projectId);
|
|
1253
|
-
const health = _acHealth.state.get(projectId) || { lastRestart: 0, consecutiveFailures: 0 };
|
|
1254
|
-
res.json({
|
|
1255
|
-
state: proc?.state || "unknown",
|
|
1256
|
-
error: proc?.error || null,
|
|
1257
|
-
autoRestart: {
|
|
1258
|
-
lastRestart: health.lastRestart || null,
|
|
1259
|
-
consecutiveFailures: health.consecutiveFailures,
|
|
1260
|
-
gaveUp: health.consecutiveFailures >= 3,
|
|
1261
|
-
},
|
|
1262
|
-
});
|
|
1263
|
-
});
|
|
1264
|
-
|
|
1265
566
|
app.post("/api/agents/:project/reset", async (req, res) => {
|
|
1266
567
|
const projectId = req.params.project;
|
|
1267
568
|
|
|
@@ -1293,6 +594,8 @@ app.post("/api/agents/:project/reset", async (req, res) => {
|
|
|
1293
594
|
|
|
1294
595
|
// Stop all agents first (handles deregistration best-effort)
|
|
1295
596
|
for (const agentId of allAgentIds) {
|
|
597
|
+
const s = agentSessions.get(`${projectId}/${agentId}`);
|
|
598
|
+
if (s) s._suppressLifecycleMsg = true;
|
|
1296
599
|
await stopAgentSession(`${projectId}/${agentId}`);
|
|
1297
600
|
}
|
|
1298
601
|
|
|
@@ -1300,8 +603,9 @@ app.post("/api/agents/:project/reset", async (req, res) => {
|
|
|
1300
603
|
let restarted = 0;
|
|
1301
604
|
const errors = [];
|
|
1302
605
|
for (const agentId of allAgentIds) {
|
|
1303
|
-
const result = await spawnAgentPty(projectId, agentId);
|
|
606
|
+
const result = await spawnAgentPty(projectId, agentId, { suppressLifecycleMsg: true });
|
|
1304
607
|
if (result.ok) {
|
|
608
|
+
emitSystemMessage(projectId, `${agentId} restarted`);
|
|
1305
609
|
restarted++;
|
|
1306
610
|
} else {
|
|
1307
611
|
errors.push(`${agentId}: ${result.error}`);
|
|
@@ -1319,7 +623,7 @@ app.post("/api/agents/:project/reset", async (req, res) => {
|
|
|
1319
623
|
}
|
|
1320
624
|
});
|
|
1321
625
|
|
|
1322
|
-
// --- Full Reset: restart all
|
|
626
|
+
// --- Full Reset: restart all agents across all projects (#657) ---
|
|
1323
627
|
|
|
1324
628
|
app.post("/api/full-reset", async (_req, res) => {
|
|
1325
629
|
const start = Date.now();
|
|
@@ -1328,42 +632,21 @@ app.post("/api/full-reset", async (_req, res) => {
|
|
|
1328
632
|
const cfg = readConfig();
|
|
1329
633
|
const projects = (cfg.projects || []).filter((p) => !p.archived);
|
|
1330
634
|
|
|
1331
|
-
// 1. Stop all agent sessions
|
|
1332
635
|
console.log("[full-reset] stopping all agent sessions...");
|
|
1333
636
|
const sessionKeys = [...agentSessions.keys()];
|
|
1334
637
|
for (const key of sessionKeys) {
|
|
1335
638
|
await stopAgentSession(key);
|
|
1336
639
|
}
|
|
1337
640
|
|
|
1338
|
-
// 2. Stop Butler if running
|
|
1339
641
|
console.log("[full-reset] stopping Butler...");
|
|
1340
642
|
stopButlerPty();
|
|
1341
643
|
|
|
1342
|
-
// 3. Re-run startup migrations
|
|
1343
644
|
console.log("[full-reset] running startup migrations...");
|
|
1344
645
|
runStartupMigrations(cfg);
|
|
1345
646
|
|
|
1346
|
-
// 4. Restart each project's AC + agents
|
|
1347
647
|
let totalAgents = 0;
|
|
1348
648
|
const errors = [];
|
|
1349
649
|
for (const project of projects) {
|
|
1350
|
-
console.log(`[full-reset] restarting AC for ${project.id}...`);
|
|
1351
|
-
// Pre-mark reset as scheduled so AC restart's auto-reset timer is suppressed
|
|
1352
|
-
_acHealth.resetState.set(project.id, { status: "scheduled", timestamp: Date.now() });
|
|
1353
|
-
try {
|
|
1354
|
-
const acResp = await fetch(`http://127.0.0.1:${PORT}/api/agentchattr/${encodeURIComponent(project.id)}/restart`, {
|
|
1355
|
-
method: "POST",
|
|
1356
|
-
});
|
|
1357
|
-
if (!acResp.ok) {
|
|
1358
|
-
const errData = await acResp.json().catch(() => ({}));
|
|
1359
|
-
errors.push(`${project.id}: AC restart failed — ${errData.error || acResp.status}`);
|
|
1360
|
-
continue;
|
|
1361
|
-
}
|
|
1362
|
-
} catch (err) {
|
|
1363
|
-
errors.push(`${project.id}: AC — ${err.message}`);
|
|
1364
|
-
continue;
|
|
1365
|
-
}
|
|
1366
|
-
// Explicitly reset agents and await result
|
|
1367
650
|
try {
|
|
1368
651
|
const resetResp = await fetch(`http://127.0.0.1:${PORT}/api/agents/${encodeURIComponent(project.id)}/reset`, {
|
|
1369
652
|
method: "POST",
|
|
@@ -1379,7 +662,6 @@ app.post("/api/full-reset", async (_req, res) => {
|
|
|
1379
662
|
}
|
|
1380
663
|
}
|
|
1381
664
|
|
|
1382
|
-
// 5. Restart Butler if enabled
|
|
1383
665
|
if (cfg.butler?.enabled) {
|
|
1384
666
|
console.log("[full-reset] restarting Butler...");
|
|
1385
667
|
const result = spawnButlerPty();
|
|
@@ -1437,10 +719,13 @@ app.post("/api/agents/:project/:agent/restart", async (req, res) => {
|
|
|
1437
719
|
|
|
1438
720
|
// #241: must await deregister before respawn so the slot frees and
|
|
1439
721
|
// the fresh register lands at slot 1 instead of head-2.
|
|
722
|
+
const existing = agentSessions.get(key);
|
|
723
|
+
if (existing) existing._suppressLifecycleMsg = true;
|
|
1440
724
|
await stopAgentSession(key);
|
|
1441
725
|
|
|
1442
|
-
const result = await spawnAgentPty(project, agent);
|
|
726
|
+
const result = await spawnAgentPty(project, agent, { suppressLifecycleMsg: true });
|
|
1443
727
|
if (result.ok) {
|
|
728
|
+
emitSystemMessage(project, `${agent} restarted`);
|
|
1444
729
|
res.json({ ok: true, state: "running", pid: result.pid });
|
|
1445
730
|
} else {
|
|
1446
731
|
res.status(500).json({ ok: false, state: "error", error: result.error });
|
|
@@ -2408,276 +1693,8 @@ setInterval(autoStopPollingTick, AUTO_STOP_POLL_INTERVAL_MS);
|
|
|
2408
1693
|
// delay is tens of seconds. Skipping projects without the opt-in
|
|
2409
1694
|
// keeps the poller cheap for single-project setups.
|
|
2410
1695
|
|
|
2411
|
-
const _loopGuardPausedState = new Map(); // projectId -> { paused: bool, scheduled: Timeout? }
|
|
2412
|
-
const LOOP_GUARD_POLL_INTERVAL_MS = 10000;
|
|
2413
|
-
|
|
2414
|
-
async function checkLoopGuardPause(project) {
|
|
2415
|
-
if (!project || !project.auto_continue_loop_guard) return;
|
|
2416
|
-
const { url: base, token: sessionToken } = resolveProjectChattr(project.id);
|
|
2417
|
-
if (!base) return;
|
|
2418
|
-
let paused = false;
|
|
2419
|
-
try {
|
|
2420
|
-
const r = await fetch(`${base}/api/status`, {
|
|
2421
|
-
headers: sessionToken ? { "x-session-token": sessionToken } : {},
|
|
2422
|
-
signal: AbortSignal.timeout(5000),
|
|
2423
|
-
});
|
|
2424
|
-
if (!r.ok) return;
|
|
2425
|
-
const data = await r.json();
|
|
2426
|
-
paused = !!(data && data.paused);
|
|
2427
|
-
} catch {
|
|
2428
|
-
return;
|
|
2429
|
-
}
|
|
2430
|
-
const state = _loopGuardPausedState.get(project.id) || { paused: false, scheduled: null };
|
|
2431
|
-
// Transition false → true: schedule an auto-continue after the delay.
|
|
2432
|
-
if (paused && !state.paused && !state.scheduled) {
|
|
2433
|
-
const delaySec = Number.isFinite(project.auto_continue_delay_sec) && project.auto_continue_delay_sec >= 5
|
|
2434
|
-
? project.auto_continue_delay_sec
|
|
2435
|
-
: 30;
|
|
2436
|
-
console.log(`[loop-guard] ${project.id} paused — auto-continue in ${delaySec}s`);
|
|
2437
|
-
state.scheduled = setTimeout(async () => {
|
|
2438
|
-
try {
|
|
2439
|
-
// Re-check the opt-in at fire time so a checkbox disable
|
|
2440
|
-
// mid-wait actually stops the auto-continue.
|
|
2441
|
-
const freshCfg = readConfig();
|
|
2442
|
-
const fresh = freshCfg.projects?.find((p) => p.id === project.id);
|
|
2443
|
-
if (!fresh || !fresh.auto_continue_loop_guard) {
|
|
2444
|
-
console.log(`[loop-guard] ${project.id} auto-continue cancelled (opt-in disabled during wait)`);
|
|
2445
|
-
} else {
|
|
2446
|
-
// Re-check the router's pause state at fire time too. The
|
|
2447
|
-
// 10s status poller may not have seen a manual operator
|
|
2448
|
-
// /continue yet when the delay window (5–9s) is shorter
|
|
2449
|
-
// than the poll interval — without this, a manual resume
|
|
2450
|
-
// inside a 5s wait would be followed by a stale auto
|
|
2451
|
-
// /continue that clobbers hop_count on an already-running
|
|
2452
|
-
// chain (router.continue_routing resets the counter
|
|
2453
|
-
// unconditionally). The re-check closes the race.
|
|
2454
|
-
let stillPaused = false;
|
|
2455
|
-
try {
|
|
2456
|
-
const { url: freshBase, token: freshToken } = resolveProjectChattr(project.id);
|
|
2457
|
-
if (freshBase) {
|
|
2458
|
-
const sr = await fetch(`${freshBase}/api/status`, {
|
|
2459
|
-
headers: freshToken ? { "x-session-token": freshToken } : {},
|
|
2460
|
-
signal: AbortSignal.timeout(5000),
|
|
2461
|
-
});
|
|
2462
|
-
if (sr.ok) {
|
|
2463
|
-
const sd = await sr.json();
|
|
2464
|
-
stillPaused = !!(sd && sd.paused);
|
|
2465
|
-
}
|
|
2466
|
-
}
|
|
2467
|
-
} catch {
|
|
2468
|
-
// Status re-check failed — fall back to "don't fire".
|
|
2469
|
-
// Stuck pause will still be caught on the next 10s tick.
|
|
2470
|
-
}
|
|
2471
|
-
if (!stillPaused) {
|
|
2472
|
-
console.log(`[loop-guard] ${project.id} auto-continue cancelled (router already resumed)`);
|
|
2473
|
-
} else {
|
|
2474
|
-
const res = await fetch(`http://127.0.0.1:${PORT}/api/chat?project=${encodeURIComponent(project.id)}`, {
|
|
2475
|
-
method: "POST",
|
|
2476
|
-
headers: { "Content-Type": "application/json" },
|
|
2477
|
-
body: JSON.stringify({ text: "/continue", channel: "general" }),
|
|
2478
|
-
});
|
|
2479
|
-
if (res.ok) console.log(`[loop-guard] ${project.id} auto-continued`);
|
|
2480
|
-
else console.warn(`[loop-guard] ${project.id} auto-continue POST returned ${res.status}`);
|
|
2481
|
-
}
|
|
2482
|
-
}
|
|
2483
|
-
} catch (err) {
|
|
2484
|
-
console.warn(`[loop-guard] ${project.id} auto-continue failed: ${err.message || err}`);
|
|
2485
|
-
}
|
|
2486
|
-
const s2 = _loopGuardPausedState.get(project.id);
|
|
2487
|
-
if (s2) s2.scheduled = null;
|
|
2488
|
-
}, delaySec * 1000);
|
|
2489
|
-
}
|
|
2490
|
-
// Transition true → false: clear any pending timer.
|
|
2491
|
-
if (!paused && state.paused && state.scheduled) {
|
|
2492
|
-
clearTimeout(state.scheduled);
|
|
2493
|
-
state.scheduled = null;
|
|
2494
|
-
}
|
|
2495
|
-
state.paused = paused;
|
|
2496
|
-
_loopGuardPausedState.set(project.id, state);
|
|
2497
|
-
}
|
|
2498
|
-
|
|
2499
|
-
function runLoopGuardPollingTick() {
|
|
2500
|
-
try {
|
|
2501
|
-
const cfg = readConfig();
|
|
2502
|
-
for (const p of (cfg.projects || [])) {
|
|
2503
|
-
if (p && p.auto_continue_loop_guard) checkLoopGuardPause(p);
|
|
2504
|
-
}
|
|
2505
|
-
} catch {
|
|
2506
|
-
// config unreadable — next tick will retry
|
|
2507
|
-
}
|
|
2508
|
-
}
|
|
2509
|
-
|
|
2510
|
-
setInterval(runLoopGuardPollingTick, LOOP_GUARD_POLL_INTERVAL_MS);
|
|
2511
|
-
|
|
2512
1696
|
// --- Start ---
|
|
2513
1697
|
|
|
2514
|
-
// ---------------------------------------------------------------------------
|
|
2515
|
-
// #416: AC health monitor — auto-restart AgentChattr on crash detection.
|
|
2516
|
-
// Runs a TCP connect probe every 30s for each project with a "running" AC
|
|
2517
|
-
// process. If the port is dead, auto-restarts (reusing the existing restart
|
|
2518
|
-
// logic). Rate-limited to one restart per 60s per project; gives up after
|
|
2519
|
-
// 3 consecutive failures and surfaces a persistent error.
|
|
2520
|
-
// ---------------------------------------------------------------------------
|
|
2521
|
-
// #572: restart agents that are running without AC registration after AC
|
|
2522
|
-
// recovers from a crash. Scans agentSessions for the given project,
|
|
2523
|
-
// finds agents missing acRegistrationName, and stop+respawns them so
|
|
2524
|
-
// they get MCP CLI flags at launch time.
|
|
2525
|
-
async function restartUnregisteredAgents(projectId) {
|
|
2526
|
-
const toRestart = [];
|
|
2527
|
-
for (const [key, session] of agentSessions) {
|
|
2528
|
-
if (session.projectId !== projectId) continue;
|
|
2529
|
-
if (session.acRegistrationName) continue; // already registered
|
|
2530
|
-
if (session.state !== "running") continue;
|
|
2531
|
-
if (!session.acServerPort || !session.acInjectMode) continue;
|
|
2532
|
-
toRestart.push({ key, agentId: session.agentId });
|
|
2533
|
-
}
|
|
2534
|
-
|
|
2535
|
-
if (toRestart.length === 0) return;
|
|
2536
|
-
const samplePort = agentSessions.get(toRestart[0].key)?.acServerPort || "?";
|
|
2537
|
-
console.log(`[health] AC recovered on port ${samplePort} — restarting ${toRestart.length} agent(s) for chat integration`);
|
|
2538
|
-
|
|
2539
|
-
for (const { key, agentId } of toRestart) {
|
|
2540
|
-
try {
|
|
2541
|
-
console.log(`[health] Restarting agent ${agentId} for project ${projectId} to gain chat integration`);
|
|
2542
|
-
await stopAgentSession(key);
|
|
2543
|
-
await spawnAgentPty(projectId, agentId);
|
|
2544
|
-
} catch (err) {
|
|
2545
|
-
console.error(`[health] Failed to restart agent ${agentId}: ${err.message}`);
|
|
2546
|
-
}
|
|
2547
|
-
}
|
|
2548
|
-
}
|
|
2549
|
-
|
|
2550
|
-
const _acHealth = {
|
|
2551
|
-
// Per-project: { lastRestart: timestamp, consecutiveFailures: number }
|
|
2552
|
-
state: new Map(),
|
|
2553
|
-
intervalHandle: null,
|
|
2554
|
-
// #581: per-project reset state — prevents duplicate resets per restart event.
|
|
2555
|
-
// Values: { status: "scheduled"|"succeeded"|"failed", timestamp: number }
|
|
2556
|
-
resetState: new Map(),
|
|
2557
|
-
// #579: per-project grace period. Projects whose AC entered "running"
|
|
2558
|
-
// within the last 60s are skipped by the health monitor so startup
|
|
2559
|
-
// migrations (bridge-migrate, ghost-fix) and fresh spawns can settle.
|
|
2560
|
-
// Tracked via `runningSince` in chattrProcesses entries.
|
|
2561
|
-
};
|
|
2562
|
-
|
|
2563
|
-
function isPortAlive(port) {
|
|
2564
|
-
return new Promise((resolve) => {
|
|
2565
|
-
const sock = net.createConnection({ port, host: "127.0.0.1" }, () => {
|
|
2566
|
-
sock.destroy();
|
|
2567
|
-
resolve(true);
|
|
2568
|
-
});
|
|
2569
|
-
sock.on("error", () => resolve(false));
|
|
2570
|
-
sock.setTimeout(2000, () => { sock.destroy(); resolve(false); });
|
|
2571
|
-
});
|
|
2572
|
-
}
|
|
2573
|
-
|
|
2574
|
-
async function acHealthCheck() {
|
|
2575
|
-
const cfg = readConfig();
|
|
2576
|
-
for (const project of (cfg.projects || [])) {
|
|
2577
|
-
const proc = chattrProcesses.get(project.id);
|
|
2578
|
-
// Only monitor projects that were explicitly started (state === "running"
|
|
2579
|
-
// or had a process). Skip intentionally stopped projects.
|
|
2580
|
-
if (!proc || proc.state === "stopped") continue;
|
|
2581
|
-
// #579: per-project grace period — skip projects whose AC entered
|
|
2582
|
-
// "running" within the last 60s. This lets cmdStart spawns and
|
|
2583
|
-
// startup migrations (bridge-migrate, ghost-fix) settle before the
|
|
2584
|
-
// monitor acts, regardless of when the project was created.
|
|
2585
|
-
if (proc.runningSince && Date.now() - proc.runningSince < 60_000) continue;
|
|
2586
|
-
|
|
2587
|
-
const { url } = resolveProjectChattr(project.id);
|
|
2588
|
-
const portMatch = url.match(/:(\d+)/);
|
|
2589
|
-
const port = portMatch ? parseInt(portMatch[1], 10) : 8300;
|
|
2590
|
-
|
|
2591
|
-
const alive = await isPortAlive(port);
|
|
2592
|
-
const health = _acHealth.state.get(project.id) || { lastRestart: 0, consecutiveFailures: 0 };
|
|
2593
|
-
|
|
2594
|
-
if (alive) {
|
|
2595
|
-
// Healthy — reset failure counter
|
|
2596
|
-
if (health.consecutiveFailures > 0) {
|
|
2597
|
-
console.log(`[health] AC for ${project.id} recovered (port ${port} alive)`);
|
|
2598
|
-
// #572: restart agents that are running without chat integration.
|
|
2599
|
-
// These are agents where the #565 deferred restart timed out, or
|
|
2600
|
-
// agents spawned while AC was down. MCP flags are set at process
|
|
2601
|
-
// launch, so a full stop+respawn is required.
|
|
2602
|
-
// #581: dedupe — skip if a reset is in-flight or succeeded within 60s.
|
|
2603
|
-
// If "scheduled" (in-flight), keep consecutiveFailures=1 so the next
|
|
2604
|
-
// healthy tick re-enters this branch and retries if state became "failed".
|
|
2605
|
-
const rs = _acHealth.resetState.get(project.id);
|
|
2606
|
-
const resetSucceeded = rs && rs.status === "succeeded" && Date.now() - rs.timestamp < 60000;
|
|
2607
|
-
const resetInFlight = rs && rs.status === "scheduled";
|
|
2608
|
-
if (resetSucceeded) {
|
|
2609
|
-
// Already handled — clear failures normally
|
|
2610
|
-
} else if (resetInFlight) {
|
|
2611
|
-
// In-flight — preserve failures so we retry next tick if it fails
|
|
2612
|
-
health.consecutiveFailures = 1;
|
|
2613
|
-
_acHealth.state.set(project.id, health);
|
|
2614
|
-
continue;
|
|
2615
|
-
} else {
|
|
2616
|
-
// No recent reset or previous attempt failed — fire one
|
|
2617
|
-
_acHealth.resetState.set(project.id, { status: "scheduled", timestamp: Date.now() });
|
|
2618
|
-
restartUnregisteredAgents(project.id).then(() => {
|
|
2619
|
-
_acHealth.resetState.set(project.id, { status: "succeeded", timestamp: Date.now() });
|
|
2620
|
-
}).catch((err) => {
|
|
2621
|
-
_acHealth.resetState.set(project.id, { status: "failed", timestamp: Date.now() });
|
|
2622
|
-
console.error(`[health] Failed to restart unregistered agents for ${project.id}:`, err.message);
|
|
2623
|
-
});
|
|
2624
|
-
}
|
|
2625
|
-
}
|
|
2626
|
-
health.consecutiveFailures = 0;
|
|
2627
|
-
_acHealth.state.set(project.id, health);
|
|
2628
|
-
continue;
|
|
2629
|
-
}
|
|
2630
|
-
|
|
2631
|
-
// Port is dead — check rate limits
|
|
2632
|
-
if (health.consecutiveFailures >= 3) {
|
|
2633
|
-
// Already gave up — don't spam restarts. The error state persists
|
|
2634
|
-
// in chattrProcesses for the dashboard to surface.
|
|
2635
|
-
continue;
|
|
2636
|
-
}
|
|
2637
|
-
|
|
2638
|
-
const now = Date.now();
|
|
2639
|
-
if (now - health.lastRestart < 60_000) {
|
|
2640
|
-
// Too soon since last restart attempt
|
|
2641
|
-
continue;
|
|
2642
|
-
}
|
|
2643
|
-
|
|
2644
|
-
health.consecutiveFailures++;
|
|
2645
|
-
health.lastRestart = now;
|
|
2646
|
-
_acHealth.state.set(project.id, health);
|
|
2647
|
-
|
|
2648
|
-
console.warn(`[health] AC for ${project.id} on port ${port} is down (failure ${health.consecutiveFailures}/3) — auto-restarting`);
|
|
2649
|
-
|
|
2650
|
-
// Call the existing restart endpoint internally so we reuse the
|
|
2651
|
-
// hardened path (killProcessOnPort, waitForPortFree, snapshot,
|
|
2652
|
-
// auto-restore) instead of reimplementing spawn logic inline.
|
|
2653
|
-
try {
|
|
2654
|
-
const resp = await fetch(`http://127.0.0.1:${PORT}/api/agentchattr/${encodeURIComponent(project.id)}/restart`, {
|
|
2655
|
-
method: "POST",
|
|
2656
|
-
timeout: 15000,
|
|
2657
|
-
});
|
|
2658
|
-
if (resp.ok) {
|
|
2659
|
-
const data = await resp.json();
|
|
2660
|
-
console.log(`[health] AC for ${project.id} auto-restarted (PID: ${data.pid})`);
|
|
2661
|
-
// #447: agent reset is now chained inside the restart endpoint
|
|
2662
|
-
// itself (fires on a 2s timer), so no separate call needed here.
|
|
2663
|
-
} else {
|
|
2664
|
-
const body = await resp.text().catch(() => "");
|
|
2665
|
-
console.error(`[health] AC auto-restart failed for ${project.id}: ${resp.status} ${body.slice(0, 120)}`);
|
|
2666
|
-
chattrProcesses.set(project.id, { process: null, state: "error", error: `Auto-restart failed: ${resp.status}` });
|
|
2667
|
-
}
|
|
2668
|
-
} catch (err) {
|
|
2669
|
-
console.error(`[health] AC auto-restart failed for ${project.id}:`, err.message);
|
|
2670
|
-
chattrProcesses.set(project.id, { process: null, state: "error", error: `Auto-restart failed: ${err.message}` });
|
|
2671
|
-
}
|
|
2672
|
-
}
|
|
2673
|
-
}
|
|
2674
|
-
|
|
2675
|
-
function startAcHealthMonitor() {
|
|
2676
|
-
if (_acHealth.intervalHandle) return;
|
|
2677
|
-
_acHealth.intervalHandle = setInterval(acHealthCheck, 30_000);
|
|
2678
|
-
console.log("[health] AC health monitor started (30s interval, per-project 60s grace)");
|
|
2679
|
-
}
|
|
2680
|
-
|
|
2681
1698
|
// #705: auto-interrupt agents stuck with no PTY output for 10 minutes.
|
|
2682
1699
|
const WATCHDOG_TIMEOUT_MS = 10 * 60 * 1000;
|
|
2683
1700
|
let _watchdogHandle = null;
|
|
@@ -2686,6 +1703,8 @@ function watchdogCheck() {
|
|
|
2686
1703
|
for (const [key, session] of agentSessions) {
|
|
2687
1704
|
if (session.state !== "running" || !session.term) continue;
|
|
2688
1705
|
if (!session.lastOutputAt) continue;
|
|
1706
|
+
// #732: skip file-chat projects — idle is normal, PTY dispatch wakes them
|
|
1707
|
+
if (routes.getProjectChatMode(session.projectId) === "file") continue;
|
|
2689
1708
|
if (Date.now() - session.lastOutputAt > WATCHDOG_TIMEOUT_MS) {
|
|
2690
1709
|
console.log(`[watchdog] ${key}: no output for 10m — sending Ctrl+C`);
|
|
2691
1710
|
safeWrite(session.term, "\x03");
|
|
@@ -2703,54 +1722,6 @@ function startWatchdog() {
|
|
|
2703
1722
|
// #657: extracted startup migrations so full-reset can re-run them
|
|
2704
1723
|
function runStartupMigrations(cfg) {
|
|
2705
1724
|
const projects = (cfg.projects || []).filter((p) => !p.archived);
|
|
2706
|
-
const acRestartNeeded = [];
|
|
2707
|
-
|
|
2708
|
-
// bridge-migrate
|
|
2709
|
-
for (const p of projects) {
|
|
2710
|
-
const acPath = projectAgentchattrConfigPath(p.id);
|
|
2711
|
-
if (!fs.existsSync(acPath)) continue;
|
|
2712
|
-
try {
|
|
2713
|
-
const before = fs.readFileSync(acPath, "utf-8");
|
|
2714
|
-
const hadOldDc = /^\[agents\.discord-bridge\]\s*$/m.test(before);
|
|
2715
|
-
const hadOldTg = /^\[agents\.telegram-bridge\]\s*$/m.test(before);
|
|
2716
|
-
const dc = patchAgentchattrConfigForDiscordBridge(before);
|
|
2717
|
-
const tg = patchAgentchattrConfigForTelegramBridge(dc.text);
|
|
2718
|
-
if (dc.changed || tg.changed) {
|
|
2719
|
-
fs.writeFileSync(acPath, tg.text);
|
|
2720
|
-
console.log(`[bridge-migrate] ${p.id}: migrated AC config slugs`);
|
|
2721
|
-
if (hadOldDc || hadOldTg) {
|
|
2722
|
-
if (!acRestartNeeded.includes(p.id)) acRestartNeeded.push(p.id);
|
|
2723
|
-
}
|
|
2724
|
-
}
|
|
2725
|
-
} catch {}
|
|
2726
|
-
}
|
|
2727
|
-
|
|
2728
|
-
// bridge-refresh
|
|
2729
|
-
const DISCORD_BRIDGE_SRC = path.join(__dirname, "..", "bridges", "discord", "discord_bridge.py");
|
|
2730
|
-
const DISCORD_BRIDGE_DEST = path.join(os.homedir(), ".quadwork", "agentchattr-discord", "discord_bridge.py");
|
|
2731
|
-
if (fs.existsSync(DISCORD_BRIDGE_SRC) && fs.existsSync(path.dirname(DISCORD_BRIDGE_DEST))) {
|
|
2732
|
-
try {
|
|
2733
|
-
fs.copyFileSync(DISCORD_BRIDGE_SRC, DISCORD_BRIDGE_DEST);
|
|
2734
|
-
console.log("[bridge-refresh] refreshed Discord bridge script from package");
|
|
2735
|
-
} catch (err) {
|
|
2736
|
-
console.warn(`[bridge-refresh] failed to refresh Discord bridge script: ${err.message || err}`);
|
|
2737
|
-
}
|
|
2738
|
-
}
|
|
2739
|
-
|
|
2740
|
-
// bridge slug patches
|
|
2741
|
-
const BRIDGE_SLUG_PATCHES = [
|
|
2742
|
-
{ file: path.join(os.homedir(), ".quadwork", "agentchattr-telegram", "telegram_bridge.py"), old: '"telegram-bridge"', replacement: '"tg"' },
|
|
2743
|
-
{ file: path.join(os.homedir(), ".quadwork", "agentchattr-discord", "discord_bridge.py"), old: '"discord-bridge"', replacement: '"dc"' },
|
|
2744
|
-
];
|
|
2745
|
-
for (const { file, old, replacement } of BRIDGE_SLUG_PATCHES) {
|
|
2746
|
-
try {
|
|
2747
|
-
if (!fs.existsSync(file)) continue;
|
|
2748
|
-
const content = fs.readFileSync(file, "utf-8");
|
|
2749
|
-
if (!content.includes(old)) continue;
|
|
2750
|
-
fs.writeFileSync(file, content.replaceAll(old, replacement));
|
|
2751
|
-
console.log(`[bridge-migrate] patched stale bridge_sender in ${path.basename(file)}`);
|
|
2752
|
-
} catch {}
|
|
2753
|
-
}
|
|
2754
1725
|
|
|
2755
1726
|
// reseed stale slugs
|
|
2756
1727
|
const SLUG_FIXES = [
|
|
@@ -2786,109 +1757,6 @@ function runStartupMigrations(cfg) {
|
|
|
2786
1757
|
}
|
|
2787
1758
|
}
|
|
2788
1759
|
|
|
2789
|
-
// ghost-fix + idle-fix
|
|
2790
|
-
for (const p of projects) {
|
|
2791
|
-
const acDir = resolveProjectChattr(p.id).dir;
|
|
2792
|
-
const regPath = path.join(acDir, "registry.py");
|
|
2793
|
-
if (fs.existsSync(regPath)) {
|
|
2794
|
-
try {
|
|
2795
|
-
let reg = fs.readFileSync(regPath, "utf-8");
|
|
2796
|
-
if (!reg.includes("force: bool")) {
|
|
2797
|
-
reg = reg.replace(
|
|
2798
|
-
/def register\(self, base: str, label: str \| None = None\) -> dict \| None:/,
|
|
2799
|
-
"def register(self, base: str, label: str | None = None, force: bool = False) -> dict | None:",
|
|
2800
|
-
);
|
|
2801
|
-
reg = reg.replace(
|
|
2802
|
-
" self._expire_reserved()\n\n # Find next free slot",
|
|
2803
|
-
" self._expire_reserved()\n\n" +
|
|
2804
|
-
" # quadwork#478 + #502: force-replace\n" +
|
|
2805
|
-
" if force:\n" +
|
|
2806
|
-
" ghosts = [n for n, i in self._instances.items() if i.base == base]\n" +
|
|
2807
|
-
" for name in ghosts:\n" +
|
|
2808
|
-
" del self._instances[name]\n" +
|
|
2809
|
-
" stale_reserved = [rn for rn in self._reserved\n" +
|
|
2810
|
-
" if self._parse_name(rn)[0] == base]\n" +
|
|
2811
|
-
" for rn in stale_reserved:\n" +
|
|
2812
|
-
" del self._reserved[rn]\n\n" +
|
|
2813
|
-
" # Find next free slot",
|
|
2814
|
-
);
|
|
2815
|
-
fs.writeFileSync(regPath, reg);
|
|
2816
|
-
console.log(`[ghost-fix] ${p.id}: patched registry.py with force-replace support`);
|
|
2817
|
-
} else if (!reg.includes("stale_reserved")) {
|
|
2818
|
-
reg = reg.replace(
|
|
2819
|
-
/( +)for name in ghosts:\n\1 del self\._instances\[name\]\n\1 self\._reserved\[name\] = time\.time\(\)/,
|
|
2820
|
-
"$1for name in ghosts:\n$1 del self._instances[name]\n" +
|
|
2821
|
-
"$1stale_reserved = [rn for rn in self._reserved\n" +
|
|
2822
|
-
"$1 if self._parse_name(rn)[0] == base]\n" +
|
|
2823
|
-
"$1for rn in stale_reserved:\n" +
|
|
2824
|
-
"$1 del self._reserved[rn]",
|
|
2825
|
-
);
|
|
2826
|
-
fs.writeFileSync(regPath, reg);
|
|
2827
|
-
console.log(`[ghost-fix] ${p.id}: upgraded registry.py force-replace to clear _reserved (#502)`);
|
|
2828
|
-
}
|
|
2829
|
-
} catch (err) {
|
|
2830
|
-
console.warn(`[ghost-fix] ${p.id}: failed to patch registry.py: ${err.message}`);
|
|
2831
|
-
}
|
|
2832
|
-
}
|
|
2833
|
-
const appPath = path.join(acDir, "app.py");
|
|
2834
|
-
if (fs.existsSync(appPath)) {
|
|
2835
|
-
try {
|
|
2836
|
-
let app = fs.readFileSync(appPath, "utf-8");
|
|
2837
|
-
if (!app.includes("force = bool(body.get(\"force\"")) {
|
|
2838
|
-
app = app.replace(
|
|
2839
|
-
" result = registry.register(base, label)\n",
|
|
2840
|
-
" force = bool(body.get(\"force\", False))\n result = registry.register(base, label, force=force)\n",
|
|
2841
|
-
);
|
|
2842
|
-
fs.writeFileSync(appPath, app);
|
|
2843
|
-
console.log(`[ghost-fix] ${p.id}: patched app.py with force-replace support`);
|
|
2844
|
-
}
|
|
2845
|
-
} catch (err) {
|
|
2846
|
-
console.warn(`[ghost-fix] ${p.id}: failed to patch app.py: ${err.message}`);
|
|
2847
|
-
}
|
|
2848
|
-
}
|
|
2849
|
-
if (fs.existsSync(appPath)) {
|
|
2850
|
-
try {
|
|
2851
|
-
const app = fs.readFileSync(appPath, "utf-8");
|
|
2852
|
-
if (app.includes("_CRASH_TIMEOUT = 15")) {
|
|
2853
|
-
patchCrashTimeout(acDir);
|
|
2854
|
-
console.log(`[idle-fix] ${p.id}: crash timeout patched on disk`);
|
|
2855
|
-
acRestartNeeded.push(p.id);
|
|
2856
|
-
}
|
|
2857
|
-
} catch (err) {
|
|
2858
|
-
console.warn(`[idle-fix] ${p.id}: failed to patch app.py crash timeout: ${err.message}`);
|
|
2859
|
-
}
|
|
2860
|
-
}
|
|
2861
|
-
}
|
|
2862
|
-
|
|
2863
|
-
// CLI-based agent sections
|
|
2864
|
-
for (const p of projects) {
|
|
2865
|
-
const acPath = projectAgentchattrConfigPath(p.id);
|
|
2866
|
-
if (!fs.existsSync(acPath)) continue;
|
|
2867
|
-
try {
|
|
2868
|
-
let toml = fs.readFileSync(acPath, "utf-8");
|
|
2869
|
-
const cliSections = new Set();
|
|
2870
|
-
for (const [, agentCfg] of Object.entries(p.agents || {})) {
|
|
2871
|
-
const cmd = agentCfg.command || "claude";
|
|
2872
|
-
const cli = cmd.split("/").pop().split(" ")[0];
|
|
2873
|
-
cliSections.add(cli);
|
|
2874
|
-
}
|
|
2875
|
-
let changed = false;
|
|
2876
|
-
for (const cli of cliSections) {
|
|
2877
|
-
if (!new RegExp(`^\\[agents\\.${cli}\\]`, "m").test(toml)) {
|
|
2878
|
-
const injectMode = cli === "codex" ? "proxy_flag" : cli === "gemini" ? "env" : "flag";
|
|
2879
|
-
toml += `\n[agents.${cli}]\ncommand = "${cli}"\nlabel = "${cli}"\nmcp_inject = "${injectMode}"\n`;
|
|
2880
|
-
changed = true;
|
|
2881
|
-
}
|
|
2882
|
-
}
|
|
2883
|
-
if (changed) {
|
|
2884
|
-
fs.writeFileSync(acPath, toml);
|
|
2885
|
-
console.log(`[#596] ${p.id}: added CLI-based agent sections to config.toml`);
|
|
2886
|
-
}
|
|
2887
|
-
} catch (err) {
|
|
2888
|
-
console.warn(`[#596] ${p.id}: config.toml migration failed: ${err.message}`);
|
|
2889
|
-
}
|
|
2890
|
-
}
|
|
2891
|
-
|
|
2892
1760
|
// #690: seed DESIGN-GUIDE.md into existing agent worktrees
|
|
2893
1761
|
const designGuideSrc = path.join(__dirname, "..", "templates", "seeds", "DESIGN-GUIDE.md");
|
|
2894
1762
|
if (fs.existsSync(designGuideSrc)) {
|
|
@@ -2909,99 +1777,66 @@ function runStartupMigrations(cfg) {
|
|
|
2909
1777
|
}
|
|
2910
1778
|
}
|
|
2911
1779
|
|
|
2912
|
-
return acRestartNeeded;
|
|
2913
1780
|
}
|
|
2914
1781
|
|
|
2915
1782
|
server.listen(PORT, "127.0.0.1", async () => {
|
|
2916
1783
|
console.log(`QuadWork server listening on http://127.0.0.1:${PORT}`);
|
|
2917
1784
|
syncTriggersFromConfig();
|
|
2918
|
-
// #579: detect AC processes already running (spawned by cmdStart before
|
|
2919
|
-
// the server module loaded). Without this, chattrProcesses is empty on
|
|
2920
|
-
// boot and the health monitor can't track cmdStart-spawned ACs, while
|
|
2921
|
-
// the dashboard's Start button would redundantly kill+respawn them.
|
|
2922
1785
|
const startupCfg = readConfig();
|
|
2923
|
-
|
|
2924
|
-
|
|
2925
|
-
|
|
2926
|
-
|
|
2927
|
-
|
|
2928
|
-
|
|
2929
|
-
|
|
2930
|
-
|
|
2931
|
-
|
|
2932
|
-
|
|
2933
|
-
|
|
1786
|
+
|
|
1787
|
+
// #719: Migrate AC chat history to JSONL before initializing file-chat.
|
|
1788
|
+
const migrationFailed = new Set(runAcMigration(startupCfg));
|
|
1789
|
+
|
|
1790
|
+
// #722: One-time switchover — set all projects to file-based chat.
|
|
1791
|
+
if (!startupCfg.file_chat_switchover_done) {
|
|
1792
|
+
let switched = false;
|
|
1793
|
+
for (const p of (startupCfg.projects || [])) {
|
|
1794
|
+
if (p.chat_mode !== "file" && !migrationFailed.has(p.id)) {
|
|
1795
|
+
p.chat_mode = "file";
|
|
1796
|
+
switched = true;
|
|
1797
|
+
console.log(`[startup] ${p.id}: switched to file-based chat`);
|
|
1798
|
+
}
|
|
2934
1799
|
}
|
|
1800
|
+
startupCfg.file_chat_switchover_done = true;
|
|
1801
|
+
writeConfig(startupCfg);
|
|
1802
|
+
if (switched) console.log("[startup] file-chat switchover complete");
|
|
2935
1803
|
}
|
|
2936
|
-
|
|
2937
|
-
//
|
|
2938
|
-
// instances receive the fix without requiring a restart.
|
|
2939
|
-
// #448: retry after 5s for projects where AC isn't up yet at boot.
|
|
1804
|
+
|
|
1805
|
+
// Initialize file-chat engine for all projects.
|
|
2940
1806
|
for (const p of (startupCfg.projects || [])) {
|
|
2941
|
-
|
|
2942
|
-
|
|
2943
|
-
|
|
2944
|
-
|
|
2945
|
-
|
|
2946
|
-
|
|
2947
|
-
|
|
2948
|
-
|
|
2949
|
-
|
|
2950
|
-
|
|
2951
|
-
|
|
2952
|
-
|
|
2953
|
-
if (startupCfg._acRestartNeeded) {
|
|
2954
|
-
for (const projectId of startupCfg._acRestartNeeded) {
|
|
2955
|
-
const { url } = resolveProjectChattr(projectId);
|
|
2956
|
-
const portMatch = url.match(/:(\d+)/);
|
|
2957
|
-
const port = portMatch ? parseInt(portMatch[1], 10) : 8300;
|
|
2958
|
-
isPortAlive(port).then((alive) => {
|
|
2959
|
-
if (!alive) return;
|
|
2960
|
-
console.log(`[idle-fix] ${projectId}: restarting AC (port ${port}) so running process observes _CRASH_TIMEOUT = 120 (#629)`);
|
|
2961
|
-
return fetch(`http://127.0.0.1:${PORT}/api/agentchattr/${encodeURIComponent(projectId)}/restart`, {
|
|
2962
|
-
method: "POST",
|
|
2963
|
-
headers: { "Content-Type": "application/json" },
|
|
2964
|
-
body: JSON.stringify({ action: "restart" }),
|
|
2965
|
-
});
|
|
2966
|
-
}).then((r) => {
|
|
2967
|
-
if (r && r.ok) console.log(`[idle-fix] ${projectId}: AC restarted successfully`);
|
|
2968
|
-
else if (r) console.warn(`[idle-fix] ${projectId}: AC restart returned ${r.status}`);
|
|
2969
|
-
}).catch((err) => {
|
|
2970
|
-
console.warn(`[idle-fix] ${projectId}: AC restart failed: ${err.message}`);
|
|
2971
|
-
});
|
|
1807
|
+
if (p.chat_mode === "file") {
|
|
1808
|
+
if (migrationFailed.has(p.id)) {
|
|
1809
|
+
console.error(`[startup] ${p.id}: migration failed — skipping file-chat init`);
|
|
1810
|
+
continue;
|
|
1811
|
+
}
|
|
1812
|
+
try {
|
|
1813
|
+
fileChat.initProject(p.id);
|
|
1814
|
+
console.log(`[startup] ${p.id}: file-chat engine initialized`);
|
|
1815
|
+
} catch (err) {
|
|
1816
|
+
console.error(`[startup] FATAL: ${p.id}: ${err.message}`);
|
|
1817
|
+
process.exit(1);
|
|
1818
|
+
}
|
|
2972
1819
|
}
|
|
2973
1820
|
}
|
|
2974
|
-
|
|
1821
|
+
|
|
1822
|
+
runStartupMigrations(startupCfg);
|
|
1823
|
+
|
|
2975
1824
|
if (startupCfg.butler && startupCfg.butler.enabled && startupCfg.butler.auto_start) {
|
|
2976
1825
|
const result = spawnButlerPty();
|
|
2977
1826
|
if (result.ok) console.log(`[butler] auto-started (PID: ${result.pid})`);
|
|
2978
1827
|
else console.warn(`[butler] auto-start failed: ${result.error}`);
|
|
2979
1828
|
}
|
|
2980
|
-
// #416: start the AC health monitor
|
|
2981
|
-
startAcHealthMonitor();
|
|
2982
|
-
// #705: start the stuck-agent watchdog
|
|
2983
1829
|
startWatchdog();
|
|
2984
1830
|
});
|
|
2985
1831
|
|
|
2986
|
-
|
|
2987
|
-
|
|
2988
|
-
|
|
2989
|
-
|
|
2990
|
-
|
|
2991
|
-
|
|
2992
|
-
* invisible to the CLI. Without this, a Ctrl+C in the foreground
|
|
2993
|
-
* quadwork terminal would exit the Node process and orphan every
|
|
2994
|
-
* dashboard-started python run.py. See review on quadwork#213.
|
|
2995
|
-
*/
|
|
2996
|
-
function shutdownChattrProcesses() {
|
|
2997
|
-
for (const [, proc] of chattrProcesses) {
|
|
2998
|
-
if (proc && proc.process) {
|
|
2999
|
-
try { proc.process.kill("SIGTERM"); } catch {}
|
|
1832
|
+
function shutdown() {
|
|
1833
|
+
stopButlerPty();
|
|
1834
|
+
const cfg = readConfig();
|
|
1835
|
+
for (const p of (cfg.projects || [])) {
|
|
1836
|
+
if (p.chat_mode === "file") {
|
|
1837
|
+
try { fileChat.shutdownProject(p.id); } catch {}
|
|
3000
1838
|
}
|
|
3001
1839
|
}
|
|
3002
|
-
chattrProcesses.clear();
|
|
3003
|
-
// #631: stop Butler PTY on shutdown
|
|
3004
|
-
stopButlerPty();
|
|
3005
1840
|
}
|
|
3006
1841
|
|
|
3007
|
-
module.exports = {
|
|
1842
|
+
module.exports = { shutdown };
|