agent-coord-mcp 0.7.1 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +24 -6
- package/dist/server.js +3 -1
- package/dist/server.js.map +1 -1
- package/dist/tools.js +488 -17
- package/dist/tools.js.map +1 -1
- package/hooks/peek-coord.mjs +5 -1
- package/hooks/tmux-pusher.mjs +73 -4
- package/package.json +1 -1
- package/scripts/coord-pusher.mjs +63 -4
- package/src/server.ts +18 -0
- package/src/tools.ts +509 -15
package/src/tools.ts
CHANGED
|
@@ -10,6 +10,9 @@ import {
|
|
|
10
10
|
CURSOR_DIR,
|
|
11
11
|
DEFAULT_ROOM,
|
|
12
12
|
INBOX_DIR,
|
|
13
|
+
ROOT,
|
|
14
|
+
ROOM_FILE,
|
|
15
|
+
ROOMS_DIR,
|
|
13
16
|
ROOMS_FILE,
|
|
14
17
|
STATUS_FILE,
|
|
15
18
|
addMember,
|
|
@@ -71,6 +74,10 @@ type Message = {
|
|
|
71
74
|
text: string;
|
|
72
75
|
// System notices (join/part/topic/nick) — rendered distinctly by clients.
|
|
73
76
|
system?: boolean;
|
|
77
|
+
// Control commands (`/clear`, `/compact`) addressed at the agent's CLI, not
|
|
78
|
+
// its operator. The tmux pushers inject these RAW (no banner/prefix) so the
|
|
79
|
+
// TUI runs them as real slash commands; every other consumer ignores them.
|
|
80
|
+
control?: boolean;
|
|
74
81
|
};
|
|
75
82
|
|
|
76
83
|
type StatusEntry = {
|
|
@@ -245,26 +252,12 @@ export async function listAgentsTool() {
|
|
|
245
252
|
|
|
246
253
|
async function loadLiveTransports(): Promise<Map<string, TransportMarker>> {
|
|
247
254
|
const out = new Map<string, TransportMarker>();
|
|
248
|
-
// For remote markers we can't pid-check the foreign process — instead we trust
|
|
249
|
-
// the registry's lastHeartbeat, which the remote pusher refreshes every minute.
|
|
250
255
|
const reg = await readJson<AgentRegistry>(AGENTS_FILE, {});
|
|
251
256
|
const now = Date.now();
|
|
252
257
|
for (const fname of await listTransportFiles()) {
|
|
253
258
|
const file = path.join(TRANSPORT_DIR, fname);
|
|
254
259
|
const marker = await readJson<TransportMarker | null>(file, null);
|
|
255
|
-
if (!marker) {
|
|
256
|
-
await deleteFile(file);
|
|
257
|
-
continue;
|
|
258
|
-
}
|
|
259
|
-
const isRemote = marker.transport === "tmux-push-remote";
|
|
260
|
-
if (isRemote) {
|
|
261
|
-
const entry = reg[marker.agentId];
|
|
262
|
-
const fresh = !!entry && now - entry.lastHeartbeat < STALE_MS;
|
|
263
|
-
if (!fresh) {
|
|
264
|
-
await deleteFile(file);
|
|
265
|
-
continue;
|
|
266
|
-
}
|
|
267
|
-
} else if (!isPidAlive(marker.pid)) {
|
|
260
|
+
if (!marker || !isMarkerLive(marker, reg, now)) {
|
|
268
261
|
await deleteFile(file);
|
|
269
262
|
continue;
|
|
270
263
|
}
|
|
@@ -273,6 +266,17 @@ async function loadLiveTransports(): Promise<Map<string, TransportMarker>> {
|
|
|
273
266
|
return out;
|
|
274
267
|
}
|
|
275
268
|
|
|
269
|
+
/**
 * Decide whether a transport marker still refers to a live pusher.
 *
 * Local markers carry a real pid that can be probed directly. Remote markers
 * (`tmux-push-remote`, pid 0 on a foreign host) cannot be probed, so the
 * registry heartbeat refreshed by the remote pusher is trusted instead: the
 * marker counts as live while that heartbeat is younger than STALE_MS.
 *
 * @param marker transport marker read from the transport directory
 * @param reg    agent registry snapshot used for the remote heartbeat lookup
 * @param now    reference timestamp in epoch milliseconds
 * @returns true when the marker should be kept, false when it is stale
 */
function isMarkerLive(marker: TransportMarker, reg: AgentRegistry, now: number): boolean {
  const isRemote = marker.transport === "tmux-push-remote";
  if (!isRemote) return isPidAlive(marker.pid);

  // Remote: no registry entry means there is no heartbeat to trust.
  const registered = reg[marker.agentId];
  if (!registered) return false;
  return now - registered.lastHeartbeat < STALE_MS;
}
|
|
279
|
+
|
|
276
280
|
function isPidAlive(pid: number): boolean {
|
|
277
281
|
if (!pid || pid <= 0) return false;
|
|
278
282
|
try {
|
|
@@ -335,6 +339,123 @@ export async function sendMessageTool(args: {
|
|
|
335
339
|
return { ok: true, id: msg.id, target, room: chan };
|
|
336
340
|
}
|
|
337
341
|
|
|
342
|
+
// ---------- send_command (context-management control commands) ----------
|
|
343
|
+
|
|
344
|
+
/**
 * The only slash commands a lead may inject into a sub-agent's CLI. Locked on
 * purpose: these wipe/compact context (cheap, reversible-by-the-agent), nothing
 * that mutates the repo or the bus. Entries are stored WITHOUT the leading
 * slash; the wire text is `/${cmd}`.
 */
export const CONTROL_COMMANDS = ["clear", "compact"] as const;

/**
 * Transports whose pusher can actually TYPE a slash command into a live CLI.
 * A control command is meaningless to a plain MCP poller, so send_command is
 * gated to agents currently attached over one of these.
 */
const TMUX_TRANSPORTS = new Set<string>(["tmux-push", "tmux-push-remote"]);

/**
 * Canonicalize a user-supplied command name.
 *
 * Surrounding whitespace and any run of leading slashes are removed and the
 * remainder is lower-cased, so "clear", "/clear" and " /Clear " all become
 * "clear".
 *
 * @param raw command text as supplied by the caller
 * @returns the allowlisted command without its slash, or null if not allowed
 */
function normalizeControlCommand(raw: string): string | null {
  const candidate = raw.trim().replace(/^\/+/, "").toLowerCase();
  for (const allowed of CONTROL_COMMANDS) {
    if (allowed === candidate) return allowed;
  }
  return null;
}
|
|
360
|
+
|
|
361
|
+
/**
 * Live transports narrowed to the tmux-push family (local + remote).
 *
 * @returns map of agent id to its live marker, containing only entries whose
 *          transport is listed in TMUX_TRANSPORTS
 */
async function liveTmuxTargets(): Promise<Map<string, TransportMarker>> {
  const tmuxOnly = new Map<string, TransportMarker>();
  const everyLive = await loadLiveTransports();
  everyLive.forEach((marker, agentId) => {
    if (TMUX_TRANSPORTS.has(marker.transport)) tmuxOnly.set(agentId, marker);
  });
  return tmuxOnly;
}
|
|
368
|
+
|
|
369
|
+
/** Input shape for send_command: `from` and `command` are required; `to` and `room` are optional. */
export const sendCommandSchema = {
  // Sender id; must be a non-empty string.
  from: z.string().min(1),
  // Single-agent target.
  to: z.string().optional(),
  // Channel target.
  room: z.string().optional(),
  // Raw command text; must be a non-empty string.
  command: z.string().min(1),
};
|
|
375
|
+
|
|
376
|
+
/**
 * Inject a context-management slash command into a sub-agent's live tmux
 * session.
 *
 * Writes a control-flagged message that the pushers deliver RAW (no banner, no
 * `[DM …]` prefix) so the receiving CLI runs it as a real slash command. The
 * call is hard-gated to tmux: it refuses unless the target(s) have a live
 * tmux-push(-remote) transport, so a command never rots unexecuted in an
 * offline inbox.
 *
 * @param args.from    sender id, recorded on the message and excluded from room fan-out
 * @param args.to      single-agent target (mutually exclusive with `room`)
 * @param args.room    channel target (mutually exclusive with `to`)
 * @param args.command command name, with or without a leading slash
 * @returns `{ ok: false, error }` on any refusal; otherwise `{ ok: true, ... }`
 *          with the message id, the wire command, the file written, and the
 *          recipients (plus `skipped` non-tmux members for a room send)
 */
export async function sendCommandTool(args: {
  from: string;
  to?: string;
  room?: string;
  command: string;
}) {
  const cmd = normalizeControlCommand(args.command);
  if (!cmd) {
    return {
      ok: false,
      error: `unsupported command '${args.command}'. Allowed: ${CONTROL_COMMANDS.map((c) => "/" + c).join(", ")}`,
    };
  }

  // Exactly one addressing mode must be chosen.
  const wantsAgent = Boolean(args.to);
  const wantsRoom = Boolean(args.room);
  if (!wantsAgent && !wantsRoom) {
    return { ok: false, error: "specify 'to' (a single agent) or 'room' (a channel's tmux-attached members)" };
  }
  if (wantsAgent && wantsRoom) {
    return { ok: false, error: "specify only one of 'to' or 'room'" };
  }

  const text = `/${cmd}`;

  // Shared message builder; the destination field sits between `from` and
  // `text` so the serialized key order stays the same as before.
  const buildControl = (dest: { to: string } | { room: string }): Message => ({
    id: randomUUID(),
    ts: Date.now(),
    from: args.from,
    ...dest,
    text,
    control: true,
  });

  const liveTmux = await liveTmuxTargets();

  // DM: target must itself be tmux-attached.
  if (args.to) {
    const marker = liveTmux.get(args.to);
    if (!marker) {
      return {
        ok: false,
        error: `'${args.to}' has no live tmux-push transport — control commands can only be injected into a tmux session. Attach it (join/attach_agent) or target an attached agent.`,
      };
    }
    const dm = buildControl({ to: args.to });
    const target = inboxFile(args.to);
    await appendJsonl(target, dm);
    return { ok: true, id: dm.id, command: text, target, delivered: [args.to], transport: marker.transport };
  }

  // Room: broadcast to every tmux-attached member (never the sender itself).
  const chan = normalizeRoom(args.room);
  const rooms = await getRooms();
  const members = rooms[chan]?.members ?? [];

  // One pass: split the non-sender members into tmux-attached and not.
  const delivered: string[] = [];
  const skipped: string[] = [];
  for (const member of members) {
    if (member === args.from) continue;
    if (liveTmux.has(member)) delivered.push(member);
    else skipped.push(member);
  }

  if (delivered.length === 0) {
    return {
      ok: false,
      error: `no tmux-attached members in #${chan} to receive '${text}' (${members.length} member(s) total). Control commands only fire in a live tmux session.`,
    };
  }

  const broadcast = buildControl({ room: chan });
  const target = roomFile(chan);
  await appendJsonl(target, broadcast);
  return {
    ok: true,
    id: broadcast.id,
    command: text,
    target,
    room: chan,
    delivered,
    skipped: skipped.length ? skipped : undefined,
  };
}
|
|
458
|
+
|
|
338
459
|
// ---------- read_messages ----------
|
|
339
460
|
|
|
340
461
|
export const readMessagesSchema = {
|
|
@@ -1121,6 +1242,379 @@ export async function clearTransportTool(args: { agentId: string }) {
|
|
|
1121
1242
|
return { ok: true, removed };
|
|
1122
1243
|
}
|
|
1123
1244
|
|
|
1245
|
+
// ---------- doctor (bus-wide health check) ----------
|
|
1246
|
+
|
|
1247
|
+
/** Severity attached to a single doctor check. */
type DoctorLevel = "ok" | "warn" | "error";

/** Outcome of one doctor check. */
type DoctorFinding = {
  /** Stable identifier of the check, e.g. "orphan-transport-markers". */
  check: string;
  /** Severity of the outcome. */
  level: DoctorLevel;
  /** Human-readable summary of what was found. */
  detail: string;
  /** Whether the doctor has a repair path for this check; report-only checks set false. */
  fixable: boolean;
  /** Offending entries, left undefined when there is nothing to list. */
  items?: string[];
};
|
|
1255
|
+
|
|
1256
|
+
/**
 * Count non-empty lines vs successfully-parsed entries in a JSONL file.
 *
 * Offsets index the PARSED entries (see readJsonl), so `parsed` is the figure
 * cursor math is compared against; `malformed` is the desync risk.
 *
 * A missing file is reported as all zeros. That includes a file that
 * disappears between the existence check and the read: the doctor runs
 * alongside live writers, and a vanished file should not abort the whole run.
 *
 * @param file path of the JSONL file to scan
 * @returns `lines` (non-blank lines), `parsed` (lines that are valid JSON) and
 *          `malformed` (`lines - parsed`)
 * @throws any read error other than ENOENT
 */
async function scanJsonl(file: string): Promise<{ lines: number; parsed: number; malformed: number }> {
  if (!existsSync(file)) return { lines: 0, parsed: 0, malformed: 0 };

  let raw: string;
  try {
    raw = await fsp.readFile(file, "utf8");
  } catch (err: unknown) {
    // Deleted after the existence check: same result as "never existed".
    const code = (err as { code?: unknown } | null)?.code;
    if (code === "ENOENT") return { lines: 0, parsed: 0, malformed: 0 };
    throw err;
  }

  let lines = 0;
  let parsed = 0;
  for (const line of raw.split("\n")) {
    // Blank / whitespace-only lines are not entries at all.
    if (!line.trim()) continue;
    lines++;
    try {
      JSON.parse(line);
      parsed++;
    } catch {
      // malformed: counted in `lines` only
    }
  }
  return { lines, parsed, malformed: lines - parsed };
}
|
|
1276
|
+
|
|
1277
|
+
/**
 * Find leftover proper-lockfile lock dirs (`<file>.lock`) across the state
 * dirs. Anything older than the threshold is almost certainly orphaned by a
 * crashed writer (withLock's stale window is 5s).
 *
 * Only the top level of each state dir is inspected. Directories that do not
 * exist or cannot be listed are skipped, as are entries that vanish before
 * they can be stat'ed.
 *
 * @param olderThanMs minimum age (exclusive) for an entry to be reported
 * @param now         reference timestamp in epoch milliseconds
 * @returns one record per stale `.lock` entry, with its full path and age
 */
async function scanStaleLocks(olderThanMs: number, now: number): Promise<{ path: string; ageMs: number }[]> {
  // Names ending in ".lock" directly inside `dir`; empty when the dir is absent or unreadable.
  const lockNamesIn = async (dir: string): Promise<string[]> => {
    if (!existsSync(dir)) return [];
    try {
      const entries = await fsp.readdir(dir);
      return entries.filter((entry) => entry.endsWith(".lock"));
    } catch {
      return [];
    }
  };

  const stale: { path: string; ageMs: number }[] = [];
  for (const dir of [ROOT, INBOX_DIR, CURSOR_DIR, ROOMS_DIR, TRANSPORT_DIR]) {
    for (const lockName of await lockNamesIn(dir)) {
      const lockPath = path.join(dir, lockName);
      let modifiedAt: number;
      try {
        modifiedAt = (await fsp.stat(lockPath)).mtimeMs;
      } catch {
        // vanished mid-scan
        continue;
      }
      const ageMs = now - modifiedAt;
      if (ageMs > olderThanMs) stale.push({ path: lockPath, ageMs });
    }
  }
  return stale;
}
|
|
1305
|
+
|
|
1306
|
+
/** Input shape for the doctor tool; both fields are optional. */
export const doctorSchema = {
  // Apply repairs for fixable findings; the tool treats an omitted value as false.
  fix: z.boolean().optional(),
  // Size threshold in bytes for the oversized-files check; must be a positive integer.
  maxFileBytes: z.number().int().positive().optional(),
};
|
|
1310
|
+
|
|
1311
|
+
/**
 * Bus-wide health check.
 *
 * Runs ten checks over the on-disk state (transport markers, room
 * memberships, inbox/cursor files, cursor offsets, JSONL integrity, stale
 * agents, file sizes, lock dirs, channel/registry consistency, environment)
 * and pushes one finding per check. With `fix` set, the checks marked
 * `fixable: true` also repair what they find and describe each repair in
 * `fixed`; the remaining checks are report-only.
 *
 * @param args.fix          apply repairs for fixable findings (default false)
 * @param args.maxFileBytes size threshold for the oversized-files check
 *                          (default 5 MiB)
 * @returns `findings` (one per check), a per-level `summary`, `healthy`
 *          (true when there is no warn/error finding), `fixApplied`, `root`,
 *          and `fixed` (only when `fix` was requested)
 */
export async function doctorTool(args: { fix?: boolean; maxFileBytes?: number }) {
  const fix = args.fix ?? false;
  const maxBytes = args.maxFileBytes ?? 5 * 1024 * 1024;
  const now = Date.now();
  const findings: DoctorFinding[] = [];
  const fixed: string[] = [];

  const reg = await readJson<AgentRegistry>(AGENTS_FILE, {});
  const known = new Set(Object.keys(reg));
  const rooms = await getRooms();
  const channels = Object.keys(rooms);

  // 1. Orphan transport markers (dead local pid, or stale remote heartbeat).
  {
    const dead: string[] = [];
    for (const fname of await listTransportFiles()) {
      const file = path.join(TRANSPORT_DIR, fname);
      const marker = await readJson<TransportMarker | null>(file, null);
      if (!marker || !isMarkerLive(marker, reg, now)) {
        dead.push(file);
        if (fix) {
          await deleteFile(file);
          fixed.push(`deleted stale transport marker ${fname}`);
        }
      }
    }
    findings.push({
      check: "orphan-transport-markers",
      level: dead.length ? "warn" : "ok",
      detail: dead.length ? `${dead.length} stale transport marker(s) (dead pid or expired remote heartbeat)` : "no stale transport markers",
      fixable: true,
      items: dead.length ? dead.map((f) => path.basename(f)) : undefined,
    });
  }

  // 2. Orphan room memberships (member not in the registry).
  {
    const orphans = new Set<string>();
    for (const e of Object.values(rooms)) {
      for (const m of e.members ?? []) if (!known.has(m)) orphans.add(m);
    }
    if (fix && orphans.size) {
      await updateJson<RoomRegistry>(ROOMS_FILE, {}, (cur) => {
        for (const e of Object.values(cur)) {
          if (e.members?.length) e.members = e.members.filter((m) => known.has(m));
        }
        return cur;
      });
      fixed.push(`dropped ${orphans.size} orphan membership(s): ${[...orphans].join(", ")}`);
    }
    findings.push({
      check: "orphan-room-memberships",
      level: orphans.size ? "warn" : "ok",
      detail: orphans.size ? `${orphans.size} channel member(s) not in the registry` : "all channel members are registered",
      fixable: true,
      items: orphans.size ? [...orphans] : undefined,
    });
  }

  // 3. Orphan inbox / cursor files (owner not registered).
  {
    const orphanInbox: string[] = [];
    for (const fname of await listInboxFiles()) {
      const id = fname.replace(/\.jsonl$/, "");
      if (!known.has(id)) {
        orphanInbox.push(id);
        if (fix) {
          await deleteFile(path.join(INBOX_DIR, fname));
          fixed.push(`deleted orphan inbox ${fname}`);
        }
      }
    }
    const orphanCursor: string[] = [];
    for (const fname of await listCursorFiles()) {
      const id = fname.replace(/\.json$/, "");
      if (!known.has(id)) {
        orphanCursor.push(id);
        if (fix) {
          await deleteFile(path.join(CURSOR_DIR, fname));
          fixed.push(`deleted orphan cursor ${fname}`);
        }
      }
    }
    const total = orphanInbox.length + orphanCursor.length;
    findings.push({
      check: "orphan-inboxes-cursors",
      level: total ? "warn" : "ok",
      detail: total
        ? `${orphanInbox.length} inbox + ${orphanCursor.length} cursor file(s) for unregistered ids`
        : "no orphan inbox/cursor files",
      fixable: true,
      // An id may own both an orphan inbox and an orphan cursor; list it once.
      items: total ? [...new Set([...orphanInbox, ...orphanCursor])] : undefined,
    });
  }

  // Memoized line counts shared by the cursor + malformed checks, so each
  // JSONL file is read at most once per run.
  const counts = new Map<string, { lines: number; parsed: number; malformed: number }>();
  const countFor = async (file: string) => {
    if (!counts.has(file)) counts.set(file, await scanJsonl(file));
    return counts.get(file)!;
  };

  // 4. Cursor offsets past end-of-file (would return [] forever).
  {
    const broken: string[] = [];
    for (const fname of await listCursorFiles()) {
      const id = fname.replace(/\.json$/, "");
      const cursorPath = path.join(CURSOR_DIR, fname);
      const cursor = await readJson<Cursor>(cursorPath, {});
      const overflow: string[] = [];
      const inboxMax = (await countFor(inboxFile(id))).parsed;
      if ((cursor.inboxOffset ?? 0) > inboxMax) overflow.push(`inboxOffset ${cursor.inboxOffset}>${inboxMax}`);
      const roomMax = (await countFor(ROOM_FILE)).parsed;
      if ((cursor.roomOffset ?? 0) > roomMax) overflow.push(`roomOffset ${cursor.roomOffset}>${roomMax}`);
      const statusMax = (await countFor(STATUS_FILE)).parsed;
      if ((cursor.statusOffset ?? 0) > statusMax) overflow.push(`statusOffset ${cursor.statusOffset}>${statusMax}`);
      for (const [chan, off] of Object.entries(cursor.roomOffsets ?? {})) {
        const max = (await countFor(roomFile(chan))).parsed;
        if (off > max) overflow.push(`roomOffsets[${chan}] ${off}>${max}`);
      }
      if (overflow.length) {
        broken.push(`${id}: ${overflow.join(", ")}`);
        if (fix) {
          // The callback receives the cursor's current content, which may
          // differ from the copy read above.
          await updateJson<Cursor>(cursorPath, {}, (c) => {
            if ((c.inboxOffset ?? 0) > inboxMax) c.inboxOffset = inboxMax;
            if ((c.roomOffset ?? 0) > roomMax) c.roomOffset = roomMax;
            if ((c.statusOffset ?? 0) > statusMax) c.statusOffset = statusMax;
            if (c.roomOffsets) {
              for (const chan of Object.keys(c.roomOffsets)) {
                const scanned = counts.get(roomFile(chan));
                // A channel that appeared in the cursor after the read above
                // was never scanned, so its length is unknown. Leave it alone
                // rather than clamping a possibly valid offset down to 0,
                // which would replay the whole channel to that agent.
                if (!scanned) continue;
                if (c.roomOffsets[chan] > scanned.parsed) c.roomOffsets[chan] = scanned.parsed;
              }
            }
            return c;
          });
          fixed.push(`clamped cursor offsets for ${id}`);
        }
      }
    }
    findings.push({
      check: "cursor-past-eof",
      level: broken.length ? "error" : "ok",
      detail: broken.length ? `${broken.length} cursor(s) with an offset past EOF — delivery stalled` : "all cursor offsets are within bounds",
      fixable: true,
      items: broken.length ? broken : undefined,
    });
  }

  // 5. Malformed JSONL lines (silently desync offset math between server + hooks).
  {
    const jsonlFiles = [
      ROOM_FILE,
      STATUS_FILE,
      ...channels.filter((c) => c !== DEFAULT_ROOM).map((c) => roomFile(c)),
      ...(await listInboxFiles()).map((f) => path.join(INBOX_DIR, f)),
    ];
    const bad: string[] = [];
    for (const file of jsonlFiles) {
      const c = await countFor(file);
      if (c.malformed > 0) {
        bad.push(`${path.basename(file)} (${c.malformed})`);
        if (fix) {
          // Keep a copy of the untouched file before rewriting it.
          await fsp.copyFile(file, file + ".bak");
          await rewriteJsonl(file, () => true); // drops unparseable lines
          fixed.push(`rewrote ${path.basename(file)} dropping ${c.malformed} malformed line(s) (backup: ${path.basename(file)}.bak)`);
        }
      }
    }
    findings.push({
      check: "malformed-jsonl",
      level: bad.length ? "warn" : "ok",
      detail: bad.length ? `${bad.length} file(s) contain unparseable lines` : "no malformed JSONL lines",
      fixable: true,
      items: bad.length ? bad : undefined,
    });
  }

  // 6. Stale agents (registered, no live transport, heartbeat past EVICT_MS). Report only.
  {
    // Compute liveness WITHOUT deleting dead markers — loadLiveTransports
    // prunes as a side effect, which would make this read-only check mutate
    // state (and pre-empt the orphan-marker fix in check 1).
    const live = new Set<string>();
    for (const fname of await listTransportFiles()) {
      const marker = await readJson<TransportMarker | null>(path.join(TRANSPORT_DIR, fname), null);
      if (marker && isMarkerLive(marker, reg, now)) live.add(marker.agentId);
    }
    const stale: string[] = [];
    for (const [id, a] of Object.entries(reg)) {
      if (live.has(id)) continue;
      // Age is reported in whole hours (3600000 ms per hour).
      if (now - a.lastHeartbeat > EVICT_MS) stale.push(`${id} (${Math.floor((now - a.lastHeartbeat) / 3600000)}h)`);
    }
    findings.push({
      check: "stale-agents",
      level: stale.length ? "warn" : "ok",
      detail: stale.length ? `${stale.length} agent(s) past the eviction window — next list_agents will drop them` : "no stale agents",
      fixable: false,
      items: stale.length ? stale : undefined,
    });
  }

  // 7. Oversized JSONL files. Report only (suggest prune).
  {
    const big: string[] = [];
    const candidates = [
      ROOM_FILE,
      STATUS_FILE,
      ...channels.filter((c) => c !== DEFAULT_ROOM).map((c) => roomFile(c)),
      ...(await listInboxFiles()).map((f) => path.join(INBOX_DIR, f)),
    ];
    for (const file of candidates) {
      const sz = await fileSize(file);
      if (sz > maxBytes) big.push(`${path.basename(file)} (${(sz / 1024 / 1024).toFixed(1)}MB)`);
    }
    findings.push({
      check: "oversized-files",
      level: big.length ? "warn" : "ok",
      detail: big.length ? `${big.length} file(s) over ${(maxBytes / 1024 / 1024).toFixed(0)}MB — consider prune` : "no oversized files",
      fixable: false,
      items: big.length ? big : undefined,
    });
  }

  // 8. Stale lock dirs from crashed writers.
  {
    const locks = await scanStaleLocks(60_000, now);
    for (const l of locks) {
      if (fix) {
        try {
          await fsp.rm(l.path, { recursive: true, force: true });
          fixed.push(`removed stale lock ${path.basename(l.path)}`);
        } catch {
          // Best effort: a lock that cannot be removed is still listed in the
          // finding below, just not in `fixed`.
        }
      }
    }
    findings.push({
      check: "stale-locks",
      level: locks.length ? "warn" : "ok",
      detail: locks.length ? `${locks.length} lock dir(s) older than 60s — likely from a crashed writer` : "no stale locks",
      fixable: true,
      items: locks.length ? locks.map((l) => `${path.basename(l.path)} (${Math.floor(l.ageMs / 1000)}s)`) : undefined,
    });
  }

  // 9. Channel/registry consistency: rooms/<chan>.jsonl files without a registry entry.
  {
    const orphanFiles: string[] = [];
    if (existsSync(ROOMS_DIR)) {
      let names: string[] = [];
      try {
        names = await fsp.readdir(ROOMS_DIR);
      } catch {
        // Unreadable directory: treated as empty.
      }
      for (const name of names) {
        if (!name.endsWith(".jsonl")) continue;
        const chan = name.replace(/\.jsonl$/, "");
        if (!rooms[chan]) {
          orphanFiles.push(name);
          if (fix) {
            await ensureRoom(chan, "doctor");
            fixed.push(`registered channel '${chan}' (had a JSONL file but no registry entry)`);
          }
        }
      }
    }
    findings.push({
      check: "channel-registry-consistency",
      level: orphanFiles.length ? "warn" : "ok",
      detail: orphanFiles.length ? `${orphanFiles.length} channel file(s) with no registry entry` : "channel files and registry agree",
      fixable: true,
      items: orphanFiles.length ? orphanFiles : undefined,
    });
  }

  // 10. Environment sanity. Report only.
  {
    // Exit status 0 from `tmux -V` is taken as "tmux is usable".
    const tmuxProbe = spawnSync("tmux", ["-V"]);
    const tmuxOk = tmuxProbe.status === 0;
    findings.push({
      check: "environment",
      level: tmuxOk ? "ok" : "warn",
      detail: tmuxOk
        ? `root=${ROOT}; node=${process.execPath}; tmux=${(tmuxProbe.stdout ?? "").toString().trim() || "present"}`
        : `root=${ROOT}; node=${process.execPath}; tmux NOT on PATH — the tmux-push transport will not work`,
      fixable: false,
      items: [`root=${ROOT}`, `execPath=${process.execPath}`, `inTmux=${!!process.env.TMUX_PANE}`],
    });
  }

  const summary = {
    ok: findings.filter((f) => f.level === "ok").length,
    warn: findings.filter((f) => f.level === "warn").length,
    error: findings.filter((f) => f.level === "error").length,
  };
  return {
    ok: true,
    // Levels are recorded as detected, so a run that just repaired a problem
    // still reports it here.
    healthy: summary.warn === 0 && summary.error === 0,
    fixApplied: fix,
    root: ROOT,
    findings,
    fixed: fix ? fixed : undefined,
    summary,
  };
}
|
|
1617
|
+
|
|
1124
1618
|
// ---------- helpers ----------
|
|
1125
1619
|
|
|
1126
1620
|
async function moveFile(from: string, to: string): Promise<boolean> {
|