agent-coord-mcp 0.5.3 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/tools.ts CHANGED
@@ -10,6 +10,9 @@ import {
10
10
  CURSOR_DIR,
11
11
  DEFAULT_ROOM,
12
12
  INBOX_DIR,
13
+ ROOT,
14
+ ROOM_FILE,
15
+ ROOMS_DIR,
13
16
  ROOMS_FILE,
14
17
  STATUS_FILE,
15
18
  addMember,
@@ -32,6 +35,7 @@ import {
32
35
  removeMember,
33
36
  rewriteJsonl,
34
37
  roomFile,
38
+ rotateAgentToken,
35
39
  setRoomMeta,
36
40
  transportFile,
37
41
  TRANSPORT_DIR,
@@ -54,6 +58,9 @@ type TransportMarker = {
54
58
  pid: number;
55
59
  tmuxTarget?: string;
56
60
  since: number;
61
+ // Remote pushers run on a different machine; the local pid is meaningless,
62
+ // so we tag the host and use heartbeat-based liveness instead of pidAlive.
63
+ host?: string;
57
64
  };
58
65
 
59
66
  type AgentRegistry = Record<string, AgentEntry>;
@@ -241,14 +248,12 @@ export async function listAgentsTool() {
241
248
 
242
249
  async function loadLiveTransports(): Promise<Map<string, TransportMarker>> {
243
250
  const out = new Map<string, TransportMarker>();
251
+ const reg = await readJson<AgentRegistry>(AGENTS_FILE, {});
252
+ const now = Date.now();
244
253
  for (const fname of await listTransportFiles()) {
245
254
  const file = path.join(TRANSPORT_DIR, fname);
246
255
  const marker = await readJson<TransportMarker | null>(file, null);
247
- if (!marker) {
248
- await deleteFile(file);
249
- continue;
250
- }
251
- if (!isPidAlive(marker.pid)) {
256
+ if (!marker || !isMarkerLive(marker, reg, now)) {
252
257
  await deleteFile(file);
253
258
  continue;
254
259
  }
@@ -257,6 +262,17 @@ async function loadLiveTransports(): Promise<Map<string, TransportMarker>> {
257
262
  return out;
258
263
  }
259
264
 
265
// Decide whether a transport marker still represents a live attachment.
//
// - Markers whose transport is "tmux-push-remote" belong to a pusher on another
//   machine, so their pid cannot be probed locally. They count as live only
//   while the agent has a registry entry whose lastHeartbeat is less than
//   STALE_MS old (relative to `now`).
// - Every other marker is judged by probing its pid with isPidAlive.
function isMarkerLive(marker: TransportMarker, reg: AgentRegistry, now: number): boolean {
  const isRemote = marker.transport === "tmux-push-remote";
  if (!isRemote) return isPidAlive(marker.pid);

  const entry = reg[marker.agentId];
  if (!entry) return false; // unknown agent: nothing is refreshing a heartbeat
  return now - entry.lastHeartbeat < STALE_MS;
}
275
+
260
276
  function isPidAlive(pid: number): boolean {
261
277
  if (!pid || pid <= 0) return false;
262
278
  try {
@@ -1038,6 +1054,11 @@ export async function renameAgentTool(args: { agentId: string; newAgentId: strin
1038
1054
  await moveFile(cursorFile(oldId), cursorFile(newId));
1039
1055
  await moveFile(transportFile(oldId), transportFile(newId));
1040
1056
 
1057
+ // Identity-binding token rotation: if tokens.json exists and had the old
1058
+ // id, move its token to the new id atomically. Lets the same bearer keep
1059
+ // authenticating after rename — no-op if binding isn't configured.
1060
+ await rotateAgentToken(oldId, newId);
1061
+
1041
1062
  // Broadcast a NICK notice to every channel the agent was in.
1042
1063
  for (const chan of joined) {
1043
1064
  await appendJsonl(roomFile(chan), sysMsg(newId, chan, `is now known as ${newId} (was ${oldId})`));
@@ -1055,6 +1076,424 @@ export async function renameAgentTool(args: { agentId: string; newAgentId: strin
1055
1076
  };
1056
1077
  }
1057
1078
 
1079
+ // ---------- transport markers (for remote pushers) ----------
1080
+
1081
// Input shape for report_transport: the agent, the transport name, and the
// optional tmux target / host / attach timestamp.
export const reportTransportSchema = {
  agentId: z.string().min(1),
  transport: z.string().min(1),
  tmuxTarget: z.string().optional(),
  host: z.string().optional(),
  since: z.number().optional(),
};

// Wire-callable way for an external push daemon (typically
// scripts/coord-pusher.mjs on a remote machine) to publish a transport marker
// so list_agents reflects the attachment. The local tmux-push path writes its
// marker directly inside attach_agent instead.
//
// The marker is always written with pid 0, because a remote pid means nothing
// on this host. Note that isMarkerLive only applies heartbeat-based liveness
// to the transport "tmux-push-remote"; any other transport value is judged by
// isPidAlive, which rejects pid 0.
//
// `since` defaults to the current time when omitted.
// Returns `{ ok: true, marker }` with the marker that was written.
export async function reportTransportTool(args: {
  agentId: string;
  transport: string;
  tmuxTarget?: string;
  host?: string;
  since?: number;
}) {
  const { agentId, transport, tmuxTarget, host, since } = args;
  const marker: TransportMarker = {
    agentId,
    transport,
    pid: 0, // not meaningful for remote; liveness comes from heartbeat
    tmuxTarget,
    host,
    since: since ?? Date.now(),
  };
  // Unconditionally replace whatever marker was there before.
  const replaceWithMarker = () => marker;
  await updateJson<TransportMarker>(transportFile(agentId), marker, replaceWithMarker);
  return { ok: true, marker };
}
1111
+
1112
// Input shape for clear_transport: just the agent whose marker should go away.
export const clearTransportSchema = {
  agentId: z.string().min(1),
};

// Remote counterpart to detach_agent: deletes the agent's transport marker and
// nothing else. Intended for a remote pusher's graceful shutdown so
// list_agents stops showing it attached. No process is killed — there is
// nothing local to kill.
// Returns `{ ok: true, removed }`, where `removed` is whatever deleteFile
// reported for the marker file.
export async function clearTransportTool(args: { agentId: string }) {
  const { agentId } = args;
  const markerPath = transportFile(agentId);
  return { ok: true, removed: await deleteFile(markerPath) };
}
1123
+
1124
+ // ---------- doctor (bus-wide health check) ----------
1125
+
1126
// Severity of a single doctor finding.
type DoctorLevel = "ok" | "warn" | "error";
// One row of the doctor report; doctorTool pushes exactly one per check.
type DoctorFinding = {
  // Stable identifier of the check, e.g. "orphan-transport-markers".
  check: string;
  // Outcome of the check.
  level: DoctorLevel;
  // Human-readable one-line summary of what was found.
  detail: string;
  // Whether running doctor with fix=true repairs this kind of problem.
  fixable: boolean;
  // The offending entries (file names, ids, ...); omitted when there are none.
  items?: string[];
};
1134
+
1135
// Count non-empty lines vs successfully-parsed entries in a JSONL file.
// Offsets index the PARSED entries (see readJsonl), so `parsed` is the figure
// cursor math is compared against; `malformed` is the desync risk.
//
// Returns `{ lines, parsed, malformed }` where `lines` counts lines that are
// not blank after trimming, `parsed` counts those that JSON.parse accepts, and
// `malformed` is the difference. A file that does not exist yields all zeros.
// Any other read failure (e.g. the path is a directory) is rethrown.
async function scanJsonl(file: string): Promise<{ lines: number; parsed: number; malformed: number }> {
  let raw: string;
  try {
    raw = await fsp.readFile(file, "utf8");
  } catch (err: unknown) {
    // Treat "not there" as an empty file. Handling this on the read itself,
    // rather than with an existence pre-check, also covers a file that is
    // deleted between the check and the read, which would otherwise throw.
    const code = typeof err === "object" && err !== null ? (err as { code?: unknown }).code : undefined;
    if (code === "ENOENT" || code === "ENOTDIR") return { lines: 0, parsed: 0, malformed: 0 };
    throw err;
  }

  let lines = 0;
  let parsed = 0;
  for (const line of raw.split("\n")) {
    if (!line.trim()) continue; // blank lines are not entries
    lines++;
    try {
      JSON.parse(line);
      parsed++;
    } catch {
      // malformed: counted in `lines` but not in `parsed`
    }
  }
  return { lines, parsed, malformed: lines - parsed };
}
1155
+
1156
// Collect leftover proper-lockfile lock dirs (`<file>.lock`) from the state
// directories (ROOT, INBOX_DIR, CURSOR_DIR, ROOMS_DIR, TRANSPORT_DIR). Only
// the top level of each directory is inspected.
//
// A lock is reported when its mtime is more than `olderThanMs` before `now`;
// anything that old is almost certainly orphaned by a crashed writer
// (withLock's stale window is 5s). Directories that are missing or unreadable,
// and entries that disappear while scanning, are skipped silently.
// Returns each stale lock's full path together with its age in milliseconds.
async function scanStaleLocks(olderThanMs: number, now: number): Promise<{ path: string; ageMs: number }[]> {
  const stale: { path: string; ageMs: number }[] = [];

  for (const dir of [ROOT, INBOX_DIR, CURSOR_DIR, ROOMS_DIR, TRANSPORT_DIR]) {
    if (!existsSync(dir)) continue;

    let entries: string[] = [];
    try {
      entries = await fsp.readdir(dir);
    } catch {
      continue;
    }

    const lockNames = entries.filter((entry) => entry.endsWith(".lock"));
    for (const lockName of lockNames) {
      const lockPath = path.join(dir, lockName);
      let mtimeMs: number;
      try {
        mtimeMs = (await fsp.stat(lockPath)).mtimeMs;
      } catch {
        continue; // vanished mid-scan
      }
      const ageMs = now - mtimeMs;
      if (ageMs > olderThanMs) stale.push({ path: lockPath, ageMs });
    }
  }

  return stale;
}
1184
+
1185
// Input shape for doctor: `fix` opts into repairs, `maxFileBytes` overrides
// the oversized-file threshold.
export const doctorSchema = {
  fix: z.boolean().optional(),
  maxFileBytes: z.number().int().positive().optional(),
};

// Bus-wide health check. Runs ten checks over the on-disk state, in order, and
// records exactly one finding per check. With `fix` set, the fixable checks
// also repair what they find; later checks re-list files, so they observe the
// state left behind by earlier fixes.
//
// args.fix          - apply repairs (default false: report only).
// args.maxFileBytes - size above which a JSONL file is reported as oversized
//                     (default 5 MiB).
//
// Returns `{ ok, healthy, fixApplied, root, findings, fixed, summary }`:
// `healthy` is true only when no finding is "warn" or "error"; `fixed` lists
// the repairs performed and is undefined when `fix` is off; `summary` counts
// findings per level.
export async function doctorTool(args: { fix?: boolean; maxFileBytes?: number }) {
  const fix = args.fix ?? false;
  const maxBytes = args.maxFileBytes ?? 5 * 1024 * 1024;
  const now = Date.now();
  const findings: DoctorFinding[] = [];
  const fixed: string[] = [];

  // Snapshot of the registry and channel list; every check below is judged
  // against this snapshot rather than re-reading the registry.
  // NOTE(review): `known` is derived from readJson(AGENTS_FILE, {}). If that
  // fallback is also what an unreadable registry yields, fix mode would treat
  // every membership, inbox and cursor as orphaned — confirm readJson's
  // failure behavior.
  const reg = await readJson<AgentRegistry>(AGENTS_FILE, {});
  const known = new Set(Object.keys(reg));
  const rooms = await getRooms();
  const channels = Object.keys(rooms);

  // 1. Orphan transport markers (dead local pid, or stale remote heartbeat).
  {
    const dead: string[] = [];
    for (const fname of await listTransportFiles()) {
      const file = path.join(TRANSPORT_DIR, fname);
      const marker = await readJson<TransportMarker | null>(file, null);
      // An unreadable marker (null) is treated the same as a dead one.
      if (!marker || !isMarkerLive(marker, reg, now)) {
        dead.push(file);
        if (fix) {
          await deleteFile(file);
          fixed.push(`deleted stale transport marker ${fname}`);
        }
      }
    }
    findings.push({
      check: "orphan-transport-markers",
      level: dead.length ? "warn" : "ok",
      detail: dead.length ? `${dead.length} stale transport marker(s) (dead pid or expired remote heartbeat)` : "no stale transport markers",
      fixable: true,
      items: dead.length ? dead.map((f) => path.basename(f)) : undefined,
    });
  }

  // 2. Orphan room memberships (member not in the registry).
  {
    const orphans = new Set<string>();
    for (const e of Object.values(rooms)) {
      for (const m of e.members ?? []) if (!known.has(m)) orphans.add(m);
    }
    if (fix && orphans.size) {
      // Re-filter inside updateJson so the rewrite applies to the current
      // file contents, not to the `rooms` snapshot taken above.
      await updateJson<RoomRegistry>(ROOMS_FILE, {}, (cur) => {
        for (const e of Object.values(cur)) {
          if (e.members?.length) e.members = e.members.filter((m) => known.has(m));
        }
        return cur;
      });
      fixed.push(`dropped ${orphans.size} orphan membership(s): ${[...orphans].join(", ")}`);
    }
    findings.push({
      check: "orphan-room-memberships",
      level: orphans.size ? "warn" : "ok",
      detail: orphans.size ? `${orphans.size} channel member(s) not in the registry` : "all channel members are registered",
      fixable: true,
      items: orphans.size ? [...orphans] : undefined,
    });
  }

  // 3. Orphan inbox / cursor files (owner not registered).
  {
    const orphanInbox: string[] = [];
    for (const fname of await listInboxFiles()) {
      const id = fname.replace(/\.jsonl$/, "");
      if (!known.has(id)) {
        orphanInbox.push(id);
        if (fix) {
          await deleteFile(path.join(INBOX_DIR, fname));
          fixed.push(`deleted orphan inbox ${fname}`);
        }
      }
    }
    const orphanCursor: string[] = [];
    for (const fname of await listCursorFiles()) {
      const id = fname.replace(/\.json$/, "");
      if (!known.has(id)) {
        orphanCursor.push(id);
        if (fix) {
          await deleteFile(path.join(CURSOR_DIR, fname));
          fixed.push(`deleted orphan cursor ${fname}`);
        }
      }
    }
    const total = orphanInbox.length + orphanCursor.length;
    findings.push({
      check: "orphan-inboxes-cursors",
      level: total ? "warn" : "ok",
      detail: total
        ? `${orphanInbox.length} inbox + ${orphanCursor.length} cursor file(s) for unregistered ids`
        : "no orphan inbox/cursor files",
      fixable: true,
      // An id with both an orphan inbox and an orphan cursor is listed once.
      items: total ? [...new Set([...orphanInbox, ...orphanCursor])] : undefined,
    });
  }

  // Precompute parsed line counts for cursor + malformed checks.
  // Each file is scanned at most once per doctor run; later lookups reuse the
  // cached result.
  const counts = new Map<string, { lines: number; parsed: number; malformed: number }>();
  const countFor = async (file: string) => {
    if (!counts.has(file)) counts.set(file, await scanJsonl(file));
    return counts.get(file)!;
  };

  // 4. Cursor offsets past end-of-file (would return [] forever).
  {
    const broken: string[] = [];
    for (const fname of await listCursorFiles()) {
      const id = fname.replace(/\.json$/, "");
      const cursorPath = path.join(CURSOR_DIR, fname);
      const cursor = await readJson<Cursor>(cursorPath, {});
      const overflow: string[] = [];
      const inboxMax = (await countFor(inboxFile(id))).parsed;
      if ((cursor.inboxOffset ?? 0) > inboxMax) overflow.push(`inboxOffset ${cursor.inboxOffset}>${inboxMax}`);
      const roomMax = (await countFor(ROOM_FILE)).parsed;
      if ((cursor.roomOffset ?? 0) > roomMax) overflow.push(`roomOffset ${cursor.roomOffset}>${roomMax}`);
      const statusMax = (await countFor(STATUS_FILE)).parsed;
      if ((cursor.statusOffset ?? 0) > statusMax) overflow.push(`statusOffset ${cursor.statusOffset}>${statusMax}`);
      for (const [chan, off] of Object.entries(cursor.roomOffsets ?? {})) {
        const max = (await countFor(roomFile(chan))).parsed;
        if (off > max) overflow.push(`roomOffsets[${chan}] ${off}>${max}`);
      }
      if (overflow.length) {
        broken.push(`${id}: ${overflow.join(", ")}`);
        if (fix) {
          await updateJson<Cursor>(cursorPath, {}, (c) => {
            if ((c.inboxOffset ?? 0) > inboxMax) c.inboxOffset = inboxMax;
            if ((c.roomOffset ?? 0) > roomMax) c.roomOffset = roomMax;
            if ((c.statusOffset ?? 0) > statusMax) c.statusOffset = statusMax;
            if (c.roomOffsets) {
              for (const chan of Object.keys(c.roomOffsets)) {
                // `c` is re-read by updateJson and may name a channel that was
                // not in the snapshot scanned above. Without a measured count
                // we must not clamp: falling back to 0 would reset a valid
                // offset and replay the channel's whole history.
                const measured = counts.get(roomFile(chan));
                if (!measured) continue;
                if (c.roomOffsets[chan] > measured.parsed) c.roomOffsets[chan] = measured.parsed;
              }
            }
            return c;
          });
          fixed.push(`clamped cursor offsets for ${id}`);
        }
      }
    }
    findings.push({
      check: "cursor-past-eof",
      level: broken.length ? "error" : "ok",
      detail: broken.length ? `${broken.length} cursor(s) with an offset past EOF — delivery stalled` : "all cursor offsets are within bounds",
      fixable: true,
      items: broken.length ? broken : undefined,
    });
  }

  // 5. Malformed JSONL lines (silently desync offset math between server + hooks).
  {
    const jsonlFiles = [
      ROOM_FILE,
      STATUS_FILE,
      ...channels.filter((c) => c !== DEFAULT_ROOM).map((c) => roomFile(c)),
      ...(await listInboxFiles()).map((f) => path.join(INBOX_DIR, f)),
    ];
    const bad: string[] = [];
    for (const file of jsonlFiles) {
      const c = await countFor(file);
      if (c.malformed > 0) {
        bad.push(`${path.basename(file)} (${c.malformed})`);
        if (fix) {
          // Keep a copy of the original next to it before rewriting; an
          // existing .bak from an earlier run is overwritten.
          await fsp.copyFile(file, file + ".bak");
          await rewriteJsonl(file, () => true); // drops unparseable lines
          fixed.push(`rewrote ${path.basename(file)} dropping ${c.malformed} malformed line(s) (backup: ${path.basename(file)}.bak)`);
        }
      }
    }
    findings.push({
      check: "malformed-jsonl",
      level: bad.length ? "warn" : "ok",
      detail: bad.length ? `${bad.length} file(s) contain unparseable lines` : "no malformed JSONL lines",
      fixable: true,
      items: bad.length ? bad : undefined,
    });
  }

  // 6. Stale agents (registered, no live transport, heartbeat past EVICT_MS). Report only.
  {
    // Compute liveness WITHOUT deleting dead markers — loadLiveTransports
    // prunes as a side effect, which would make this read-only check mutate
    // state (and pre-empt the orphan-marker fix in check 1).
    const live = new Set<string>();
    for (const fname of await listTransportFiles()) {
      const marker = await readJson<TransportMarker | null>(path.join(TRANSPORT_DIR, fname), null);
      if (marker && isMarkerLive(marker, reg, now)) live.add(marker.agentId);
    }
    const stale: string[] = [];
    for (const [id, a] of Object.entries(reg)) {
      if (live.has(id)) continue;
      // Age is shown in whole hours (3600000 ms per hour).
      if (now - a.lastHeartbeat > EVICT_MS) stale.push(`${id} (${Math.floor((now - a.lastHeartbeat) / 3600000)}h)`);
    }
    findings.push({
      check: "stale-agents",
      level: stale.length ? "warn" : "ok",
      detail: stale.length ? `${stale.length} agent(s) past the eviction window — next list_agents will drop them` : "no stale agents",
      fixable: false,
      items: stale.length ? stale : undefined,
    });
  }

  // 7. Oversized JSONL files. Report only (suggest prune).
  {
    const big: string[] = [];
    const candidates = [
      ROOM_FILE,
      STATUS_FILE,
      ...channels.filter((c) => c !== DEFAULT_ROOM).map((c) => roomFile(c)),
      ...(await listInboxFiles()).map((f) => path.join(INBOX_DIR, f)),
    ];
    for (const file of candidates) {
      const sz = await fileSize(file);
      if (sz > maxBytes) big.push(`${path.basename(file)} (${(sz / 1024 / 1024).toFixed(1)}MB)`);
    }
    findings.push({
      check: "oversized-files",
      level: big.length ? "warn" : "ok",
      detail: big.length ? `${big.length} file(s) over ${(maxBytes / 1024 / 1024).toFixed(0)}MB — consider prune` : "no oversized files",
      fixable: false,
      items: big.length ? big : undefined,
    });
  }

  // 8. Stale lock dirs from crashed writers.
  {
    const locks = await scanStaleLocks(60_000, now);
    for (const l of locks) {
      if (fix) {
        try {
          await fsp.rm(l.path, { recursive: true, force: true });
          fixed.push(`removed stale lock ${path.basename(l.path)}`);
        } catch {
          // ignore: best-effort removal; the lock is still listed in the finding
        }
      }
    }
    findings.push({
      check: "stale-locks",
      level: locks.length ? "warn" : "ok",
      detail: locks.length ? `${locks.length} lock dir(s) older than 60s — likely from a crashed writer` : "no stale locks",
      fixable: true,
      items: locks.length ? locks.map((l) => `${path.basename(l.path)} (${Math.floor(l.ageMs / 1000)}s)`) : undefined,
    });
  }

  // 9. Channel/registry consistency: rooms/<chan>.jsonl files without a registry entry.
  {
    const orphanFiles: string[] = [];
    if (existsSync(ROOMS_DIR)) {
      let names: string[] = [];
      try {
        names = await fsp.readdir(ROOMS_DIR);
      } catch {
        // ignore: an unreadable rooms dir is treated as containing no files
      }
      for (const name of names) {
        if (!name.endsWith(".jsonl")) continue;
        const chan = name.replace(/\.jsonl$/, "");
        if (!rooms[chan]) {
          orphanFiles.push(name);
          if (fix) {
            // The fix registers the channel; the JSONL file is left in place.
            await ensureRoom(chan, "doctor");
            fixed.push(`registered channel '${chan}' (had a JSONL file but no registry entry)`);
          }
        }
      }
    }
    findings.push({
      check: "channel-registry-consistency",
      level: orphanFiles.length ? "warn" : "ok",
      detail: orphanFiles.length ? `${orphanFiles.length} channel file(s) with no registry entry` : "channel files and registry agree",
      fixable: true,
      items: orphanFiles.length ? orphanFiles : undefined,
    });
  }

  // 10. Environment sanity. Report only.
  {
    // `tmux -V` exiting with status 0 is the signal that tmux is usable; any
    // other outcome is reported as "not on PATH".
    const tmuxProbe = spawnSync("tmux", ["-V"]);
    const tmuxOk = tmuxProbe.status === 0;
    findings.push({
      check: "environment",
      level: tmuxOk ? "ok" : "warn",
      detail: tmuxOk
        ? `root=${ROOT}; node=${process.execPath}; tmux=${(tmuxProbe.stdout ?? "").toString().trim() || "present"}`
        : `root=${ROOT}; node=${process.execPath}; tmux NOT on PATH — the tmux-push transport will not work`,
      fixable: false,
      items: [`root=${ROOT}`, `execPath=${process.execPath}`, `inTmux=${!!process.env.TMUX_PANE}`],
    });
  }

  const summary = {
    ok: findings.filter((f) => f.level === "ok").length,
    warn: findings.filter((f) => f.level === "warn").length,
    error: findings.filter((f) => f.level === "error").length,
  };
  return {
    ok: true,
    healthy: summary.warn === 0 && summary.error === 0,
    fixApplied: fix,
    root: ROOT,
    findings,
    fixed: fix ? fixed : undefined,
    summary,
  };
}
1496
+
1058
1497
  // ---------- helpers ----------
1059
1498
 
1060
1499
  async function moveFile(from: string, to: string): Promise<boolean> {