agentflow-core 0.6.0 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -206,9 +206,289 @@ function getTraceTree(trace) {
206
206
  return result;
207
207
  }
208
208
 
209
+ // src/process-audit.ts
210
+ import { execSync } from "child_process";
211
+ import { existsSync, readdirSync, readFileSync, statSync } from "fs";
212
+ import { basename, join } from "path";
213
+ function isPidAlive(pid) {
214
+ try {
215
+ process.kill(pid, 0);
216
+ return true;
217
+ } catch {
218
+ return false;
219
+ }
220
+ }
221
+ function pidMatchesName(pid, name) {
222
+ try {
223
+ const cmdline = readFileSync(`/proc/${pid}/cmdline`, "utf8");
224
+ return cmdline.includes(name);
225
+ } catch {
226
+ return false;
227
+ }
228
+ }
229
+ function readPidFile(path) {
230
+ try {
231
+ const pid = parseInt(readFileSync(path, "utf8").trim(), 10);
232
+ return isNaN(pid) ? null : pid;
233
+ } catch {
234
+ return null;
235
+ }
236
+ }
237
+ function auditPidFile(config) {
238
+ if (!config.pidFile) return null;
239
+ const pid = readPidFile(config.pidFile);
240
+ if (pid === null) {
241
+ return {
242
+ path: config.pidFile,
243
+ pid: null,
244
+ alive: false,
245
+ matchesProcess: false,
246
+ stale: !existsSync(config.pidFile),
247
+ reason: existsSync(config.pidFile) ? "PID file exists but content is invalid" : "No PID file found"
248
+ };
249
+ }
250
+ const alive = isPidAlive(pid);
251
+ const matchesProcess = alive ? pidMatchesName(pid, config.processName) : false;
252
+ const stale = !alive || alive && !matchesProcess;
253
+ let reason;
254
+ if (alive && matchesProcess) {
255
+ reason = `PID ${pid} alive and matches ${config.processName}`;
256
+ } else if (alive && !matchesProcess) {
257
+ reason = `PID ${pid} alive but is NOT ${config.processName} (PID reused by another process)`;
258
+ } else {
259
+ reason = `PID ${pid} no longer exists`;
260
+ }
261
+ return { path: config.pidFile, pid, alive, matchesProcess, stale, reason };
262
+ }
263
+ function auditSystemd(config) {
264
+ if (config.systemdUnit === null || config.systemdUnit === void 0) return null;
265
+ const unit = config.systemdUnit;
266
+ try {
267
+ const raw = execSync(
268
+ `systemctl --user show ${unit} --property=ActiveState,SubState,MainPID,NRestarts,Result --no-pager 2>/dev/null`,
269
+ { encoding: "utf8", timeout: 5e3 }
270
+ );
271
+ const props = {};
272
+ for (const line of raw.trim().split("\n")) {
273
+ const [k, ...v] = line.split("=");
274
+ if (k) props[k.trim()] = v.join("=").trim();
275
+ }
276
+ const activeState = props["ActiveState"] ?? "unknown";
277
+ const subState = props["SubState"] ?? "unknown";
278
+ const mainPid = parseInt(props["MainPID"] ?? "0", 10);
279
+ const restarts = parseInt(props["NRestarts"] ?? "0", 10);
280
+ const result = props["Result"] ?? "unknown";
281
+ return {
282
+ unit,
283
+ activeState,
284
+ subState,
285
+ mainPid,
286
+ restarts,
287
+ result,
288
+ crashLooping: activeState === "activating" && subState === "auto-restart",
289
+ failed: activeState === "failed"
290
+ };
291
+ } catch {
292
+ return null;
293
+ }
294
+ }
295
+ function auditWorkers(config) {
296
+ if (!config.workersFile || !existsSync(config.workersFile)) return null;
297
+ try {
298
+ const data = JSON.parse(readFileSync(config.workersFile, "utf8"));
299
+ const orchPid = data.pid ?? null;
300
+ const orchAlive = orchPid ? isPidAlive(orchPid) : false;
301
+ const workers = [];
302
+ for (const [name, info] of Object.entries(data.tools ?? {})) {
303
+ const w = info;
304
+ const wPid = w.pid ?? null;
305
+ const wAlive = wPid ? isPidAlive(wPid) : false;
306
+ workers.push({
307
+ name,
308
+ pid: wPid,
309
+ declaredStatus: w.status ?? "unknown",
310
+ alive: wAlive,
311
+ stale: w.status === "running" && !wAlive
312
+ });
313
+ }
314
+ return {
315
+ orchestratorPid: orchPid,
316
+ orchestratorAlive: orchAlive,
317
+ startedAt: data.started_at ?? "",
318
+ workers
319
+ };
320
+ } catch {
321
+ return null;
322
+ }
323
+ }
324
+ function readCmdline(pid) {
325
+ try {
326
+ return readFileSync(`/proc/${pid}/cmdline`, "utf8").replace(/\0/g, " ").trim();
327
+ } catch {
328
+ return "";
329
+ }
330
+ }
331
+ function getOsProcesses(processName) {
332
+ try {
333
+ const raw = execSync(
334
+ `ps -eo pid,pcpu,pmem,etime,lstart,args --no-headers`,
335
+ { encoding: "utf8", timeout: 5e3 }
336
+ );
337
+ const results = [];
338
+ for (const line of raw.split("\n")) {
339
+ if (!line.includes(processName)) continue;
340
+ if (line.includes("process-audit") || line.includes(" grep ")) continue;
341
+ const trimmed = line.trim();
342
+ const parts = trimmed.split(/\s+/);
343
+ const pid = parseInt(parts[0] ?? "0", 10);
344
+ if (isNaN(pid) || pid <= 0) continue;
345
+ const cpu = parts[1] ?? "0";
346
+ const mem = parts[2] ?? "0";
347
+ const elapsed = parts[3] ?? "";
348
+ const started = parts.slice(4, 9).join(" ");
349
+ const command = parts.slice(9).join(" ");
350
+ const cmdline = readCmdline(pid);
351
+ results.push({ pid, cpu, mem, elapsed, started, command, cmdline });
352
+ }
353
+ return results;
354
+ } catch {
355
+ return [];
356
+ }
357
+ }
358
+ function discoverProcessConfig(dirs) {
359
+ let pidFile;
360
+ let workersFile;
361
+ let processName = "";
362
+ for (const dir of dirs) {
363
+ if (!existsSync(dir)) continue;
364
+ let entries;
365
+ try {
366
+ entries = readdirSync(dir);
367
+ } catch {
368
+ continue;
369
+ }
370
+ for (const f of entries) {
371
+ const fp = join(dir, f);
372
+ try {
373
+ if (!statSync(fp).isFile()) continue;
374
+ } catch {
375
+ continue;
376
+ }
377
+ if (f.endsWith(".pid") && !pidFile) {
378
+ pidFile = fp;
379
+ if (!processName) {
380
+ processName = basename(f, ".pid");
381
+ }
382
+ }
383
+ if ((f === "workers.json" || f.endsWith("-workers.json")) && !workersFile) {
384
+ workersFile = fp;
385
+ if (!processName && f !== "workers.json") {
386
+ processName = basename(f, "-workers.json");
387
+ }
388
+ }
389
+ }
390
+ }
391
+ if (!processName && !pidFile && !workersFile) return null;
392
+ if (!processName) processName = "agent";
393
+ return { processName, pidFile, workersFile };
394
+ }
395
+ function auditProcesses(config) {
396
+ const pidFile = auditPidFile(config);
397
+ const systemd = auditSystemd(config);
398
+ const workers = auditWorkers(config);
399
+ const osProcesses = getOsProcesses(config.processName);
400
+ const knownPids = /* @__PURE__ */ new Set();
401
+ if (pidFile?.pid && !pidFile.stale) knownPids.add(pidFile.pid);
402
+ if (workers) {
403
+ if (workers.orchestratorPid) knownPids.add(workers.orchestratorPid);
404
+ for (const w of workers.workers) {
405
+ if (w.pid) knownPids.add(w.pid);
406
+ }
407
+ }
408
+ if (systemd?.mainPid) knownPids.add(systemd.mainPid);
409
+ const selfPid = process.pid;
410
+ const selfPpid = process.ppid;
411
+ const orphans = osProcesses.filter(
412
+ (p) => !knownPids.has(p.pid) && p.pid !== selfPid && p.pid !== selfPpid
413
+ );
414
+ const problems = [];
415
+ if (pidFile?.stale) problems.push(`Stale PID file: ${pidFile.reason}`);
416
+ if (systemd?.crashLooping) problems.push("Systemd unit is crash-looping (auto-restart)");
417
+ if (systemd?.failed) problems.push("Systemd unit has failed");
418
+ if (systemd && systemd.restarts > 10) problems.push(`High systemd restart count: ${systemd.restarts}`);
419
+ if (pidFile?.pid && systemd?.mainPid && pidFile.pid !== systemd.mainPid) {
420
+ problems.push(`PID mismatch: file says ${pidFile.pid}, systemd says ${systemd.mainPid}`);
421
+ }
422
+ if (workers) {
423
+ for (const w of workers.workers) {
424
+ if (w.stale) problems.push(`Worker "${w.name}" (pid ${w.pid}) declares running but is dead`);
425
+ }
426
+ }
427
+ if (orphans.length > 0) problems.push(`${orphans.length} orphan process(es) not tracked by PID file or workers registry`);
428
+ return { pidFile, systemd, workers, osProcesses, orphans, problems };
429
+ }
430
+ function formatAuditReport(result) {
431
+ const lines = [];
432
+ lines.push("");
433
+ lines.push("\u2554\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2557");
434
+ lines.push("\u2551 \u{1F50D} P R O C E S S A U D I T \u2551");
435
+ lines.push("\u255A\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u255D");
436
+ if (result.pidFile) {
437
+ const pf = result.pidFile;
438
+ const icon = pf.pid && pf.alive && pf.matchesProcess ? "\u2705" : pf.stale ? "\u26A0\uFE0F " : "\u2139\uFE0F ";
439
+ lines.push(`
440
+ PID File: ${pf.path}`);
441
+ lines.push(` ${icon} ${pf.reason}`);
442
+ }
443
+ if (result.systemd) {
444
+ const sd = result.systemd;
445
+ const icon = sd.activeState === "active" ? "\u{1F7E2}" : sd.crashLooping ? "\u{1F7E1}" : sd.failed ? "\u{1F534}" : "\u26AA";
446
+ lines.push(`
447
+ Systemd: ${sd.unit}`);
448
+ lines.push(` ${icon} State: ${sd.activeState} (${sd.subState}) Result: ${sd.result}`);
449
+ lines.push(` Main PID: ${sd.mainPid || "none"} Restarts: ${sd.restarts}`);
450
+ }
451
+ if (result.workers) {
452
+ const w = result.workers;
453
+ lines.push(`
454
+ Workers (orchestrator pid ${w.orchestratorPid ?? "unknown"} ${w.orchestratorAlive ? "\u2705" : "\u274C"})`);
455
+ for (const worker of w.workers) {
456
+ const icon = worker.declaredStatus === "running" && worker.alive ? "\u{1F7E2}" : worker.stale ? "\u{1F534} STALE" : "\u26AA";
457
+ lines.push(` ${icon} ${worker.name.padEnd(14)} pid=${String(worker.pid ?? "-").padEnd(8)} status=${worker.declaredStatus}`);
458
+ }
459
+ }
460
+ if (result.osProcesses.length > 0) {
461
+ lines.push(`
462
+ OS Processes (${result.osProcesses.length} total)`);
463
+ for (const p of result.osProcesses) {
464
+ lines.push(` PID ${String(p.pid).padEnd(8)} CPU=${p.cpu.padEnd(6)} MEM=${p.mem.padEnd(6)} Up=${p.elapsed.padEnd(10)} ${p.command.substring(0, 50)}`);
465
+ }
466
+ }
467
+ if (result.orphans.length > 0) {
468
+ lines.push(`
469
+ \u26A0\uFE0F ${result.orphans.length} ORPHAN PROCESS(ES):`);
470
+ for (const p of result.orphans) {
471
+ lines.push(` PID ${String(p.pid).padEnd(8)} CPU=${p.cpu.padEnd(6)} MEM=${p.mem.padEnd(6)} Up=${p.elapsed}`);
472
+ lines.push(` Started: ${p.started}`);
473
+ lines.push(` Command: ${p.cmdline || p.command}`);
474
+ }
475
+ }
476
+ lines.push("");
477
+ if (result.problems.length === 0) {
478
+ lines.push(" \u2705 All checks passed \u2014 no process issues detected.");
479
+ } else {
480
+ lines.push(` \u26A0\uFE0F ${result.problems.length} issue(s):`);
481
+ for (const p of result.problems) {
482
+ lines.push(` \u2022 ${p}`);
483
+ }
484
+ }
485
+ lines.push("");
486
+ return lines.join("\n");
487
+ }
488
+
209
489
  // src/live.ts
210
- import { existsSync, readdirSync, readFileSync, statSync, watch } from "fs";
211
- import { basename, join, resolve } from "path";
490
+ import { existsSync as existsSync2, readdirSync as readdirSync2, readFileSync as readFileSync2, statSync as statSync2, watch } from "fs";
491
+ import { basename as basename2, join as join2, resolve } from "path";
212
492
  var C = {
213
493
  reset: "\x1B[0m",
214
494
  bold: "\x1B[1m",
@@ -283,7 +563,7 @@ function scanFiles(dirs, recursive) {
283
563
  const seen = /* @__PURE__ */ new Set();
284
564
  function scanDir(d, topLevel) {
285
565
  try {
286
- const dirStat = statSync(d);
566
+ const dirStat = statSync2(d);
287
567
  const dirMtime = dirStat.mtime.getTime();
288
568
  const cachedMtime = dirMtimeCache.get(d);
289
569
  if (cachedMtime === dirMtime) {
@@ -299,13 +579,13 @@ function scanFiles(dirs, recursive) {
299
579
  }
300
580
  }
301
581
  const dirResults = [];
302
- for (const f of readdirSync(d)) {
582
+ for (const f of readdirSync2(d)) {
303
583
  if (f.startsWith(".")) continue;
304
- const fp = join(d, f);
584
+ const fp = join2(d, f);
305
585
  if (seen.has(fp)) continue;
306
586
  let stat;
307
587
  try {
308
- stat = statSync(fp);
588
+ stat = statSync2(fp);
309
589
  } catch {
310
590
  continue;
311
591
  }
@@ -337,13 +617,13 @@ function scanFiles(dirs, recursive) {
337
617
  }
338
618
  function safeReadJson(fp) {
339
619
  try {
340
- return JSON.parse(readFileSync(fp, "utf8"));
620
+ return JSON.parse(readFileSync2(fp, "utf8"));
341
621
  } catch {
342
622
  return null;
343
623
  }
344
624
  }
345
625
  function nameFromFile(filename) {
346
- return basename(filename).replace(/\.(json|jsonl)$/, "").replace(/-state$/, "");
626
+ return basename2(filename).replace(/\.(json|jsonl)$/, "").replace(/-state$/, "");
347
627
  }
348
628
  function normalizeStatus(val) {
349
629
  if (typeof val !== "string") return "unknown";
@@ -521,7 +801,7 @@ function processJsonFile(file) {
521
801
  }
522
802
  function processJsonlFile(file) {
523
803
  try {
524
- const content = readFileSync(file.path, "utf8").trim();
804
+ const content = readFileSync2(file.path, "utf8").trim();
525
805
  if (!content) return [];
526
806
  const lines = content.split("\n");
527
807
  const lineCount = lines.length;
@@ -673,6 +953,9 @@ var prevFileCount = 0;
673
953
  var newExecCount = 0;
674
954
  var sessionStart = Date.now();
675
955
  var firstRender = true;
956
+ var cachedAuditConfig = null;
957
+ var cachedAuditResult = null;
958
+ var lastAuditTime = 0;
676
959
  var fileCache = /* @__PURE__ */ new Map();
677
960
  function getRecordsCached(f) {
678
961
  const cached = fileCache.get(f.path);
@@ -792,6 +1075,24 @@ function render(config) {
792
1075
  const level = Math.round(v / maxBucket * 8);
793
1076
  return (failBuckets[i] > 0 ? C.red : C.green) + sparkChars[level] + C.reset;
794
1077
  }).join("");
1078
+ let auditResult = null;
1079
+ if (now - lastAuditTime > 1e4) {
1080
+ if (!cachedAuditConfig) {
1081
+ cachedAuditConfig = discoverProcessConfig(config.dirs);
1082
+ }
1083
+ if (cachedAuditConfig) {
1084
+ try {
1085
+ auditResult = auditProcesses(cachedAuditConfig);
1086
+ cachedAuditResult = auditResult;
1087
+ lastAuditTime = now;
1088
+ } catch (err) {
1089
+ process.stderr.write(`[agentflow] process audit error: ${err instanceof Error ? err.message : err}
1090
+ `);
1091
+ }
1092
+ }
1093
+ } else {
1094
+ auditResult = cachedAuditResult;
1095
+ }
795
1096
  const distributedTraces = [];
796
1097
  if (allTraces.length > 1) {
797
1098
  const traceGroups = groupByTraceId(allTraces);
@@ -872,6 +1173,50 @@ function render(config) {
872
1173
  );
873
1174
  writeLine(L, "");
874
1175
  writeLine(L, ` ${C.bold}Activity (1h)${C.reset} ${spark} ${C.dim}\u2190 now${C.reset}`);
1176
+ if (auditResult) {
1177
+ const ar = auditResult;
1178
+ const healthy = ar.problems.length === 0;
1179
+ const healthIcon = healthy ? `${C.green}\u25CF${C.reset}` : `${C.red}\u25CF${C.reset}`;
1180
+ const healthLabel = healthy ? `${C.green}healthy${C.reset}` : `${C.red}${ar.problems.length} issue(s)${C.reset}`;
1181
+ const workerParts = [];
1182
+ if (ar.workers) {
1183
+ for (const w of ar.workers.workers) {
1184
+ const wIcon = w.declaredStatus === "running" && w.alive ? `${C.green}\u25CF${C.reset}` : w.stale ? `${C.red}\u25CF${C.reset}` : `${C.dim}\u25CB${C.reset}`;
1185
+ workerParts.push(`${wIcon} ${w.name}`);
1186
+ }
1187
+ }
1188
+ let sysdLabel = "";
1189
+ if (ar.systemd) {
1190
+ const si = ar.systemd.activeState === "active" ? `${C.green}\u25CF${C.reset}` : ar.systemd.crashLooping ? `${C.yellow}\u25CF${C.reset}` : ar.systemd.failed ? `${C.red}\u25CF${C.reset}` : `${C.dim}\u25CB${C.reset}`;
1191
+ sysdLabel = ` ${C.bold}Systemd${C.reset} ${si} ${ar.systemd.activeState}`;
1192
+ if (ar.systemd.restarts > 0) sysdLabel += ` ${C.dim}(${ar.systemd.restarts} restarts)${C.reset}`;
1193
+ }
1194
+ let pidLabel = "";
1195
+ if (ar.pidFile?.pid) {
1196
+ const pi = ar.pidFile.alive && ar.pidFile.matchesProcess ? `${C.green}\u25CF${C.reset}` : `${C.red}\u25CF${C.reset}`;
1197
+ pidLabel = ` ${C.bold}PID${C.reset} ${pi} ${ar.pidFile.pid}`;
1198
+ }
1199
+ writeLine(L, "");
1200
+ writeLine(L, ` ${C.bold}${C.under}Process Health${C.reset}`);
1201
+ writeLine(L, ` ${healthIcon} ${healthLabel}${pidLabel}${sysdLabel} ${C.bold}Procs${C.reset} ${C.dim}${ar.osProcesses.length}${C.reset} ${ar.orphans.length > 0 ? `${C.red}Orphans ${ar.orphans.length}${C.reset}` : `${C.dim}Orphans 0${C.reset}`}`);
1202
+ if (workerParts.length > 0) {
1203
+ writeLine(L, ` ${C.dim}Workers${C.reset} ${workerParts.join(" ")}`);
1204
+ }
1205
+ if (!healthy) {
1206
+ for (const p of ar.problems.slice(0, 3)) {
1207
+ writeLine(L, ` ${C.red}\u2022${C.reset} ${C.dim}${p}${C.reset}`);
1208
+ }
1209
+ }
1210
+ if (ar.orphans.length > 0) {
1211
+ for (const o of ar.orphans.slice(0, 5)) {
1212
+ const cmd = (o.cmdline || o.command).substring(0, detailWidth);
1213
+ writeLine(L, ` ${C.red}?${C.reset} ${C.dim}pid=${o.pid} cpu=${o.cpu} mem=${o.mem} up=${o.elapsed}${C.reset} ${C.dim}${cmd}${C.reset}`);
1214
+ }
1215
+ if (ar.orphans.length > 5) {
1216
+ writeLine(L, ` ${C.dim}... +${ar.orphans.length - 5} more orphans${C.reset}`);
1217
+ }
1218
+ }
1219
+ }
875
1220
  writeLine(L, "");
876
1221
  writeLine(
877
1222
  L,
@@ -990,13 +1335,13 @@ function getDistDepth(dt, spanId, visited) {
990
1335
  }
991
1336
  function startLive(argv) {
992
1337
  const config = parseArgs(argv);
993
- const valid = config.dirs.filter((d) => existsSync(d));
1338
+ const valid = config.dirs.filter((d) => existsSync2(d));
994
1339
  if (valid.length === 0) {
995
1340
  console.error(`No valid directories found: ${config.dirs.join(", ")}`);
996
1341
  console.error("Specify directories containing JSON/JSONL files: agentflow live <dir> [dir...]");
997
1342
  process.exit(1);
998
1343
  }
999
- const invalid = config.dirs.filter((d) => !existsSync(d));
1344
+ const invalid = config.dirs.filter((d) => !existsSync2(d));
1000
1345
  if (invalid.length > 0) {
1001
1346
  console.warn(`Skipping non-existent: ${invalid.join(", ")}`);
1002
1347
  }
@@ -1019,262 +1364,6 @@ function startLive(argv) {
1019
1364
  });
1020
1365
  }
1021
1366
 
1022
- // src/process-audit.ts
1023
- import { execSync } from "child_process";
1024
- import { existsSync as existsSync2, readdirSync as readdirSync2, readFileSync as readFileSync2, statSync as statSync2 } from "fs";
1025
- import { basename as basename2, join as join2 } from "path";
1026
- function isPidAlive(pid) {
1027
- try {
1028
- process.kill(pid, 0);
1029
- return true;
1030
- } catch {
1031
- return false;
1032
- }
1033
- }
1034
- function pidMatchesName(pid, name) {
1035
- try {
1036
- const cmdline = readFileSync2(`/proc/${pid}/cmdline`, "utf8");
1037
- return cmdline.includes(name);
1038
- } catch {
1039
- return false;
1040
- }
1041
- }
1042
- function readPidFile(path) {
1043
- try {
1044
- const pid = parseInt(readFileSync2(path, "utf8").trim(), 10);
1045
- return isNaN(pid) ? null : pid;
1046
- } catch {
1047
- return null;
1048
- }
1049
- }
1050
- function auditPidFile(config) {
1051
- if (!config.pidFile) return null;
1052
- const pid = readPidFile(config.pidFile);
1053
- if (pid === null) {
1054
- return {
1055
- path: config.pidFile,
1056
- pid: null,
1057
- alive: false,
1058
- matchesProcess: false,
1059
- stale: !existsSync2(config.pidFile),
1060
- reason: existsSync2(config.pidFile) ? "PID file exists but content is invalid" : "No PID file found"
1061
- };
1062
- }
1063
- const alive = isPidAlive(pid);
1064
- const matchesProcess = alive ? pidMatchesName(pid, config.processName) : false;
1065
- const stale = !alive || alive && !matchesProcess;
1066
- let reason;
1067
- if (alive && matchesProcess) {
1068
- reason = `PID ${pid} alive and matches ${config.processName}`;
1069
- } else if (alive && !matchesProcess) {
1070
- reason = `PID ${pid} alive but is NOT ${config.processName} (PID reused by another process)`;
1071
- } else {
1072
- reason = `PID ${pid} no longer exists`;
1073
- }
1074
- return { path: config.pidFile, pid, alive, matchesProcess, stale, reason };
1075
- }
1076
- function auditSystemd(config) {
1077
- if (config.systemdUnit === null || config.systemdUnit === void 0) return null;
1078
- const unit = config.systemdUnit;
1079
- try {
1080
- const raw = execSync(
1081
- `systemctl --user show ${unit} --property=ActiveState,SubState,MainPID,NRestarts,Result --no-pager 2>/dev/null`,
1082
- { encoding: "utf8", timeout: 5e3 }
1083
- );
1084
- const props = {};
1085
- for (const line of raw.trim().split("\n")) {
1086
- const [k, ...v] = line.split("=");
1087
- if (k) props[k.trim()] = v.join("=").trim();
1088
- }
1089
- const activeState = props["ActiveState"] ?? "unknown";
1090
- const subState = props["SubState"] ?? "unknown";
1091
- const mainPid = parseInt(props["MainPID"] ?? "0", 10);
1092
- const restarts = parseInt(props["NRestarts"] ?? "0", 10);
1093
- const result = props["Result"] ?? "unknown";
1094
- return {
1095
- unit,
1096
- activeState,
1097
- subState,
1098
- mainPid,
1099
- restarts,
1100
- result,
1101
- crashLooping: activeState === "activating" && subState === "auto-restart",
1102
- failed: activeState === "failed"
1103
- };
1104
- } catch {
1105
- return null;
1106
- }
1107
- }
1108
- function auditWorkers(config) {
1109
- if (!config.workersFile || !existsSync2(config.workersFile)) return null;
1110
- try {
1111
- const data = JSON.parse(readFileSync2(config.workersFile, "utf8"));
1112
- const orchPid = data.pid ?? null;
1113
- const orchAlive = orchPid ? isPidAlive(orchPid) : false;
1114
- const workers = [];
1115
- for (const [name, info] of Object.entries(data.tools ?? {})) {
1116
- const w = info;
1117
- const wPid = w.pid ?? null;
1118
- const wAlive = wPid ? isPidAlive(wPid) : false;
1119
- workers.push({
1120
- name,
1121
- pid: wPid,
1122
- declaredStatus: w.status ?? "unknown",
1123
- alive: wAlive,
1124
- stale: w.status === "running" && !wAlive
1125
- });
1126
- }
1127
- return {
1128
- orchestratorPid: orchPid,
1129
- orchestratorAlive: orchAlive,
1130
- startedAt: data.started_at ?? "",
1131
- workers
1132
- };
1133
- } catch {
1134
- return null;
1135
- }
1136
- }
1137
- function getOsProcesses(processName) {
1138
- try {
1139
- const raw = execSync(`ps aux`, { encoding: "utf8", timeout: 5e3 });
1140
- return raw.split("\n").filter((line) => line.includes(processName) && !line.includes("process-audit") && !line.includes("grep")).map((line) => {
1141
- const parts = line.trim().split(/\s+/);
1142
- return {
1143
- pid: parseInt(parts[1] ?? "0", 10),
1144
- cpu: parts[2] ?? "0",
1145
- mem: parts[3] ?? "0",
1146
- command: parts.slice(10).join(" ")
1147
- };
1148
- }).filter((p) => !isNaN(p.pid) && p.pid > 0);
1149
- } catch {
1150
- return [];
1151
- }
1152
- }
1153
- function discoverProcessConfig(dirs) {
1154
- let pidFile;
1155
- let workersFile;
1156
- let processName = "";
1157
- for (const dir of dirs) {
1158
- if (!existsSync2(dir)) continue;
1159
- let entries;
1160
- try {
1161
- entries = readdirSync2(dir);
1162
- } catch {
1163
- continue;
1164
- }
1165
- for (const f of entries) {
1166
- const fp = join2(dir, f);
1167
- try {
1168
- if (!statSync2(fp).isFile()) continue;
1169
- } catch {
1170
- continue;
1171
- }
1172
- if (f.endsWith(".pid") && !pidFile) {
1173
- pidFile = fp;
1174
- if (!processName) {
1175
- processName = basename2(f, ".pid");
1176
- }
1177
- }
1178
- if ((f === "workers.json" || f.endsWith("-workers.json")) && !workersFile) {
1179
- workersFile = fp;
1180
- if (!processName && f !== "workers.json") {
1181
- processName = basename2(f, "-workers.json");
1182
- }
1183
- }
1184
- }
1185
- }
1186
- if (!processName && !pidFile && !workersFile) return null;
1187
- if (!processName) processName = "agent";
1188
- return { processName, pidFile, workersFile };
1189
- }
1190
- function auditProcesses(config) {
1191
- const pidFile = auditPidFile(config);
1192
- const systemd = auditSystemd(config);
1193
- const workers = auditWorkers(config);
1194
- const osProcesses = getOsProcesses(config.processName);
1195
- const knownPids = /* @__PURE__ */ new Set();
1196
- if (pidFile?.pid && !pidFile.stale) knownPids.add(pidFile.pid);
1197
- if (workers) {
1198
- if (workers.orchestratorPid) knownPids.add(workers.orchestratorPid);
1199
- for (const w of workers.workers) {
1200
- if (w.pid) knownPids.add(w.pid);
1201
- }
1202
- }
1203
- if (systemd?.mainPid) knownPids.add(systemd.mainPid);
1204
- const orphans = osProcesses.filter((p) => !knownPids.has(p.pid));
1205
- const problems = [];
1206
- if (pidFile?.stale) problems.push(`Stale PID file: ${pidFile.reason}`);
1207
- if (systemd?.crashLooping) problems.push("Systemd unit is crash-looping (auto-restart)");
1208
- if (systemd?.failed) problems.push("Systemd unit has failed");
1209
- if (systemd && systemd.restarts > 10) problems.push(`High systemd restart count: ${systemd.restarts}`);
1210
- if (pidFile?.pid && systemd?.mainPid && pidFile.pid !== systemd.mainPid) {
1211
- problems.push(`PID mismatch: file says ${pidFile.pid}, systemd says ${systemd.mainPid}`);
1212
- }
1213
- if (workers) {
1214
- for (const w of workers.workers) {
1215
- if (w.stale) problems.push(`Worker "${w.name}" (pid ${w.pid}) declares running but is dead`);
1216
- }
1217
- }
1218
- if (orphans.length > 0) problems.push(`${orphans.length} orphan process(es) not tracked by PID file or workers registry`);
1219
- return { pidFile, systemd, workers, osProcesses, orphans, problems };
1220
- }
1221
- function formatAuditReport(result) {
1222
- const lines = [];
1223
- lines.push("");
1224
- lines.push("\u2554\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2557");
1225
- lines.push("\u2551 \u{1F50D} P R O C E S S A U D I T \u2551");
1226
- lines.push("\u255A\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u255D");
1227
- if (result.pidFile) {
1228
- const pf = result.pidFile;
1229
- const icon = pf.pid && pf.alive && pf.matchesProcess ? "\u2705" : pf.stale ? "\u26A0\uFE0F " : "\u2139\uFE0F ";
1230
- lines.push(`
1231
- PID File: ${pf.path}`);
1232
- lines.push(` ${icon} ${pf.reason}`);
1233
- }
1234
- if (result.systemd) {
1235
- const sd = result.systemd;
1236
- const icon = sd.activeState === "active" ? "\u{1F7E2}" : sd.crashLooping ? "\u{1F7E1}" : sd.failed ? "\u{1F534}" : "\u26AA";
1237
- lines.push(`
1238
- Systemd: ${sd.unit}`);
1239
- lines.push(` ${icon} State: ${sd.activeState} (${sd.subState}) Result: ${sd.result}`);
1240
- lines.push(` Main PID: ${sd.mainPid || "none"} Restarts: ${sd.restarts}`);
1241
- }
1242
- if (result.workers) {
1243
- const w = result.workers;
1244
- lines.push(`
1245
- Workers (orchestrator pid ${w.orchestratorPid ?? "unknown"} ${w.orchestratorAlive ? "\u2705" : "\u274C"})`);
1246
- for (const worker of w.workers) {
1247
- const icon = worker.declaredStatus === "running" && worker.alive ? "\u{1F7E2}" : worker.stale ? "\u{1F534} STALE" : "\u26AA";
1248
- lines.push(` ${icon} ${worker.name.padEnd(14)} pid=${String(worker.pid ?? "-").padEnd(8)} status=${worker.declaredStatus}`);
1249
- }
1250
- }
1251
- if (result.osProcesses.length > 0) {
1252
- lines.push(`
1253
- OS Processes (${result.osProcesses.length} total)`);
1254
- for (const p of result.osProcesses) {
1255
- lines.push(` PID ${String(p.pid).padEnd(8)} CPU=${p.cpu.padEnd(6)} MEM=${p.mem.padEnd(6)} ${p.command.substring(0, 55)}`);
1256
- }
1257
- }
1258
- if (result.orphans.length > 0) {
1259
- lines.push(`
1260
- \u26A0\uFE0F ${result.orphans.length} ORPHAN PROCESS(ES):`);
1261
- for (const p of result.orphans) {
1262
- lines.push(` PID ${p.pid} \u2014 not tracked by PID file or workers registry`);
1263
- }
1264
- }
1265
- lines.push("");
1266
- if (result.problems.length === 0) {
1267
- lines.push(" \u2705 All checks passed \u2014 no process issues detected.");
1268
- } else {
1269
- lines.push(` \u26A0\uFE0F ${result.problems.length} issue(s):`);
1270
- for (const p of result.problems) {
1271
- lines.push(` \u2022 ${p}`);
1272
- }
1273
- }
1274
- lines.push("");
1275
- return lines.join("\n");
1276
- }
1277
-
1278
1367
  // src/graph-builder.ts
1279
1368
  import { randomUUID } from "crypto";
1280
1369
  function deepFreeze(obj) {
@@ -2459,10 +2548,10 @@ export {
2459
2548
  groupByTraceId,
2460
2549
  stitchTrace,
2461
2550
  getTraceTree,
2462
- startLive,
2463
2551
  discoverProcessConfig,
2464
2552
  auditProcesses,
2465
2553
  formatAuditReport,
2554
+ startLive,
2466
2555
  createGraphBuilder,
2467
2556
  runTraced,
2468
2557
  createTraceStore,