agentflow-core 0.6.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -206,9 +206,265 @@ function getTraceTree(trace) {
206
206
  return result;
207
207
  }
208
208
 
209
+ // src/process-audit.ts
210
+ import { execSync } from "child_process";
211
+ import { existsSync, readdirSync, readFileSync, statSync } from "fs";
212
+ import { basename, join } from "path";
213
/**
 * Reports whether a process with the given PID currently exists.
 *
 * Probes with signal 0 through `process.kill`, which performs the
 * existence/permission check without delivering a signal.
 *
 * @param {number} pid - Process ID to probe.
 * @returns {boolean} `true` when the process exists (including when it exists
 *   but this process is not allowed to signal it), `false` when it does not
 *   exist or `pid` is not a positive integer.
 */
function isPidAlive(pid) {
  // PID 0 and negative PIDs address process groups rather than one process,
  // so probing them would report "alive" for something that is not a PID.
  if (!Number.isInteger(pid) || pid <= 0) return false;
  try {
    process.kill(pid, 0);
    return true;
  } catch (err) {
    // EPERM: the process exists but belongs to someone we may not signal.
    return err?.code === "EPERM";
  }
}
221
/**
 * Checks whether the command line of a running process contains `name`.
 *
 * Reads `/proc/<pid>/cmdline`; when that file cannot be read (process gone,
 * no procfs on this platform, permission denied) the answer is `false`.
 *
 * @param {number} pid - Process ID whose command line is inspected.
 * @param {string} name - Substring to look for in the command line.
 * @returns {boolean} `true` if the command line contains `name`.
 */
function pidMatchesName(pid, name) {
  try {
    const raw = readFileSync(`/proc/${pid}/cmdline`, "utf8");
    if (raw.includes(name)) return true;
    // procfs separates arguments with NUL bytes; compare against the
    // space-joined form as well so a name such as "node server.js" that
    // spans two arguments can still match.
    const spaced = raw.split("\0").join(" ");
    return spaced.includes(name);
  } catch {
    return false;
  }
}
229
/**
 * Reads a PID from a PID file.
 *
 * Only the first whitespace-delimited token is considered, and it must
 * consist solely of decimal digits and denote a positive integer. This
 * rejects content such as "12abc", "-5" or "0", which a bare `parseInt`
 * would otherwise turn into a PID.
 *
 * @param {string} path - Path of the PID file.
 * @returns {number|null} The PID, or `null` when the file cannot be read or
 *   does not start with a valid PID.
 */
function readPidFile(path) {
  let content;
  try {
    content = readFileSync(path, "utf8");
  } catch {
    return null;
  }
  const [token = ""] = content.trim().split(/\s+/);
  if (!/^\d+$/.test(token)) return null;
  const pid = Number.parseInt(token, 10);
  return Number.isSafeInteger(pid) && pid > 0 ? pid : null;
}
237
/**
 * Audits the PID file named by `config.pidFile`.
 *
 * @param {{ pidFile?: string, processName: string }} config - Audit
 *   configuration; `processName` is the name the live process is expected to
 *   carry in its command line.
 * @returns {null | { path: string, pid: number|null, alive: boolean,
 *   matchesProcess: boolean, stale: boolean, reason: string }} `null` when no
 *   PID file is configured, otherwise the audit record.
 */
function auditPidFile(config) {
  const { pidFile: path, processName } = config;
  if (!path) return null;

  const pid = readPidFile(path);
  if (pid === null) {
    // Probe the file once so `stale` and `reason` always describe the same
    // observation, even if the file appears or vanishes concurrently.
    const present = existsSync(path);
    return {
      path,
      pid: null,
      alive: false,
      matchesProcess: false,
      stale: !present,
      reason: present ? "PID file exists but content is invalid" : "No PID file found"
    };
  }

  const alive = isPidAlive(pid);
  // The name check is only attempted for a live PID.
  const matchesProcess = alive ? pidMatchesName(pid, processName) : false;
  const stale = !(alive && matchesProcess);

  let reason;
  if (!alive) {
    reason = `PID ${pid} no longer exists`;
  } else if (matchesProcess) {
    reason = `PID ${pid} alive and matches ${processName}`;
  } else {
    reason = `PID ${pid} alive but is NOT ${processName} (PID reused by another process)`;
  }
  return { path, pid, alive, matchesProcess, stale, reason };
}
263
/**
 * Queries `systemctl --user show` for the state of `config.systemdUnit`.
 *
 * The unit name ends up on a shell command line, so it is validated against
 * the characters systemd unit names may contain, single-quoted, and placed
 * after `--`. A name that fails validation is treated like an unavailable
 * unit and yields `null`; nothing is executed for it.
 *
 * @param {{ systemdUnit?: string|null }} config - Audit configuration.
 * @returns {null | { unit: string, activeState: string, subState: string,
 *   mainPid: number, restarts: number, result: string, crashLooping: boolean,
 *   failed: boolean }} `null` when no unit is configured, the name is not a
 *   valid unit name, or the `systemctl` call fails.
 */
function auditSystemd(config) {
  const unit = config.systemdUnit;
  if (unit === null || unit === undefined) return null;
  // Letters, digits, ":", "-", "_", ".", "\" and "@" only: no quote, space or
  // shell metacharacter can reach the command line below.
  if (typeof unit !== "string" || !/^[A-Za-z0-9:_.@\\-]+$/.test(unit)) return null;

  // Missing or non-numeric properties count as 0 instead of leaking NaN.
  const toInt = (value) => {
    const parsed = Number.parseInt(value ?? "0", 10);
    return Number.isNaN(parsed) ? 0 : parsed;
  };

  try {
    const raw = execSync(
      // `--` stops option parsing so names such as "-.slice" are not read as flags.
      `systemctl --user show --property=ActiveState,SubState,MainPID,NRestarts,Result --no-pager -- '${unit}' 2>/dev/null`,
      { encoding: "utf8", timeout: 5e3 }
    );

    const props = {};
    for (const line of raw.trim().split("\n")) {
      // Split on the first "=" only; values may themselves contain "=".
      const eq = line.indexOf("=");
      if (eq <= 0) continue;
      props[line.slice(0, eq).trim()] = line.slice(eq + 1).trim();
    }

    const activeState = props["ActiveState"] ?? "unknown";
    const subState = props["SubState"] ?? "unknown";
    return {
      unit,
      activeState,
      subState,
      mainPid: toInt(props["MainPID"]),
      restarts: toInt(props["NRestarts"]),
      result: props["Result"] ?? "unknown",
      crashLooping: activeState === "activating" && subState === "auto-restart",
      failed: activeState === "failed"
    };
  } catch {
    // systemctl missing, timed out, or exited non-zero: no systemd data.
    return null;
  }
}
295
/**
 * Audits the workers registry JSON file named by `config.workersFile`.
 *
 * The file is read as an object with an optional orchestrator `pid`, an
 * optional `started_at`, and an optional `tools` map of worker name to
 * `{ pid, status }`.
 *
 * @param {{ workersFile?: string }} config - Audit configuration.
 * @returns {null | { orchestratorPid: number|null, orchestratorAlive: boolean,
 *   startedAt: string, workers: Array<{ name: string, pid: number|null,
 *   declaredStatus: string, alive: boolean, stale: boolean }> }} `null` when
 *   no file is configured, it does not exist, or it cannot be read/parsed.
 */
function auditWorkers(config) {
  const registryPath = config.workersFile;
  if (!registryPath || !existsSync(registryPath)) return null;

  try {
    const registry = JSON.parse(readFileSync(registryPath, "utf8"));

    const orchestratorPid = registry.pid ?? null;
    const orchestratorAlive = orchestratorPid ? isPidAlive(orchestratorPid) : false;

    const workers = Object.entries(registry.tools ?? {}).map(([name, entry]) => {
      const pid = entry.pid ?? null;
      // A worker without a recorded PID is never probed and counts as not alive.
      const alive = pid ? isPidAlive(pid) : false;
      return {
        name,
        pid,
        declaredStatus: entry.status ?? "unknown",
        alive,
        // Stale: the registry claims "running" but no live process backs it.
        stale: entry.status === "running" && !alive
      };
    });

    return {
      orchestratorPid,
      orchestratorAlive,
      startedAt: registry.started_at ?? "",
      workers
    };
  } catch {
    return null;
  }
}
324
/**
 * Lists OS processes whose `ps aux` line mentions `processName`.
 *
 * Lines mentioning "process-audit" or "grep" are ignored. Any failure (for
 * example `ps` being unavailable or timing out) yields an empty list.
 *
 * @param {string} processName - Substring to look for in each `ps aux` line.
 * @returns {Array<{ pid: number, cpu: string, mem: string, command: string }>}
 *   One entry per matching line that carries a positive numeric PID.
 */
function getOsProcesses(processName) {
  try {
    const psOutput = execSync(`ps aux`, { encoding: "utf8", timeout: 5e3 });
    const matches = [];
    for (const row of psOutput.split("\n")) {
      const wanted = row.includes(processName) && !row.includes("process-audit") && !row.includes("grep");
      if (!wanted) continue;

      // `ps aux` columns: USER PID %CPU %MEM ... ; the command starts at column 10.
      const cols = row.trim().split(/\s+/);
      const pid = parseInt(cols[1] ?? "0", 10);
      // Drops the header row and anything else without a usable PID.
      if (isNaN(pid) || pid <= 0) continue;

      matches.push({
        pid,
        cpu: cols[2] ?? "0",
        mem: cols[3] ?? "0",
        command: cols.slice(10).join(" ")
      });
    }
    return matches;
  } catch {
    return [];
  }
}
340
/**
 * Scans directories for a PID file (`*.pid`) and a workers registry
 * (`workers.json` or `*-workers.json`) and derives an audit configuration.
 *
 * The first PID file and the first workers registry encountered win. The
 * process name comes from the first of those files that can supply one (the
 * PID file's base name, or the prefix of `<name>-workers.json`), and falls
 * back to "agent".
 *
 * @param {string[]} dirs - Directories to scan (non-recursive).
 * @returns {null | { processName: string, pidFile: string|undefined,
 *   workersFile: string|undefined }} `null` when nothing was discovered.
 */
function discoverProcessConfig(dirs) {
  // Unreadable or missing directories contribute no entries.
  const listEntries = (dir) => {
    if (!existsSync(dir)) return [];
    try {
      return readdirSync(dir);
    } catch {
      return [];
    }
  };
  // Entries that cannot be stat'ed are treated as "not a file".
  const isRegularFile = (candidate) => {
    try {
      return statSync(candidate).isFile();
    } catch {
      return false;
    }
  };
  const isWorkersRegistry = (entry) => entry === "workers.json" || entry.endsWith("-workers.json");

  let pidFile;
  let workersFile;
  let processName = "";

  for (const dir of dirs) {
    for (const entry of listEntries(dir)) {
      const fullPath = join(dir, entry);
      if (!isRegularFile(fullPath)) continue;

      if (!pidFile && entry.endsWith(".pid")) {
        pidFile = fullPath;
        processName = processName || basename(entry, ".pid");
      }
      if (!workersFile && isWorkersRegistry(entry)) {
        workersFile = fullPath;
        // A plain "workers.json" carries no process name in its file name.
        if (!processName && entry !== "workers.json") {
          processName = basename(entry, "-workers.json");
        }
      }
    }
  }

  const foundNothing = !processName && !pidFile && !workersFile;
  if (foundNothing) return null;
  return { processName: processName || "agent", pidFile, workersFile };
}
377
/**
 * Runs every process check for `config` and collects the findings.
 *
 * Combines the PID-file, systemd and workers-registry audits with the list of
 * matching OS processes, flags OS processes whose PID none of those sources
 * account for as orphans, and builds a list of human-readable problems.
 *
 * @param {{ processName: string, pidFile?: string, workersFile?: string,
 *   systemdUnit?: string|null }} config - Audit configuration.
 * @returns {{ pidFile: object|null, systemd: object|null, workers: object|null,
 *   osProcesses: object[], orphans: object[], problems: string[] }}
 */
function auditProcesses(config) {
  const pidFile = auditPidFile(config);
  const systemd = auditSystemd(config);
  const workers = auditWorkers(config);
  const osProcesses = getOsProcesses(config.processName);

  // PIDs accounted for by a non-stale PID file, the workers registry or
  // systemd. Falsy entries (null, undefined, 0) are dropped.
  const workerEntries = workers ? workers.workers : [];
  const accountedFor = [
    pidFile?.pid && !pidFile.stale ? pidFile.pid : null,
    workers ? workers.orchestratorPid : null,
    ...workerEntries.map((worker) => worker.pid),
    systemd?.mainPid
  ];
  const knownPids = new Set(accountedFor.filter(Boolean));
  const orphans = osProcesses.filter(({ pid }) => !knownPids.has(pid));

  const problems = [];
  if (pidFile?.stale) {
    problems.push(`Stale PID file: ${pidFile.reason}`);
  }
  if (systemd) {
    if (systemd.crashLooping) problems.push("Systemd unit is crash-looping (auto-restart)");
    if (systemd.failed) problems.push("Systemd unit has failed");
    if (systemd.restarts > 10) problems.push(`High systemd restart count: ${systemd.restarts}`);
  }
  const pidsDisagree = Boolean(pidFile?.pid && systemd?.mainPid) && pidFile.pid !== systemd.mainPid;
  if (pidsDisagree) {
    problems.push(`PID mismatch: file says ${pidFile.pid}, systemd says ${systemd.mainPid}`);
  }
  for (const worker of workerEntries) {
    if (worker.stale) {
      problems.push(`Worker "${worker.name}" (pid ${worker.pid}) declares running but is dead`);
    }
  }
  if (orphans.length > 0) {
    problems.push(`${orphans.length} orphan process(es) not tracked by PID file or workers registry`);
  }

  return { pidFile, systemd, workers, osProcesses, orphans, problems };
}
408
/**
 * Renders an audit result (as produced by `auditProcesses`) as a multi-line,
 * human-readable report.
 *
 * Sections for the PID file, systemd unit, workers, OS processes and orphans
 * are emitted only when the corresponding data is present; the report always
 * ends with either an "all checks passed" line or the list of problems.
 *
 * @param {{ pidFile: object|null, systemd: object|null, workers: object|null,
 *   osProcesses: object[], orphans: object[], problems: string[] }} result
 * @returns {string} The report, lines joined with "\n".
 */
function formatAuditReport(result) {
  const { pidFile, systemd, workers, osProcesses, orphans, problems } = result;
  const rule = "\u2550".repeat(66);
  const out = [
    "",
    `\u2554${rule}\u2557`,
    "\u2551 \u{1F50D} P R O C E S S A U D I T \u2551",
    `\u255A${rule}\u255D`
  ];

  if (pidFile) {
    let icon = "\u2139\uFE0F ";
    if (pidFile.pid && pidFile.alive && pidFile.matchesProcess) {
      icon = "\u2705";
    } else if (pidFile.stale) {
      icon = "\u26A0\uFE0F ";
    }
    out.push(`\nPID File: ${pidFile.path}`, ` ${icon} ${pidFile.reason}`);
  }

  if (systemd) {
    let icon = "\u26AA";
    if (systemd.activeState === "active") {
      icon = "\u{1F7E2}";
    } else if (systemd.crashLooping) {
      icon = "\u{1F7E1}";
    } else if (systemd.failed) {
      icon = "\u{1F534}";
    }
    out.push(
      `\nSystemd: ${systemd.unit}`,
      ` ${icon} State: ${systemd.activeState} (${systemd.subState}) Result: ${systemd.result}`,
      ` Main PID: ${systemd.mainPid || "none"} Restarts: ${systemd.restarts}`
    );
  }

  if (workers) {
    const orchestratorMark = workers.orchestratorAlive ? "\u2705" : "\u274C";
    out.push(`\nWorkers (orchestrator pid ${workers.orchestratorPid ?? "unknown"} ${orchestratorMark})`);
    for (const worker of workers.workers) {
      let icon = "\u26AA";
      if (worker.declaredStatus === "running" && worker.alive) {
        icon = "\u{1F7E2}";
      } else if (worker.stale) {
        icon = "\u{1F534} STALE";
      }
      const pidText = String(worker.pid ?? "-").padEnd(8);
      out.push(` ${icon} ${worker.name.padEnd(14)} pid=${pidText} status=${worker.declaredStatus}`);
    }
  }

  if (osProcesses.length > 0) {
    out.push(`\nOS Processes (${osProcesses.length} total)`);
    for (const proc of osProcesses) {
      // Commands are cut to 55 characters to keep rows short.
      const command = proc.command.substring(0, 55);
      out.push(` PID ${String(proc.pid).padEnd(8)} CPU=${proc.cpu.padEnd(6)} MEM=${proc.mem.padEnd(6)} ${command}`);
    }
  }

  if (orphans.length > 0) {
    out.push(`\n\u26A0\uFE0F ${orphans.length} ORPHAN PROCESS(ES):`);
    for (const proc of orphans) {
      out.push(` PID ${proc.pid} \u2014 not tracked by PID file or workers registry`);
    }
  }

  out.push("");
  if (problems.length === 0) {
    out.push(" \u2705 All checks passed \u2014 no process issues detected.");
  } else {
    out.push(` \u26A0\uFE0F ${problems.length} issue(s):`);
    for (const problem of problems) {
      out.push(` \u2022 ${problem}`);
    }
  }
  out.push("");
  return out.join("\n");
}
464
+
209
465
  // src/live.ts
210
- import { existsSync, readdirSync, readFileSync, statSync, watch } from "fs";
211
- import { basename, join, resolve } from "path";
466
+ import { existsSync as existsSync2, readdirSync as readdirSync2, readFileSync as readFileSync2, statSync as statSync2, watch } from "fs";
467
+ import { basename as basename2, join as join2, resolve } from "path";
212
468
  var C = {
213
469
  reset: "\x1B[0m",
214
470
  bold: "\x1B[1m",
@@ -283,7 +539,7 @@ function scanFiles(dirs, recursive) {
283
539
  const seen = /* @__PURE__ */ new Set();
284
540
  function scanDir(d, topLevel) {
285
541
  try {
286
- const dirStat = statSync(d);
542
+ const dirStat = statSync2(d);
287
543
  const dirMtime = dirStat.mtime.getTime();
288
544
  const cachedMtime = dirMtimeCache.get(d);
289
545
  if (cachedMtime === dirMtime) {
@@ -299,13 +555,13 @@ function scanFiles(dirs, recursive) {
299
555
  }
300
556
  }
301
557
  const dirResults = [];
302
- for (const f of readdirSync(d)) {
558
+ for (const f of readdirSync2(d)) {
303
559
  if (f.startsWith(".")) continue;
304
- const fp = join(d, f);
560
+ const fp = join2(d, f);
305
561
  if (seen.has(fp)) continue;
306
562
  let stat;
307
563
  try {
308
- stat = statSync(fp);
564
+ stat = statSync2(fp);
309
565
  } catch {
310
566
  continue;
311
567
  }
@@ -337,13 +593,13 @@ function scanFiles(dirs, recursive) {
337
593
  }
338
594
  function safeReadJson(fp) {
339
595
  try {
340
- return JSON.parse(readFileSync(fp, "utf8"));
596
+ return JSON.parse(readFileSync2(fp, "utf8"));
341
597
  } catch {
342
598
  return null;
343
599
  }
344
600
  }
345
601
  function nameFromFile(filename) {
346
- return basename(filename).replace(/\.(json|jsonl)$/, "").replace(/-state$/, "");
602
+ return basename2(filename).replace(/\.(json|jsonl)$/, "").replace(/-state$/, "");
347
603
  }
348
604
  function normalizeStatus(val) {
349
605
  if (typeof val !== "string") return "unknown";
@@ -521,7 +777,7 @@ function processJsonFile(file) {
521
777
  }
522
778
  function processJsonlFile(file) {
523
779
  try {
524
- const content = readFileSync(file.path, "utf8").trim();
780
+ const content = readFileSync2(file.path, "utf8").trim();
525
781
  if (!content) return [];
526
782
  const lines = content.split("\n");
527
783
  const lineCount = lines.length;
@@ -673,6 +929,9 @@ var prevFileCount = 0;
673
929
  var newExecCount = 0;
674
930
  var sessionStart = Date.now();
675
931
  var firstRender = true;
932
+ var cachedAuditConfig = null;
933
+ var cachedAuditResult = null;
934
+ var lastAuditTime = 0;
676
935
  var fileCache = /* @__PURE__ */ new Map();
677
936
  function getRecordsCached(f) {
678
937
  const cached = fileCache.get(f.path);
@@ -792,6 +1051,22 @@ function render(config) {
792
1051
  const level = Math.round(v / maxBucket * 8);
793
1052
  return (failBuckets[i] > 0 ? C.red : C.green) + sparkChars[level] + C.reset;
794
1053
  }).join("");
1054
+ let auditResult = null;
1055
+ if (now - lastAuditTime > 1e4) {
1056
+ if (!cachedAuditConfig) {
1057
+ cachedAuditConfig = discoverProcessConfig(config.dirs);
1058
+ }
1059
+ if (cachedAuditConfig) {
1060
+ try {
1061
+ auditResult = auditProcesses(cachedAuditConfig);
1062
+ cachedAuditResult = auditResult;
1063
+ lastAuditTime = now;
1064
+ } catch {
1065
+ }
1066
+ }
1067
+ } else {
1068
+ auditResult = cachedAuditResult;
1069
+ }
795
1070
  const distributedTraces = [];
796
1071
  if (allTraces.length > 1) {
797
1072
  const traceGroups = groupByTraceId(allTraces);
@@ -872,6 +1147,41 @@ function render(config) {
872
1147
  );
873
1148
  writeLine(L, "");
874
1149
  writeLine(L, ` ${C.bold}Activity (1h)${C.reset} ${spark} ${C.dim}\u2190 now${C.reset}`);
1150
+ if (auditResult) {
1151
+ const ar = auditResult;
1152
+ const healthy = ar.problems.length === 0;
1153
+ const healthIcon = healthy ? `${C.green}\u25CF${C.reset}` : `${C.red}\u25CF${C.reset}`;
1154
+ const healthLabel = healthy ? `${C.green}healthy${C.reset}` : `${C.red}${ar.problems.length} issue(s)${C.reset}`;
1155
+ const workerParts = [];
1156
+ if (ar.workers) {
1157
+ for (const w of ar.workers.workers) {
1158
+ const wIcon = w.declaredStatus === "running" && w.alive ? `${C.green}\u25CF${C.reset}` : w.stale ? `${C.red}\u25CF${C.reset}` : `${C.dim}\u25CB${C.reset}`;
1159
+ workerParts.push(`${wIcon} ${w.name}`);
1160
+ }
1161
+ }
1162
+ let sysdLabel = "";
1163
+ if (ar.systemd) {
1164
+ const si = ar.systemd.activeState === "active" ? `${C.green}\u25CF${C.reset}` : ar.systemd.crashLooping ? `${C.yellow}\u25CF${C.reset}` : ar.systemd.failed ? `${C.red}\u25CF${C.reset}` : `${C.dim}\u25CB${C.reset}`;
1165
+ sysdLabel = ` ${C.bold}Systemd${C.reset} ${si} ${ar.systemd.activeState}`;
1166
+ if (ar.systemd.restarts > 0) sysdLabel += ` ${C.dim}(${ar.systemd.restarts} restarts)${C.reset}`;
1167
+ }
1168
+ let pidLabel = "";
1169
+ if (ar.pidFile?.pid) {
1170
+ const pi = ar.pidFile.alive && ar.pidFile.matchesProcess ? `${C.green}\u25CF${C.reset}` : `${C.red}\u25CF${C.reset}`;
1171
+ pidLabel = ` ${C.bold}PID${C.reset} ${pi} ${ar.pidFile.pid}`;
1172
+ }
1173
+ writeLine(L, "");
1174
+ writeLine(L, ` ${C.bold}${C.under}Process Health${C.reset}`);
1175
+ writeLine(L, ` ${healthIcon} ${healthLabel}${pidLabel}${sysdLabel} ${C.bold}Procs${C.reset} ${C.dim}${ar.osProcesses.length}${C.reset} ${ar.orphans.length > 0 ? `${C.red}Orphans ${ar.orphans.length}${C.reset}` : `${C.dim}Orphans 0${C.reset}`}`);
1176
+ if (workerParts.length > 0) {
1177
+ writeLine(L, ` ${C.dim}Workers${C.reset} ${workerParts.join(" ")}`);
1178
+ }
1179
+ if (!healthy) {
1180
+ for (const p of ar.problems.slice(0, 3)) {
1181
+ writeLine(L, ` ${C.red}\u2022${C.reset} ${C.dim}${p}${C.reset}`);
1182
+ }
1183
+ }
1184
+ }
875
1185
  writeLine(L, "");
876
1186
  writeLine(
877
1187
  L,
@@ -990,13 +1300,13 @@ function getDistDepth(dt, spanId, visited) {
990
1300
  }
991
1301
  function startLive(argv) {
992
1302
  const config = parseArgs(argv);
993
- const valid = config.dirs.filter((d) => existsSync(d));
1303
+ const valid = config.dirs.filter((d) => existsSync2(d));
994
1304
  if (valid.length === 0) {
995
1305
  console.error(`No valid directories found: ${config.dirs.join(", ")}`);
996
1306
  console.error("Specify directories containing JSON/JSONL files: agentflow live <dir> [dir...]");
997
1307
  process.exit(1);
998
1308
  }
999
- const invalid = config.dirs.filter((d) => !existsSync(d));
1309
+ const invalid = config.dirs.filter((d) => !existsSync2(d));
1000
1310
  if (invalid.length > 0) {
1001
1311
  console.warn(`Skipping non-existent: ${invalid.join(", ")}`);
1002
1312
  }
@@ -1019,262 +1329,6 @@ function startLive(argv) {
1019
1329
  });
1020
1330
  }
1021
1331
 
1022
- // src/process-audit.ts
1023
- import { execSync } from "child_process";
1024
- import { existsSync as existsSync2, readdirSync as readdirSync2, readFileSync as readFileSync2, statSync as statSync2 } from "fs";
1025
- import { basename as basename2, join as join2 } from "path";
1026
- function isPidAlive(pid) {
1027
- try {
1028
- process.kill(pid, 0);
1029
- return true;
1030
- } catch {
1031
- return false;
1032
- }
1033
- }
1034
- function pidMatchesName(pid, name) {
1035
- try {
1036
- const cmdline = readFileSync2(`/proc/${pid}/cmdline`, "utf8");
1037
- return cmdline.includes(name);
1038
- } catch {
1039
- return false;
1040
- }
1041
- }
1042
- function readPidFile(path) {
1043
- try {
1044
- const pid = parseInt(readFileSync2(path, "utf8").trim(), 10);
1045
- return isNaN(pid) ? null : pid;
1046
- } catch {
1047
- return null;
1048
- }
1049
- }
1050
- function auditPidFile(config) {
1051
- if (!config.pidFile) return null;
1052
- const pid = readPidFile(config.pidFile);
1053
- if (pid === null) {
1054
- return {
1055
- path: config.pidFile,
1056
- pid: null,
1057
- alive: false,
1058
- matchesProcess: false,
1059
- stale: !existsSync2(config.pidFile),
1060
- reason: existsSync2(config.pidFile) ? "PID file exists but content is invalid" : "No PID file found"
1061
- };
1062
- }
1063
- const alive = isPidAlive(pid);
1064
- const matchesProcess = alive ? pidMatchesName(pid, config.processName) : false;
1065
- const stale = !alive || alive && !matchesProcess;
1066
- let reason;
1067
- if (alive && matchesProcess) {
1068
- reason = `PID ${pid} alive and matches ${config.processName}`;
1069
- } else if (alive && !matchesProcess) {
1070
- reason = `PID ${pid} alive but is NOT ${config.processName} (PID reused by another process)`;
1071
- } else {
1072
- reason = `PID ${pid} no longer exists`;
1073
- }
1074
- return { path: config.pidFile, pid, alive, matchesProcess, stale, reason };
1075
- }
1076
- function auditSystemd(config) {
1077
- if (config.systemdUnit === null || config.systemdUnit === void 0) return null;
1078
- const unit = config.systemdUnit;
1079
- try {
1080
- const raw = execSync(
1081
- `systemctl --user show ${unit} --property=ActiveState,SubState,MainPID,NRestarts,Result --no-pager 2>/dev/null`,
1082
- { encoding: "utf8", timeout: 5e3 }
1083
- );
1084
- const props = {};
1085
- for (const line of raw.trim().split("\n")) {
1086
- const [k, ...v] = line.split("=");
1087
- if (k) props[k.trim()] = v.join("=").trim();
1088
- }
1089
- const activeState = props["ActiveState"] ?? "unknown";
1090
- const subState = props["SubState"] ?? "unknown";
1091
- const mainPid = parseInt(props["MainPID"] ?? "0", 10);
1092
- const restarts = parseInt(props["NRestarts"] ?? "0", 10);
1093
- const result = props["Result"] ?? "unknown";
1094
- return {
1095
- unit,
1096
- activeState,
1097
- subState,
1098
- mainPid,
1099
- restarts,
1100
- result,
1101
- crashLooping: activeState === "activating" && subState === "auto-restart",
1102
- failed: activeState === "failed"
1103
- };
1104
- } catch {
1105
- return null;
1106
- }
1107
- }
1108
- function auditWorkers(config) {
1109
- if (!config.workersFile || !existsSync2(config.workersFile)) return null;
1110
- try {
1111
- const data = JSON.parse(readFileSync2(config.workersFile, "utf8"));
1112
- const orchPid = data.pid ?? null;
1113
- const orchAlive = orchPid ? isPidAlive(orchPid) : false;
1114
- const workers = [];
1115
- for (const [name, info] of Object.entries(data.tools ?? {})) {
1116
- const w = info;
1117
- const wPid = w.pid ?? null;
1118
- const wAlive = wPid ? isPidAlive(wPid) : false;
1119
- workers.push({
1120
- name,
1121
- pid: wPid,
1122
- declaredStatus: w.status ?? "unknown",
1123
- alive: wAlive,
1124
- stale: w.status === "running" && !wAlive
1125
- });
1126
- }
1127
- return {
1128
- orchestratorPid: orchPid,
1129
- orchestratorAlive: orchAlive,
1130
- startedAt: data.started_at ?? "",
1131
- workers
1132
- };
1133
- } catch {
1134
- return null;
1135
- }
1136
- }
1137
- function getOsProcesses(processName) {
1138
- try {
1139
- const raw = execSync(`ps aux`, { encoding: "utf8", timeout: 5e3 });
1140
- return raw.split("\n").filter((line) => line.includes(processName) && !line.includes("process-audit") && !line.includes("grep")).map((line) => {
1141
- const parts = line.trim().split(/\s+/);
1142
- return {
1143
- pid: parseInt(parts[1] ?? "0", 10),
1144
- cpu: parts[2] ?? "0",
1145
- mem: parts[3] ?? "0",
1146
- command: parts.slice(10).join(" ")
1147
- };
1148
- }).filter((p) => !isNaN(p.pid) && p.pid > 0);
1149
- } catch {
1150
- return [];
1151
- }
1152
- }
1153
- function discoverProcessConfig(dirs) {
1154
- let pidFile;
1155
- let workersFile;
1156
- let processName = "";
1157
- for (const dir of dirs) {
1158
- if (!existsSync2(dir)) continue;
1159
- let entries;
1160
- try {
1161
- entries = readdirSync2(dir);
1162
- } catch {
1163
- continue;
1164
- }
1165
- for (const f of entries) {
1166
- const fp = join2(dir, f);
1167
- try {
1168
- if (!statSync2(fp).isFile()) continue;
1169
- } catch {
1170
- continue;
1171
- }
1172
- if (f.endsWith(".pid") && !pidFile) {
1173
- pidFile = fp;
1174
- if (!processName) {
1175
- processName = basename2(f, ".pid");
1176
- }
1177
- }
1178
- if ((f === "workers.json" || f.endsWith("-workers.json")) && !workersFile) {
1179
- workersFile = fp;
1180
- if (!processName && f !== "workers.json") {
1181
- processName = basename2(f, "-workers.json");
1182
- }
1183
- }
1184
- }
1185
- }
1186
- if (!processName && !pidFile && !workersFile) return null;
1187
- if (!processName) processName = "agent";
1188
- return { processName, pidFile, workersFile };
1189
- }
1190
- function auditProcesses(config) {
1191
- const pidFile = auditPidFile(config);
1192
- const systemd = auditSystemd(config);
1193
- const workers = auditWorkers(config);
1194
- const osProcesses = getOsProcesses(config.processName);
1195
- const knownPids = /* @__PURE__ */ new Set();
1196
- if (pidFile?.pid && !pidFile.stale) knownPids.add(pidFile.pid);
1197
- if (workers) {
1198
- if (workers.orchestratorPid) knownPids.add(workers.orchestratorPid);
1199
- for (const w of workers.workers) {
1200
- if (w.pid) knownPids.add(w.pid);
1201
- }
1202
- }
1203
- if (systemd?.mainPid) knownPids.add(systemd.mainPid);
1204
- const orphans = osProcesses.filter((p) => !knownPids.has(p.pid));
1205
- const problems = [];
1206
- if (pidFile?.stale) problems.push(`Stale PID file: ${pidFile.reason}`);
1207
- if (systemd?.crashLooping) problems.push("Systemd unit is crash-looping (auto-restart)");
1208
- if (systemd?.failed) problems.push("Systemd unit has failed");
1209
- if (systemd && systemd.restarts > 10) problems.push(`High systemd restart count: ${systemd.restarts}`);
1210
- if (pidFile?.pid && systemd?.mainPid && pidFile.pid !== systemd.mainPid) {
1211
- problems.push(`PID mismatch: file says ${pidFile.pid}, systemd says ${systemd.mainPid}`);
1212
- }
1213
- if (workers) {
1214
- for (const w of workers.workers) {
1215
- if (w.stale) problems.push(`Worker "${w.name}" (pid ${w.pid}) declares running but is dead`);
1216
- }
1217
- }
1218
- if (orphans.length > 0) problems.push(`${orphans.length} orphan process(es) not tracked by PID file or workers registry`);
1219
- return { pidFile, systemd, workers, osProcesses, orphans, problems };
1220
- }
1221
- function formatAuditReport(result) {
1222
- const lines = [];
1223
- lines.push("");
1224
- lines.push("\u2554\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2557");
1225
- lines.push("\u2551 \u{1F50D} P R O C E S S A U D I T \u2551");
1226
- lines.push("\u255A\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u255D");
1227
- if (result.pidFile) {
1228
- const pf = result.pidFile;
1229
- const icon = pf.pid && pf.alive && pf.matchesProcess ? "\u2705" : pf.stale ? "\u26A0\uFE0F " : "\u2139\uFE0F ";
1230
- lines.push(`
1231
- PID File: ${pf.path}`);
1232
- lines.push(` ${icon} ${pf.reason}`);
1233
- }
1234
- if (result.systemd) {
1235
- const sd = result.systemd;
1236
- const icon = sd.activeState === "active" ? "\u{1F7E2}" : sd.crashLooping ? "\u{1F7E1}" : sd.failed ? "\u{1F534}" : "\u26AA";
1237
- lines.push(`
1238
- Systemd: ${sd.unit}`);
1239
- lines.push(` ${icon} State: ${sd.activeState} (${sd.subState}) Result: ${sd.result}`);
1240
- lines.push(` Main PID: ${sd.mainPid || "none"} Restarts: ${sd.restarts}`);
1241
- }
1242
- if (result.workers) {
1243
- const w = result.workers;
1244
- lines.push(`
1245
- Workers (orchestrator pid ${w.orchestratorPid ?? "unknown"} ${w.orchestratorAlive ? "\u2705" : "\u274C"})`);
1246
- for (const worker of w.workers) {
1247
- const icon = worker.declaredStatus === "running" && worker.alive ? "\u{1F7E2}" : worker.stale ? "\u{1F534} STALE" : "\u26AA";
1248
- lines.push(` ${icon} ${worker.name.padEnd(14)} pid=${String(worker.pid ?? "-").padEnd(8)} status=${worker.declaredStatus}`);
1249
- }
1250
- }
1251
- if (result.osProcesses.length > 0) {
1252
- lines.push(`
1253
- OS Processes (${result.osProcesses.length} total)`);
1254
- for (const p of result.osProcesses) {
1255
- lines.push(` PID ${String(p.pid).padEnd(8)} CPU=${p.cpu.padEnd(6)} MEM=${p.mem.padEnd(6)} ${p.command.substring(0, 55)}`);
1256
- }
1257
- }
1258
- if (result.orphans.length > 0) {
1259
- lines.push(`
1260
- \u26A0\uFE0F ${result.orphans.length} ORPHAN PROCESS(ES):`);
1261
- for (const p of result.orphans) {
1262
- lines.push(` PID ${p.pid} \u2014 not tracked by PID file or workers registry`);
1263
- }
1264
- }
1265
- lines.push("");
1266
- if (result.problems.length === 0) {
1267
- lines.push(" \u2705 All checks passed \u2014 no process issues detected.");
1268
- } else {
1269
- lines.push(` \u26A0\uFE0F ${result.problems.length} issue(s):`);
1270
- for (const p of result.problems) {
1271
- lines.push(` \u2022 ${p}`);
1272
- }
1273
- }
1274
- lines.push("");
1275
- return lines.join("\n");
1276
- }
1277
-
1278
1332
  // src/graph-builder.ts
1279
1333
  import { randomUUID } from "crypto";
1280
1334
  function deepFreeze(obj) {
@@ -2459,10 +2513,10 @@ export {
2459
2513
  groupByTraceId,
2460
2514
  stitchTrace,
2461
2515
  getTraceTree,
2462
- startLive,
2463
2516
  discoverProcessConfig,
2464
2517
  auditProcesses,
2465
2518
  formatAuditReport,
2519
+ startLive,
2466
2520
  createGraphBuilder,
2467
2521
  runTraced,
2468
2522
  createTraceStore,