agent-yes 1.122.2 → 1.123.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/default.config.yaml +19 -0
  2. package/dist/{SUPPORTED_CLIS-BTu2brih.js → SUPPORTED_CLIS-B4O2cFlt.js} +2 -2
  3. package/dist/SUPPORTED_CLIS-DHkqGoNv.js +8 -0
  4. package/dist/{agent-yes.config-z-IPzH5U.js → agent-yes.config-D6ycMApr.js} +2 -65
  5. package/dist/cli.js +6 -6
  6. package/dist/configShared-C5QaNPnz.js +71 -0
  7. package/dist/{globalPidIndex-gZuTvTBs.js → globalPidIndex-C7r2m6s7.js} +19 -20
  8. package/dist/index.js +4 -4
  9. package/dist/pidStore-C4c2O15q.js +5 -0
  10. package/dist/{pidStore-B5vBu8Px.js → pidStore-CGKIhaJO.js} +5 -4
  11. package/dist/reaper-BLVA780B.js +3 -0
  12. package/dist/{reaper-Dj8R7ltI.js → reaper-BkjPN7mw.js} +24 -2
  13. package/dist/{remotes-CpGcTr7A.js → remotes-BRCDVnR7.js} +1 -1
  14. package/dist/{remotes-D2fqaRU8.js → remotes-D8GvSbhf.js} +1 -1
  15. package/dist/{schedule-DgRrdA_n.js → schedule-DULdIkU9.js} +7 -7
  16. package/dist/{serve-tn7ZetZs.js → serve-r_2v9EKc.js} +202 -58
  17. package/dist/{setup-dZhgpNse.js → setup-DHa6fX8M.js} +3 -3
  18. package/dist/{share-CksllWW-.js → share-YuM6-Q6A.js} +78 -4
  19. package/dist/{subcommands-D9BWZilr.js → subcommands-B13Kto-u.js} +647 -32
  20. package/dist/subcommands-Tv6AwUkD.js +7 -0
  21. package/dist/{tray-DjCIyakK.js → tray-BVnJLThD.js} +1 -1
  22. package/dist/{ts-CIf0uaR7.js → ts-DgukRoEI.js} +10 -7
  23. package/dist/{versionChecker-DjxKi4qe.js → versionChecker-BqOr1YqC.js} +2 -2
  24. package/dist/{workspaceConfig-XP2NEWmV.js → workspaceConfig-BJO4fzEn.js} +1 -1
  25. package/lab/ui/console-logic.js +222 -10
  26. package/lab/ui/icon.svg +5 -0
  27. package/lab/ui/index.html +689 -14
  28. package/lab/ui/landing.html +276 -0
  29. package/lab/ui/manifest.webmanifest +14 -0
  30. package/lab/ui/sw.js +56 -0
  31. package/package.json +5 -1
  32. package/ts/agentTree.spec.ts +92 -0
  33. package/ts/agentTree.ts +149 -0
  34. package/ts/configShared.ts +4 -0
  35. package/ts/globalPidIndex.ts +28 -20
  36. package/ts/idleWaiter.spec.ts +7 -1
  37. package/ts/index.ts +9 -0
  38. package/ts/lsWatch.spec.ts +61 -0
  39. package/ts/lsWatch.ts +94 -0
  40. package/ts/needsInput.spec.ts +55 -0
  41. package/ts/needsInput.ts +68 -0
  42. package/ts/pidStore.ts +3 -0
  43. package/ts/reaper.spec.ts +26 -2
  44. package/ts/reaper.ts +25 -0
  45. package/ts/resultEnvelope.spec.ts +43 -0
  46. package/ts/resultEnvelope.ts +88 -0
  47. package/ts/serve.ts +276 -41
  48. package/ts/share.ts +156 -3
  49. package/ts/subcommands.ts +0 -0
  50. package/ts/todoParse.spec.ts +68 -0
  51. package/ts/todoParse.ts +88 -0
  52. package/ts/utils.spec.ts +4 -1
  53. package/dist/SUPPORTED_CLIS-DcOKE9Nz.js +0 -8
  54. package/dist/pidStore-7y1cTcAE.js +0 -5
  55. package/dist/reaper-HqcUms2d.js +0 -3
  56. package/dist/subcommands-D8sHibKu.js +0 -6
package/ts/serve.ts CHANGED
@@ -1,5 +1,5 @@
1
- import { mkdir, open, readFile, writeFile } from "fs/promises";
2
- import { watch } from "node:fs";
1
+ import { mkdir, open, readFile, stat, writeFile } from "fs/promises";
2
+ import { renameSync, watch, writeFileSync } from "node:fs";
3
3
  import { fileURLToPath } from "node:url";
4
4
  import { createHash, randomBytes, timingSafeEqual } from "crypto";
5
5
  import { homedir, hostname, userInfo } from "os";
@@ -7,6 +7,7 @@ import path from "path";
7
7
  import yargs from "yargs";
8
8
  import {
9
9
  controlCodeFromName,
10
+ extractTaskCounts,
10
11
  listRecords,
11
12
  readNotes,
12
13
  renderRawLog,
@@ -15,6 +16,8 @@ import {
15
16
  writeToIpc,
16
17
  type CommonOpts,
17
18
  } from "./subcommands.ts";
19
+ import { updateGlobalPidStatus } from "./globalPidIndex.ts";
20
+ import { pgidForWrapper } from "./reaper.ts";
18
21
  import { SUPPORTED_CLIS } from "./SUPPORTED_CLIS.ts";
19
22
  import { getInstalledPackage } from "./versionChecker.ts";
20
23
 
@@ -28,6 +31,19 @@ function tokenPath(): string {
28
31
  return path.join(agentYesHome(), ".serve-token");
29
32
  }
30
33
 
34
+ // Liveness heartbeat for the WebRTC daemon. The native WebRTC stack
35
+ // (node-datachannel) has been observed to freeze the entire JS event loop after
36
+ // long uptime — main thread stuck in a pthread rendezvous — so signaling stays
37
+ // connected but the host answers nobody, and NO in-process timer (self-heal,
38
+ // idle-restart) can recover it because JS has stopped running. The serve loop
39
+ // stamps this file every HEARTBEAT_WRITE_MS; `ay serve healthcheck` reports the
40
+ // daemon unhealthy once it goes stale, and oxmgr's --health-cmd restarts it.
41
+ function heartbeatPath(): string {
42
+ return path.join(agentYesHome(), ".serve-heartbeat");
43
+ }
44
+ const HEARTBEAT_WRITE_MS = 5_000;
45
+ const HEARTBEAT_STALE_MS = 15_000; // event loop is wedged if no stamp this long (3 missed)
46
+
31
47
  async function loadOrCreateToken(tokenFlag?: string): Promise<string> {
32
48
  if (tokenFlag) return tokenFlag;
33
49
  try {
@@ -320,9 +336,42 @@ async function cmdServeDaemon(sub: string, args: string[]): Promise<number> {
320
336
  // args after `--`. Both auto-restart on crash by default (pm2) / via the
321
337
  // explicit flag (oxmgr).
322
338
  const serveArgv = ayServeArgv(effArgs);
339
+ // WebRTC daemons get an oxmgr health watchdog: the native WebRTC stack can
340
+ // freeze the JS event loop (host answers nobody, no in-process timer can
341
+ // recover it), so an EXTERNAL probe of the serve heartbeat is the only thing
342
+ // that can detect+restart it. 15s stale + 3 misses at 10s ≈ 45s to auto-recover.
343
+ const webrtcDaemon = effArgs.some((a) => a.startsWith("--webrtc") || a.startsWith("--share"));
344
+ const oxmgrHealth =
345
+ webrtcDaemon && mgr.id === "oxmgr"
346
+ ? [
347
+ "--health-cmd",
348
+ ayServeArgv(["healthcheck"]).join(" "),
349
+ "--health-interval",
350
+ "10",
351
+ "--health-timeout",
352
+ "5",
353
+ "--health-max-failures",
354
+ "3",
355
+ ]
356
+ : [];
323
357
  const startArgv =
324
358
  mgr.id === "oxmgr"
325
- ? [mgr.bin, "start", serveArgv.join(" "), "--name", DAEMON_NAME, "--restart", "always"]
359
+ ? [
360
+ mgr.bin,
361
+ "start",
362
+ serveArgv.join(" "),
363
+ "--name",
364
+ DAEMON_NAME,
365
+ "--restart",
366
+ "always",
367
+ // Persistent daemon: oxmgr's default lifetime cap of 10 restarts would
368
+ // eventually stop respawning it (updates, reboots, the health-watchdog
369
+ // recovering a frozen WebRTC stack). Raise it far out of the way; the
370
+ // crash-restart-limit still guards against a tight crash loop.
371
+ "--max-restarts",
372
+ "1000000",
373
+ ...oxmgrHealth,
374
+ ]
326
375
  : [
327
376
  mgr.bin,
328
377
  "start",
@@ -528,6 +577,28 @@ export async function cmdServe(rest: string[]): Promise<number> {
528
577
  // Daemon subcommands
529
578
  const sub = rest[0];
530
579
  if (sub === "status") return cmdServeStatus(rest.slice(1));
580
+ if (sub === "healthcheck") {
581
+ // oxmgr --health-cmd liveness probe. Exit non-zero only when the heartbeat is
582
+ // demonstrably stale (event loop wedged), so the manager restarts us. A
583
+ // missing/just-started/unparseable heartbeat is treated as healthy to avoid
584
+ // flapping a daemon that simply hasn't stamped yet.
585
+ try {
586
+ const raw = (await readFile(heartbeatPath(), "utf-8")).trim();
587
+ const ts = Number(raw);
588
+ // Only declare unhealthy on a VALID, genuinely-old stamp. Empty/partial/NaN
589
+ // (Number("") === 0!) is treated as healthy so a torn read or a not-yet-
590
+ // written file can't trigger a false restart. (Writes are atomic via
591
+ // temp+rename, so a torn read shouldn't happen — this is belt-and-braces.)
592
+ const age = Date.now() - ts;
593
+ if (raw.length > 0 && Number.isFinite(ts) && ts > 0 && age > HEARTBEAT_STALE_MS) {
594
+ process.stderr.write(`unhealthy: serve heartbeat stale by ${age}ms\n`);
595
+ return 1;
596
+ }
597
+ } catch {
598
+ /* no heartbeat yet — treat as healthy */
599
+ }
600
+ return 0;
601
+ }
531
602
  if (sub === "install" || sub === "uninstall" || sub === "logs") {
532
603
  return cmdServeDaemon(sub, rest.slice(1));
533
604
  }
@@ -655,11 +726,40 @@ export async function cmdServe(rest: string[]): Promise<number> {
655
726
  }
656
727
  };
657
728
 
658
- // Per-cwd git snapshot for the list: branch + dirty/changed count + ahead/behind
659
- // vs upstream, all from a single `git status --porcelain --branch`. Cached per
660
- // cwd with a short TTL so the 1s subscribe tick (and /api/ls polls) spawn at most
661
- // one git per repo every few seconds — agents sharing a cwd share the result.
662
- // Non-git dirs, errors, and timeouts cache as null.
729
+ // Per-agent task progress ({done,total}) parsed from the agent's rendered TUI
730
+ // screen (the durable raw log). Cached per (size, mtime) exactly like logTitle:
731
+ // re-parse only when the log grew, so the 1s tick stays cheap even though each
732
+ // parse renders a log window through xterm. Works for every CLI — the source is
733
+ // the drawn todo block, not a CLI-specific session file. See extractTaskCounts.
734
+ const taskCache = new Map<
735
+ string,
736
+ { size: number; mtimeMs: number; tasks: { done: number; total: number } | null }
737
+ >();
738
+ const logTasks = async (
739
+ logFile: string | null | undefined,
740
+ ): Promise<{ done: number; total: number } | null> => {
741
+ if (!logFile) return null;
742
+ try {
743
+ const { size, mtimeMs } = await stat(logFile);
744
+ const hit = taskCache.get(logFile);
745
+ if (hit && hit.size === size && hit.mtimeMs === mtimeMs) return hit.tasks;
746
+ const tasks = await extractTaskCounts(logFile);
747
+ taskCache.set(logFile, { size, mtimeMs, tasks });
748
+ return tasks;
749
+ } catch {
750
+ return null;
751
+ }
752
+ };
753
+
754
+ // Per-repo git snapshot for the list (branch + dirty/changed + ahead/behind, from
755
+ // one `git status --porcelain --branch`). WATCHER-INVALIDATED, not polled: a read
756
+ // returns the cached snapshot instantly and NEVER spawns `git status` on the
757
+ // request path. A per-repo-root fs watcher recomputes (debounced) only when the
758
+ // repo actually changes, so an idle fleet costs ~0 git processes. The old design
759
+ // forked one `git status` per agent every poll tick — with dozens of agents that
760
+ // concurrent fan-out pinned host load (high load-average, low CPU: fork + I/O,
761
+ // not compute). Modeled on VSCode's git extension: watch + debounce, no interval
762
+ // poll (just a slow safety recompute for events a watcher might miss).
663
763
  interface GitInfo {
664
764
  branch: string | null;
665
765
  dirty: boolean;
@@ -667,16 +767,11 @@ export async function cmdServe(rest: string[]): Promise<number> {
667
767
  ahead: number;
668
768
  behind: number;
669
769
  }
670
- const GIT_TTL_MS = 5000;
671
- const gitCache = new Map<string, { at: number; val: GitInfo | null }>();
672
- const gitStatus = async (cwd: string | null | undefined): Promise<GitInfo | null> => {
673
- if (!cwd) return null;
674
- const now = Date.now();
675
- const hit = gitCache.get(cwd);
676
- if (hit && now - hit.at < GIT_TTL_MS) return hit.val;
677
- let val: GitInfo | null = null;
770
+ const GIT_DEBOUNCE_MS = 800; // coalesce a burst of edits into one recompute
771
+ const GIT_SAFETY_MS = 60_000; // backstop recompute for any missed watch event
772
+ const runGit = async (args: string[], cwd: string): Promise<string | null> => {
678
773
  try {
679
- const proc = Bun.spawn(["git", "status", "--porcelain", "--branch"], {
774
+ const proc = Bun.spawn(["git", ...args], {
680
775
  cwd,
681
776
  stdout: "pipe",
682
777
  stderr: "ignore",
@@ -684,23 +779,89 @@ export async function cmdServe(rest: string[]): Promise<number> {
684
779
  });
685
780
  const out = await new Response(proc.stdout).text();
686
781
  await proc.exited;
687
- if (proc.exitCode === 0) {
688
- const lines = out.split("\n");
689
- // Branch header, e.g. "## main...origin/main [ahead 1, behind 2]",
690
- // "## main" (no upstream), "## HEAD (no branch)", or "## No commits yet on x".
691
- const h = /^## (.+)$/.exec(lines[0] ?? "")?.[1] ?? "";
692
- const unborn = /^No commits yet on (.+)$/.exec(h);
693
- const branch = unborn ? unborn[1]! : /^(.+?)(?:\.\.\.|\s|$)/.exec(h)?.[1] || null;
694
- const ahead = Number(/\bahead (\d+)/.exec(h)?.[1] ?? 0);
695
- const behind = Number(/\bbehind (\d+)/.exec(h)?.[1] ?? 0);
696
- const changed = lines.slice(1).filter((l) => l.trim().length > 0).length;
697
- val = { branch, dirty: changed > 0, changed, ahead, behind };
782
+ return proc.exitCode === 0 ? out : null;
783
+ } catch {
784
+ return null; // git missing, not a repo, or timed out
785
+ }
786
+ };
787
+ const parseGitStatus = (out: string): GitInfo => {
788
+ const lines = out.split("\n");
789
+ // Branch header, e.g. "## main...origin/main [ahead 1, behind 2]", "## main"
790
+ // (no upstream), "## HEAD (no branch)", or "## No commits yet on x".
791
+ const h = /^## (.+)$/.exec(lines[0] ?? "")?.[1] ?? "";
792
+ const unborn = /^No commits yet on (.+)$/.exec(h);
793
+ const branch = unborn ? unborn[1]! : /^(.+?)(?:\.\.\.|\s|$)/.exec(h)?.[1] || null;
794
+ const ahead = Number(/\bahead (\d+)/.exec(h)?.[1] ?? 0);
795
+ const behind = Number(/\bbehind (\d+)/.exec(h)?.[1] ?? 0);
796
+ const changed = lines.slice(1).filter((l) => l.trim().length > 0).length;
797
+ return { branch, dirty: changed > 0, changed, ahead, behind };
798
+ };
799
+ // cwd -> repo root ("" = resolved, not a repo). Resolved once per cwd via a cheap
800
+ // `git rev-parse --show-toplevel` (no tree scan) and cached, so many agents in the
801
+ // same repo (or its submodules/subdirs) share one watcher + snapshot.
802
+ const rootOfCwd = new Map<string, string>();
803
+ const resolveRoot = async (cwd: string): Promise<string> => {
804
+ const cached = rootOfCwd.get(cwd);
805
+ if (cached !== undefined) return cached;
806
+ const root = ((await runGit(["rev-parse", "--show-toplevel"], cwd)) ?? "").trim();
807
+ rootOfCwd.set(cwd, root);
808
+ return root;
809
+ };
810
+ interface RepoWatch {
811
+ val: GitInfo | null;
812
+ busy: boolean;
813
+ timer: ReturnType<typeof setTimeout> | null;
814
+ }
815
+ const repoWatch = new Map<string, RepoWatch>();
816
+ const recompute = (root: string, rw: RepoWatch) => {
817
+ if (rw.timer) return; // a recompute is already queued (debounce + throttle)
818
+ rw.timer = setTimeout(async () => {
819
+ rw.timer = null;
820
+ if (rw.busy) return void recompute(root, rw); // re-arm if one is in flight
821
+ rw.busy = true;
822
+ try {
823
+ const out = await runGit(["status", "--porcelain", "--branch"], root);
824
+ if (out != null) rw.val = parseGitStatus(out);
825
+ } finally {
826
+ rw.busy = false;
698
827
  }
828
+ }, GIT_DEBOUNCE_MS);
829
+ };
830
+ const ensureRepoWatch = (root: string): RepoWatch => {
831
+ const existing = repoWatch.get(root);
832
+ if (existing) return existing;
833
+ const rw: RepoWatch = { val: null, busy: false, timer: null };
834
+ repoWatch.set(root, rw);
835
+ recompute(root, rw); // initial snapshot
836
+ // Ignore high-churn paths that never change `git status` output: our own log
837
+ // dir (.agent-yes, written on every PTY byte — would re-trigger forever),
838
+ // gitignored deps (node_modules), and git's own lock files.
839
+ const onChange = (file: string) => {
840
+ if (file.includes(".agent-yes") || file.includes("node_modules") || file.endsWith(".lock"))
841
+ return;
842
+ recompute(root, rw);
843
+ };
844
+ try {
845
+ // macOS/Windows: one recursive watcher (FSEvents/ReadDirectoryChanges) covers
846
+ // the working tree (dirty) AND .git (branch/ahead-behind) cheaply.
847
+ watch(root, { recursive: true }, (_e, f) => onChange(String(f ?? "")));
699
848
  } catch {
700
- val = null; // git missing, not a repo, or timed out
849
+ // Recursive watch unsupported (some Linux/Bun builds): watch .git only —
850
+ // catches commit/branch/stage instantly; dirty count rides the safety tick.
851
+ try {
852
+ watch(path.join(root, ".git"), (_e, f) => onChange(".git/" + String(f ?? "")));
853
+ } catch {
854
+ /* no watcher available — rely solely on the safety recompute */
855
+ }
701
856
  }
702
- gitCache.set(cwd, { at: now, val });
703
- return val;
857
+ setInterval(() => recompute(root, rw), GIT_SAFETY_MS);
858
+ return rw;
859
+ };
860
+ const gitStatus = async (cwd: string | null | undefined): Promise<GitInfo | null> => {
861
+ if (!cwd) return null;
862
+ const root = await resolveRoot(cwd);
863
+ if (!root) return null; // not a git repo
864
+ return ensureRepoWatch(root).val; // cached — the request path never spawns `git status`
704
865
  };
705
866
 
706
867
  // One agent record decorated for the console: the latest OSC title + a git
@@ -709,6 +870,9 @@ export async function cmdServe(rest: string[]): Promise<number> {
709
870
  ...r,
710
871
  title: await logTitle(r.log_file),
711
872
  git: r.status === "exited" ? null : await gitStatus(r.cwd),
873
+ // Task progress from the rendered todo block (null when none detected → no
874
+ // badge). Skipped for exited agents — their screen is no longer live.
875
+ tasks: r.status === "exited" ? null : await logTasks(r.log_file),
712
876
  });
713
877
 
714
878
  // The whole API as a plain handler: served over HTTP by Bun.serve (--http)
@@ -1056,6 +1220,59 @@ export async function cmdServe(rest: string[]): Promise<number> {
1056
1220
  }
1057
1221
  }
1058
1222
 
1223
+ // POST /api/kill body {keyword} — force-kill a stuck agent. The console can
1224
+ // already send keystrokes (Ctrl+C, /exit) via /api/send; this is the escalation
1225
+ // for an agent too wedged to respond to those: a real SIGKILL of its process
1226
+ // GROUP (wrapper + CLI + children), via the pgid the reaper recorded. The >1
1227
+ // guards are critical — process.kill(-1)/kill(0) would signal far too much.
1228
+ if (req.method === "POST" && p === "/api/kill") {
1229
+ let body: { keyword?: string };
1230
+ try {
1231
+ body = (await req.json()) as typeof body;
1232
+ } catch {
1233
+ return new Response("invalid JSON body", { status: 400 });
1234
+ }
1235
+ const keyword = body.keyword;
1236
+ if (!keyword || typeof keyword !== "string")
1237
+ return new Response("missing keyword", { status: 400 });
1238
+ if (process.platform === "win32")
1239
+ return new Response("force-kill unsupported on a Windows serve", { status: 501 });
1240
+ try {
1241
+ const record = await resolveOne(keyword, defaultOpts({ all: true }));
1242
+ const killed: string[] = [];
1243
+ const sig = (target: number, label: string) => {
1244
+ if (!target || target <= 1) return;
1245
+ try {
1246
+ process.kill(target, "SIGKILL");
1247
+ killed.push(label);
1248
+ } catch {
1249
+ /* ESRCH: already gone */
1250
+ }
1251
+ };
1252
+ // Whole process group first (kills children too), then the pids directly in
1253
+ // case they aren't group leaders.
1254
+ const pgid = await pgidForWrapper(record.wrapper_pid ?? 0);
1255
+ if (pgid && pgid > 1) {
1256
+ try {
1257
+ process.kill(-pgid, "SIGKILL");
1258
+ killed.push(`group ${pgid}`);
1259
+ } catch {
1260
+ /* group already gone */
1261
+ }
1262
+ }
1263
+ sig(record.pid, `pid ${record.pid}`);
1264
+ if (record.wrapper_pid && record.wrapper_pid !== record.pid)
1265
+ sig(record.wrapper_pid, `wrapper ${record.wrapper_pid}`);
1266
+ await updateGlobalPidStatus(record.pid, {
1267
+ status: "exited",
1268
+ exit_reason: "force-killed via console",
1269
+ }).catch(() => {});
1270
+ return Response.json({ ok: true, pid: record.pid, killed });
1271
+ } catch (e) {
1272
+ return new Response((e as Error).message, { status: 404 });
1273
+ }
1274
+ }
1275
+
1059
1276
  // POST /api/resize/:keyword body {cols, rows} — drive the agent's PTY size.
1060
1277
  // Mirrors `ay attach`: write ~/.agent-yes/winsize/<pid> then SIGWINCH; the
1061
1278
  // agent's resize listener picks it up and reflows its TUI to that width.
@@ -1266,19 +1483,37 @@ export async function cmdServe(rest: string[]): Promise<number> {
1266
1483
  }
1267
1484
  }
1268
1485
 
1486
+ // Liveness heartbeat (WebRTC daemons only — that's where the native stack can
1487
+ // freeze the loop). If the event loop wedges, this interval stops firing, the
1488
+ // file goes stale, and oxmgr's --health-cmd (ay serve healthcheck) restarts us.
1489
+ let heartbeat: ReturnType<typeof setInterval> | undefined;
1490
+ if (wantWebrtc) {
1491
+ const stamp = () => {
1492
+ try {
1493
+ // Atomic: write a temp file then rename over the target, so a concurrent
1494
+ // `ay serve healthcheck` reader never sees a truncated/partial timestamp.
1495
+ const tmp = `${heartbeatPath()}.tmp`;
1496
+ writeFileSync(tmp, String(Date.now()));
1497
+ renameSync(tmp, heartbeatPath());
1498
+ } catch {
1499
+ /* best effort */
1500
+ }
1501
+ };
1502
+ stamp();
1503
+ heartbeat = setInterval(stamp, HEARTBEAT_WRITE_MS);
1504
+ }
1505
+
1269
1506
  process.stdout.write(`(Ctrl-C to stop)\n`);
1270
1507
 
1508
+ const shutdown = (resolve: () => void) => {
1509
+ if (heartbeat) clearInterval(heartbeat);
1510
+ closeShare?.();
1511
+ server?.stop();
1512
+ resolve();
1513
+ };
1271
1514
  await new Promise<void>((resolve) => {
1272
- process.on("SIGINT", () => {
1273
- closeShare?.();
1274
- server?.stop();
1275
- resolve();
1276
- });
1277
- process.on("SIGTERM", () => {
1278
- closeShare?.();
1279
- server?.stop();
1280
- resolve();
1281
- });
1515
+ process.on("SIGINT", () => shutdown(resolve));
1516
+ process.on("SIGTERM", () => shutdown(resolve));
1282
1517
  });
1283
1518
 
1284
1519
  return 0;
package/ts/share.ts CHANGED
@@ -37,6 +37,35 @@ const HOST_HEARTBEAT_MS = 20000; // keepalive ping to the rendezvous + silent-dr
37
37
  // heartbeat never trips. Re-running the hello on a timer forces the DO to
38
38
  // re-register us, self-healing that state. Cheap: one reconnect per few minutes.
39
39
  const SIG_REFRESH_MS = 4 * 60_000;
40
+ // If building a peer connection fails this many times in a row, the native
41
+ // WebRTC stack (node-datachannel) is wedged — observed after long daemon uptime:
42
+ // signaling stays connected and peer-joins arrive, but every createOffer fails,
43
+ // so the host silently answers nobody and the room looks "offline". Reconnecting
44
+ // the socket can't clear it; only a fresh process can. Exit so the service
45
+ // manager restarts us with a clean stack (a fresh process provably works).
46
+ const MAX_PEER_SETUP_FAILURES = 3;
47
+ // Serialize peer setup. The recurring live freeze is an upstream libdatachannel
48
+ // 0.24.2 deadlock tripped by a RECONNECT STORM — several browser tabs/devices all
49
+ // reconnecting at once (e.g. right after the daemon restarts) fire many concurrent
50
+ // real-DTLS handshakes, which wedges the native stack. Process peer-joins one at a
51
+ // time with a small gap so a burst is staggered instead of simultaneous.
52
+ const PEER_JOIN_GAP_MS = 300;
53
+ // Abandon a peer setup that doesn't settle in time (a wedged native createOffer
54
+ // would otherwise stall the whole serial queue) — counts as a setup failure so
55
+ // the self-heal can fire. And cap the queue so a pathological storm can't grow it
56
+ // unboundedly; dropped joins simply retry via the browser's reconnect.
57
+ const STARTPEER_TIMEOUT_MS = 10_000;
58
+ const MAX_PEER_JOIN_QUEUE = 50;
59
+ // Proactively recycle the process to PREVENT the node-datachannel freeze rather
60
+ // than just recover from it. The wedge has been observed at ~60-90min uptime, so
61
+ // restarting well before that keeps the native stack fresh and the freeze rare.
62
+ // Two thresholds (relies on the daemon's `--restart always` policy):
63
+ // - IDLE: when there are no active peers, refresh early during a quiet moment.
64
+ // - HARD: a ceiling that fires even mid-session (closing peers gracefully so
65
+ // browsers reconnect in ~1s) — a ~1s blip every ~45min beats a ~90s freeze.
66
+ const IDLE_RESTART_UPTIME_MS = 25 * 60_000;
67
+ const HARD_RESTART_UPTIME_MS = 45 * 60_000;
68
+ const IDLE_RESTART_CHECK_MS = 60_000;
40
69
 
41
70
  type IceServer = { urls: string | string[]; username?: string; credential?: string };
42
71
  const STUN: IceServer[] = [{ urls: "stun:stun.l.google.com:19302" }];
@@ -191,6 +220,14 @@ async function ensureAddon(ndDir: string): Promise<void> {
191
220
  }
192
221
  }
193
222
 
223
+ // NOTE on the node-datachannel freeze: holding libdatachannel's global init alive
224
+ // via preload() was tried here — it prevents a *loopback* churn freeze (rapid
225
+ // create/close, reproduced at ~1400 cycles) but did NOT fix the live freeze, which
226
+ // is a separate upstream libdatachannel 0.24.2 deadlock triggered by real browser
227
+ // DTLS connections (esp. reconnection storms across multiple tabs). preload() is
228
+ // unproven under that live load, so it's not used; the proactive recycle + oxmgr
229
+ // health watchdog remain the mitigation until node-datachannel ships libdatachannel
230
+ // >0.24.2 (cf upstream #1538/#1548).
194
231
  async function importRTC(): Promise<any> {
195
232
  // Ensure the native addon is on disk before the first import — a failed
196
233
  // dynamic import is cached by Bun, so post-import healing can't recover it.
@@ -243,7 +280,8 @@ export async function startShare(
243
280
  let S = firstS;
244
281
 
245
282
  const wsScheme = host.startsWith("localhost") || host.startsWith("127.") ? "ws" : "wss";
246
- const ui = host === "s.agent-yes.com" ? "https://agent-yes.com" : "http://localhost:7778";
283
+ // The console web-app is served under /w/ (landing page lives at /).
284
+ const ui = host === "s.agent-yes.com" ? "https://agent-yes.com/w" : "http://localhost:7778/w";
247
285
  const suffix = host === "s.agent-yes.com" ? "" : "@" + host;
248
286
  const mkLink = () => `${ui}/#${room}:${MARKER}${S}${suffix}`;
249
287
  let authToken = await deriveAuthToken(S, room, host);
@@ -299,6 +337,61 @@ export async function startShare(
299
337
  const peers = new Map<string, Peer>();
300
338
  let closed = false; // set by close(); stops signaling reconnect + new peers
301
339
  let currentWs: WebSocket | undefined; // the live rendezvous socket, for close()
340
+ let peerSetupFailures = 0; // consecutive startPeer() throws — see MAX_PEER_SETUP_FAILURES
341
+
342
+ // Serial peer-join queue (see PEER_JOIN_GAP_MS): drain one at a time so a
343
+ // reconnect storm can't fire many concurrent DTLS handshakes and wedge the
344
+ // native stack. startPeer() awaits its async steps, yielding the event loop
345
+ // between peers, so this staggers setup without blocking the loop / heartbeat.
346
+ const peerJoinQueue: string[] = [];
347
+ let drainingPeerJoins = false;
348
+ const drainPeerJoins = async () => {
349
+ if (drainingPeerJoins) return;
350
+ drainingPeerJoins = true;
351
+ try {
352
+ while (!closed && peerJoinQueue.length) {
353
+ const peerId = peerJoinQueue.shift()!;
354
+ const ws = currentWs; // send offer/candidates on the live socket
355
+ if (!ws) continue;
356
+ try {
357
+ // Bound it: a wedged native createOffer must not stall the queue forever.
358
+ let timer: ReturnType<typeof setTimeout>;
359
+ const setup = startPeer(ws, peerId);
360
+ // If it times out, startPeer keeps running (native calls can't be
361
+ // cancelled) and may settle later — swallow that so it isn't an
362
+ // unhandled rejection; startPeer itself no-ops a late offer (peer guard).
363
+ setup.catch(() => {});
364
+ await Promise.race([
365
+ setup,
366
+ new Promise((_, reject) => {
367
+ timer = setTimeout(
368
+ () => reject(new Error("startPeer timeout")),
369
+ STARTPEER_TIMEOUT_MS,
370
+ );
371
+ }),
372
+ ]).finally(() => clearTimeout(timer!));
373
+ peerSetupFailures = 0; // a delivered offer proves the WebRTC stack works
374
+ } catch (err) {
375
+ // Don't swallow this: a failed createOffer is why a long-up host goes
376
+ // silently "offline". Surface it, and if it keeps failing, self-heal.
377
+ peerSetupFailures++;
378
+ process.stderr.write(
379
+ `[share] peer setup failed (${peerSetupFailures}/${MAX_PEER_SETUP_FAILURES}): ${(err as Error)?.message ?? err}\n`,
380
+ );
381
+ closePeer(peerId);
382
+ if (peerSetupFailures >= MAX_PEER_SETUP_FAILURES) {
383
+ process.stderr.write(
384
+ "[share] WebRTC stack wedged after repeated peer-setup failures — exiting so the service manager restarts with a fresh stack\n",
385
+ );
386
+ process.exit(1);
387
+ }
388
+ }
389
+ if (peerJoinQueue.length) await new Promise((r) => setTimeout(r, PEER_JOIN_GAP_MS));
390
+ }
391
+ } finally {
392
+ drainingPeerJoins = false;
393
+ }
394
+ };
302
395
 
303
396
  const connectSignaling = (onReady: () => void) => {
304
397
  if (closed) return; // a reconnect timer queued before close() must not revive it
@@ -355,8 +448,20 @@ export async function startShare(
355
448
  lastRecv = Date.now();
356
449
  const m = JSON.parse(ev.data as string);
357
450
  if (m.type === "pong") return; // heartbeat ack — liveness already recorded
358
- if (m.type === "peer-join") startPeer(ws, m.peer).catch(() => {});
359
- else if (m.type === "answer") {
451
+ if (m.type === "peer-join") {
452
+ // Serialized in drainPeerJoins() to avoid a storm. Skip dupes (already
453
+ // queued or already an active peer) and cap the queue so a pathological
454
+ // burst can't grow it unboundedly — dropped joins retry via the browser.
455
+ const pid = String(m.peer);
456
+ if (
457
+ !peers.has(pid) &&
458
+ !peerJoinQueue.includes(pid) &&
459
+ peerJoinQueue.length < MAX_PEER_JOIN_QUEUE
460
+ ) {
461
+ peerJoinQueue.push(pid);
462
+ drainPeerJoins();
463
+ }
464
+ } else if (m.type === "answer") {
360
465
  const peer = peers.get(m.from);
361
466
  if (!peer) return;
362
467
  try {
@@ -467,6 +572,22 @@ export async function startShare(
467
572
  };
468
573
  const offer = await pc.createOffer();
469
574
  await pc.setLocalDescription(offer);
575
+ // The setup may have been abandoned while we built the offer. If THIS peer is
576
+ // no longer the map entry (serial-queue timeout closed it / a peer-leave
577
+ // arrived and its createOffer only just now resolved), close only this
578
+ // orphaned pc — don't closePeer(peerId) by id, which could hit a different
579
+ // entry. If it's still us but the socket was recycled (SIG_REFRESH_MS), drop
580
+ // cleanly; the browser re-joins on the fresh socket.
581
+ if (peers.get(peerId) !== peer) {
582
+ try {
583
+ peer.pc.close();
584
+ } catch {}
585
+ return;
586
+ }
587
+ if (ws.readyState !== WebSocket.OPEN) {
588
+ closePeer(peerId);
589
+ return;
590
+ }
470
591
  // Hand the browser the same ICE servers (incl. the short-lived TURN creds)
471
592
  // so it can relay too when there's no direct path.
472
593
  ws.send(
@@ -605,12 +726,44 @@ export async function startShare(
605
726
  await new Promise<void>((resolve) => connectSignaling(resolve));
606
727
  void minted; // (informational) caller decides how to surface the link
607
728
 
729
+ // Proactive restart to PREVENT the node-datachannel freeze (~60-90min onset):
730
+ // refresh the native stack well before it can wedge. Idle path refreshes early
731
+ // during a quiet moment; the hard ceiling fires even mid-session, closing peers
732
+ // gracefully first so browsers reconnect in ~1s (a tiny blip beats a ~90s
733
+ // freeze). Daemon-only (non-TTY): a foreground `ay serve` has no restart
734
+ // manager, so exiting would just stop sharing — and the user is there to act.
735
+ const startedAt = Date.now();
736
+ const proactiveRestart = process.stdout.isTTY
737
+ ? undefined
738
+ : setInterval(() => {
739
+ if (closed) return;
740
+ const up = Date.now() - startedAt;
741
+ if (peers.size === 0 && up > IDLE_RESTART_UPTIME_MS) {
742
+ process.stderr.write("[share] proactive restart (idle): refreshing the WebRTC stack\n");
743
+ process.exit(0); // `--restart always` brings us back with a fresh stack
744
+ } else if (up > HARD_RESTART_UPTIME_MS) {
745
+ process.stderr.write(
746
+ "[share] proactive restart (max uptime): closing peers, refreshing the WebRTC stack\n",
747
+ );
748
+ // graceful: DataChannel close → browsers reconnect to the fresh process.
749
+ // finally-guard the exit so a throw in close() can't leave us dead-but-
750
+ // not-respawned.
751
+ try {
752
+ close();
753
+ } finally {
754
+ setTimeout(() => process.exit(0), 250); // let close frames flush first
755
+ }
756
+ }
757
+ }, IDLE_RESTART_CHECK_MS);
758
+ proactiveRestart?.unref?.(); // don't keep the event loop alive on this timer alone
759
+
608
760
  // Clean shutdown: stop the rendezvous (so it can't reconnect or accept new
609
761
  // peers) and close every peer connection so browsers get an immediate
610
762
  // DataChannel close and reconnect right away, instead of waiting out the
611
763
  // ~15-30s ICE timeout that an abrupt process exit would otherwise force.
612
764
  const close = () => {
613
765
  closed = true;
766
+ if (proactiveRestart) clearInterval(proactiveRestart);
614
767
  try {
615
768
  currentWs?.close();
616
769
  } catch {
package/ts/subcommands.ts CHANGED
Binary file