agent-yes 1.122.2 → 1.123.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/default.config.yaml +19 -0
- package/dist/{SUPPORTED_CLIS-BTu2brih.js → SUPPORTED_CLIS-B4O2cFlt.js} +2 -2
- package/dist/SUPPORTED_CLIS-DHkqGoNv.js +8 -0
- package/dist/{agent-yes.config-z-IPzH5U.js → agent-yes.config-D6ycMApr.js} +2 -65
- package/dist/cli.js +6 -6
- package/dist/configShared-C5QaNPnz.js +71 -0
- package/dist/{globalPidIndex-gZuTvTBs.js → globalPidIndex-C7r2m6s7.js} +19 -20
- package/dist/index.js +4 -4
- package/dist/pidStore-C4c2O15q.js +5 -0
- package/dist/{pidStore-B5vBu8Px.js → pidStore-CGKIhaJO.js} +5 -4
- package/dist/reaper-BLVA780B.js +3 -0
- package/dist/{reaper-Dj8R7ltI.js → reaper-BkjPN7mw.js} +24 -2
- package/dist/{remotes-CpGcTr7A.js → remotes-BRCDVnR7.js} +1 -1
- package/dist/{remotes-D2fqaRU8.js → remotes-D8GvSbhf.js} +1 -1
- package/dist/{schedule-DgRrdA_n.js → schedule-DULdIkU9.js} +7 -7
- package/dist/{serve-tn7ZetZs.js → serve-r_2v9EKc.js} +202 -58
- package/dist/{setup-dZhgpNse.js → setup-DHa6fX8M.js} +3 -3
- package/dist/{share-CksllWW-.js → share-YuM6-Q6A.js} +78 -4
- package/dist/{subcommands-D9BWZilr.js → subcommands-B13Kto-u.js} +647 -32
- package/dist/subcommands-Tv6AwUkD.js +7 -0
- package/dist/{tray-DjCIyakK.js → tray-BVnJLThD.js} +1 -1
- package/dist/{ts-CIf0uaR7.js → ts-DgukRoEI.js} +10 -7
- package/dist/{versionChecker-DjxKi4qe.js → versionChecker-BqOr1YqC.js} +2 -2
- package/dist/{workspaceConfig-XP2NEWmV.js → workspaceConfig-BJO4fzEn.js} +1 -1
- package/lab/ui/console-logic.js +222 -10
- package/lab/ui/icon.svg +5 -0
- package/lab/ui/index.html +689 -14
- package/lab/ui/landing.html +276 -0
- package/lab/ui/manifest.webmanifest +14 -0
- package/lab/ui/sw.js +56 -0
- package/package.json +5 -1
- package/ts/agentTree.spec.ts +92 -0
- package/ts/agentTree.ts +149 -0
- package/ts/configShared.ts +4 -0
- package/ts/globalPidIndex.ts +28 -20
- package/ts/idleWaiter.spec.ts +7 -1
- package/ts/index.ts +9 -0
- package/ts/lsWatch.spec.ts +61 -0
- package/ts/lsWatch.ts +94 -0
- package/ts/needsInput.spec.ts +55 -0
- package/ts/needsInput.ts +68 -0
- package/ts/pidStore.ts +3 -0
- package/ts/reaper.spec.ts +26 -2
- package/ts/reaper.ts +25 -0
- package/ts/resultEnvelope.spec.ts +43 -0
- package/ts/resultEnvelope.ts +88 -0
- package/ts/serve.ts +276 -41
- package/ts/share.ts +156 -3
- package/ts/subcommands.ts +0 -0
- package/ts/todoParse.spec.ts +68 -0
- package/ts/todoParse.ts +88 -0
- package/ts/utils.spec.ts +4 -1
- package/dist/SUPPORTED_CLIS-DcOKE9Nz.js +0 -8
- package/dist/pidStore-7y1cTcAE.js +0 -5
- package/dist/reaper-HqcUms2d.js +0 -3
- package/dist/subcommands-D8sHibKu.js +0 -6
package/ts/serve.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { mkdir, open, readFile, writeFile } from "fs/promises";
|
|
2
|
-
import { watch } from "node:fs";
|
|
1
|
+
import { mkdir, open, readFile, stat, writeFile } from "fs/promises";
|
|
2
|
+
import { renameSync, watch, writeFileSync } from "node:fs";
|
|
3
3
|
import { fileURLToPath } from "node:url";
|
|
4
4
|
import { createHash, randomBytes, timingSafeEqual } from "crypto";
|
|
5
5
|
import { homedir, hostname, userInfo } from "os";
|
|
@@ -7,6 +7,7 @@ import path from "path";
|
|
|
7
7
|
import yargs from "yargs";
|
|
8
8
|
import {
|
|
9
9
|
controlCodeFromName,
|
|
10
|
+
extractTaskCounts,
|
|
10
11
|
listRecords,
|
|
11
12
|
readNotes,
|
|
12
13
|
renderRawLog,
|
|
@@ -15,6 +16,8 @@ import {
|
|
|
15
16
|
writeToIpc,
|
|
16
17
|
type CommonOpts,
|
|
17
18
|
} from "./subcommands.ts";
|
|
19
|
+
import { updateGlobalPidStatus } from "./globalPidIndex.ts";
|
|
20
|
+
import { pgidForWrapper } from "./reaper.ts";
|
|
18
21
|
import { SUPPORTED_CLIS } from "./SUPPORTED_CLIS.ts";
|
|
19
22
|
import { getInstalledPackage } from "./versionChecker.ts";
|
|
20
23
|
|
|
@@ -28,6 +31,19 @@ function tokenPath(): string {
|
|
|
28
31
|
return path.join(agentYesHome(), ".serve-token");
|
|
29
32
|
}
|
|
30
33
|
|
|
34
|
+
// Liveness heartbeat for the WebRTC daemon. The native WebRTC stack
|
|
35
|
+
// (node-datachannel) has been observed to freeze the entire JS event loop after
|
|
36
|
+
// long uptime — main thread stuck in a pthread rendezvous — so signaling stays
|
|
37
|
+
// connected but the host answers nobody, and NO in-process timer (self-heal,
|
|
38
|
+
// idle-restart) can recover it because JS has stopped running. The serve loop
|
|
39
|
+
// stamps this file every HEARTBEAT_WRITE_MS; `ay serve healthcheck` reports the
|
|
40
|
+
// daemon unhealthy once it goes stale, and oxmgr's --health-cmd restarts it.
|
|
41
|
+
function heartbeatPath(): string {
|
|
42
|
+
return path.join(agentYesHome(), ".serve-heartbeat");
|
|
43
|
+
}
|
|
44
|
+
const HEARTBEAT_WRITE_MS = 5_000;
|
|
45
|
+
const HEARTBEAT_STALE_MS = 15_000; // event loop is wedged if no stamp this long (3 missed)
|
|
46
|
+
|
|
31
47
|
async function loadOrCreateToken(tokenFlag?: string): Promise<string> {
|
|
32
48
|
if (tokenFlag) return tokenFlag;
|
|
33
49
|
try {
|
|
@@ -320,9 +336,42 @@ async function cmdServeDaemon(sub: string, args: string[]): Promise<number> {
|
|
|
320
336
|
// args after `--`. Both auto-restart on crash by default (pm2) / via the
|
|
321
337
|
// explicit flag (oxmgr).
|
|
322
338
|
const serveArgv = ayServeArgv(effArgs);
|
|
339
|
+
// WebRTC daemons get an oxmgr health watchdog: the native WebRTC stack can
|
|
340
|
+
// freeze the JS event loop (host answers nobody, no in-process timer can
|
|
341
|
+
// recover it), so an EXTERNAL probe of the serve heartbeat is the only thing
|
|
342
|
+
// that can detect+restart it. 15s stale + 3 misses at 10s ≈ 45s to auto-recover.
|
|
343
|
+
const webrtcDaemon = effArgs.some((a) => a.startsWith("--webrtc") || a.startsWith("--share"));
|
|
344
|
+
const oxmgrHealth =
|
|
345
|
+
webrtcDaemon && mgr.id === "oxmgr"
|
|
346
|
+
? [
|
|
347
|
+
"--health-cmd",
|
|
348
|
+
ayServeArgv(["healthcheck"]).join(" "),
|
|
349
|
+
"--health-interval",
|
|
350
|
+
"10",
|
|
351
|
+
"--health-timeout",
|
|
352
|
+
"5",
|
|
353
|
+
"--health-max-failures",
|
|
354
|
+
"3",
|
|
355
|
+
]
|
|
356
|
+
: [];
|
|
323
357
|
const startArgv =
|
|
324
358
|
mgr.id === "oxmgr"
|
|
325
|
-
? [
|
|
359
|
+
? [
|
|
360
|
+
mgr.bin,
|
|
361
|
+
"start",
|
|
362
|
+
serveArgv.join(" "),
|
|
363
|
+
"--name",
|
|
364
|
+
DAEMON_NAME,
|
|
365
|
+
"--restart",
|
|
366
|
+
"always",
|
|
367
|
+
// Persistent daemon: oxmgr's default lifetime cap of 10 restarts would
|
|
368
|
+
// eventually stop respawning it (updates, reboots, the health-watchdog
|
|
369
|
+
// recovering a frozen WebRTC stack). Raise it far out of the way; the
|
|
370
|
+
// crash-restart-limit still guards against a tight crash loop.
|
|
371
|
+
"--max-restarts",
|
|
372
|
+
"1000000",
|
|
373
|
+
...oxmgrHealth,
|
|
374
|
+
]
|
|
326
375
|
: [
|
|
327
376
|
mgr.bin,
|
|
328
377
|
"start",
|
|
@@ -528,6 +577,28 @@ export async function cmdServe(rest: string[]): Promise<number> {
|
|
|
528
577
|
// Daemon subcommands
|
|
529
578
|
const sub = rest[0];
|
|
530
579
|
if (sub === "status") return cmdServeStatus(rest.slice(1));
|
|
580
|
+
if (sub === "healthcheck") {
|
|
581
|
+
// oxmgr --health-cmd liveness probe. Exit non-zero only when the heartbeat is
|
|
582
|
+
// demonstrably stale (event loop wedged), so the manager restarts us. A
|
|
583
|
+
// missing/just-started/unparseable heartbeat is treated as healthy to avoid
|
|
584
|
+
// flapping a daemon that simply hasn't stamped yet.
|
|
585
|
+
try {
|
|
586
|
+
const raw = (await readFile(heartbeatPath(), "utf-8")).trim();
|
|
587
|
+
const ts = Number(raw);
|
|
588
|
+
// Only declare unhealthy on a VALID, genuinely-old stamp. Empty/partial/NaN
|
|
589
|
+
// (Number("") === 0!) is treated as healthy so a torn read or a not-yet-
|
|
590
|
+
// written file can't trigger a false restart. (Writes are atomic via
|
|
591
|
+
// temp+rename, so a torn read shouldn't happen — this is belt-and-braces.)
|
|
592
|
+
const age = Date.now() - ts;
|
|
593
|
+
if (raw.length > 0 && Number.isFinite(ts) && ts > 0 && age > HEARTBEAT_STALE_MS) {
|
|
594
|
+
process.stderr.write(`unhealthy: serve heartbeat stale by ${age}ms\n`);
|
|
595
|
+
return 1;
|
|
596
|
+
}
|
|
597
|
+
} catch {
|
|
598
|
+
/* no heartbeat yet — treat as healthy */
|
|
599
|
+
}
|
|
600
|
+
return 0;
|
|
601
|
+
}
|
|
531
602
|
if (sub === "install" || sub === "uninstall" || sub === "logs") {
|
|
532
603
|
return cmdServeDaemon(sub, rest.slice(1));
|
|
533
604
|
}
|
|
@@ -655,11 +726,40 @@ export async function cmdServe(rest: string[]): Promise<number> {
|
|
|
655
726
|
}
|
|
656
727
|
};
|
|
657
728
|
|
|
658
|
-
// Per-
|
|
659
|
-
//
|
|
660
|
-
//
|
|
661
|
-
//
|
|
662
|
-
//
|
|
729
|
+
// Per-agent task progress ({done,total}) parsed from the agent's rendered TUI
|
|
730
|
+
// screen (the durable raw log). Cached per (size, mtime) exactly like logTitle:
|
|
731
|
+
// re-parse only when the log grew, so the 1s tick stays cheap even though each
|
|
732
|
+
// parse renders a log window through xterm. Works for every CLI — the source is
|
|
733
|
+
// the drawn todo block, not a CLI-specific session file. See extractTaskCounts.
|
|
734
|
+
const taskCache = new Map<
|
|
735
|
+
string,
|
|
736
|
+
{ size: number; mtimeMs: number; tasks: { done: number; total: number } | null }
|
|
737
|
+
>();
|
|
738
|
+
const logTasks = async (
|
|
739
|
+
logFile: string | null | undefined,
|
|
740
|
+
): Promise<{ done: number; total: number } | null> => {
|
|
741
|
+
if (!logFile) return null;
|
|
742
|
+
try {
|
|
743
|
+
const { size, mtimeMs } = await stat(logFile);
|
|
744
|
+
const hit = taskCache.get(logFile);
|
|
745
|
+
if (hit && hit.size === size && hit.mtimeMs === mtimeMs) return hit.tasks;
|
|
746
|
+
const tasks = await extractTaskCounts(logFile);
|
|
747
|
+
taskCache.set(logFile, { size, mtimeMs, tasks });
|
|
748
|
+
return tasks;
|
|
749
|
+
} catch {
|
|
750
|
+
return null;
|
|
751
|
+
}
|
|
752
|
+
};
|
|
753
|
+
|
|
754
|
+
// Per-repo git snapshot for the list (branch + dirty/changed + ahead/behind, from
|
|
755
|
+
// one `git status --porcelain --branch`). WATCHER-INVALIDATED, not polled: a read
|
|
756
|
+
// returns the cached snapshot instantly and NEVER spawns `git status` on the
|
|
757
|
+
// request path. A per-repo-root fs watcher recomputes (debounced) only when the
|
|
758
|
+
// repo actually changes, so an idle fleet costs ~0 git processes. The old design
|
|
759
|
+
// forked one `git status` per agent every poll tick — with dozens of agents that
|
|
760
|
+
// concurrent fan-out pinned host load (high load-average, low CPU: fork + I/O,
|
|
761
|
+
// not compute). Modeled on VSCode's git extension: watch + debounce, no interval
|
|
762
|
+
// poll (just a slow safety recompute for events a watcher might miss).
|
|
663
763
|
interface GitInfo {
|
|
664
764
|
branch: string | null;
|
|
665
765
|
dirty: boolean;
|
|
@@ -667,16 +767,11 @@ export async function cmdServe(rest: string[]): Promise<number> {
|
|
|
667
767
|
ahead: number;
|
|
668
768
|
behind: number;
|
|
669
769
|
}
|
|
670
|
-
const
|
|
671
|
-
const
|
|
672
|
-
const
|
|
673
|
-
if (!cwd) return null;
|
|
674
|
-
const now = Date.now();
|
|
675
|
-
const hit = gitCache.get(cwd);
|
|
676
|
-
if (hit && now - hit.at < GIT_TTL_MS) return hit.val;
|
|
677
|
-
let val: GitInfo | null = null;
|
|
770
|
+
const GIT_DEBOUNCE_MS = 800; // coalesce a burst of edits into one recompute
|
|
771
|
+
const GIT_SAFETY_MS = 60_000; // backstop recompute for any missed watch event
|
|
772
|
+
const runGit = async (args: string[], cwd: string): Promise<string | null> => {
|
|
678
773
|
try {
|
|
679
|
-
const proc = Bun.spawn(["git",
|
|
774
|
+
const proc = Bun.spawn(["git", ...args], {
|
|
680
775
|
cwd,
|
|
681
776
|
stdout: "pipe",
|
|
682
777
|
stderr: "ignore",
|
|
@@ -684,23 +779,89 @@ export async function cmdServe(rest: string[]): Promise<number> {
|
|
|
684
779
|
});
|
|
685
780
|
const out = await new Response(proc.stdout).text();
|
|
686
781
|
await proc.exited;
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
782
|
+
return proc.exitCode === 0 ? out : null;
|
|
783
|
+
} catch {
|
|
784
|
+
return null; // git missing, not a repo, or timed out
|
|
785
|
+
}
|
|
786
|
+
};
|
|
787
|
+
const parseGitStatus = (out: string): GitInfo => {
|
|
788
|
+
const lines = out.split("\n");
|
|
789
|
+
// Branch header, e.g. "## main...origin/main [ahead 1, behind 2]", "## main"
|
|
790
|
+
// (no upstream), "## HEAD (no branch)", or "## No commits yet on x".
|
|
791
|
+
const h = /^## (.+)$/.exec(lines[0] ?? "")?.[1] ?? "";
|
|
792
|
+
const unborn = /^No commits yet on (.+)$/.exec(h);
|
|
793
|
+
const branch = unborn ? unborn[1]! : /^(.+?)(?:\.\.\.|\s|$)/.exec(h)?.[1] || null;
|
|
794
|
+
const ahead = Number(/\bahead (\d+)/.exec(h)?.[1] ?? 0);
|
|
795
|
+
const behind = Number(/\bbehind (\d+)/.exec(h)?.[1] ?? 0);
|
|
796
|
+
const changed = lines.slice(1).filter((l) => l.trim().length > 0).length;
|
|
797
|
+
return { branch, dirty: changed > 0, changed, ahead, behind };
|
|
798
|
+
};
|
|
799
|
+
// cwd -> repo root ("" = resolved, not a repo). Resolved once per cwd via a cheap
|
|
800
|
+
// `git rev-parse --show-toplevel` (no tree scan) and cached, so many agents in the
|
|
801
|
+
// same repo (or its submodules/subdirs) share one watcher + snapshot.
|
|
802
|
+
const rootOfCwd = new Map<string, string>();
|
|
803
|
+
const resolveRoot = async (cwd: string): Promise<string> => {
|
|
804
|
+
const cached = rootOfCwd.get(cwd);
|
|
805
|
+
if (cached !== undefined) return cached;
|
|
806
|
+
const root = ((await runGit(["rev-parse", "--show-toplevel"], cwd)) ?? "").trim();
|
|
807
|
+
rootOfCwd.set(cwd, root);
|
|
808
|
+
return root;
|
|
809
|
+
};
|
|
810
|
+
interface RepoWatch {
|
|
811
|
+
val: GitInfo | null;
|
|
812
|
+
busy: boolean;
|
|
813
|
+
timer: ReturnType<typeof setTimeout> | null;
|
|
814
|
+
}
|
|
815
|
+
const repoWatch = new Map<string, RepoWatch>();
|
|
816
|
+
const recompute = (root: string, rw: RepoWatch) => {
|
|
817
|
+
if (rw.timer) return; // a recompute is already queued (debounce + throttle)
|
|
818
|
+
rw.timer = setTimeout(async () => {
|
|
819
|
+
rw.timer = null;
|
|
820
|
+
if (rw.busy) return void recompute(root, rw); // re-arm if one is in flight
|
|
821
|
+
rw.busy = true;
|
|
822
|
+
try {
|
|
823
|
+
const out = await runGit(["status", "--porcelain", "--branch"], root);
|
|
824
|
+
if (out != null) rw.val = parseGitStatus(out);
|
|
825
|
+
} finally {
|
|
826
|
+
rw.busy = false;
|
|
698
827
|
}
|
|
828
|
+
}, GIT_DEBOUNCE_MS);
|
|
829
|
+
};
|
|
830
|
+
const ensureRepoWatch = (root: string): RepoWatch => {
|
|
831
|
+
const existing = repoWatch.get(root);
|
|
832
|
+
if (existing) return existing;
|
|
833
|
+
const rw: RepoWatch = { val: null, busy: false, timer: null };
|
|
834
|
+
repoWatch.set(root, rw);
|
|
835
|
+
recompute(root, rw); // initial snapshot
|
|
836
|
+
// Ignore high-churn paths that never change `git status` output: our own log
|
|
837
|
+
// dir (.agent-yes, written on every PTY byte — would re-trigger forever),
|
|
838
|
+
// gitignored deps (node_modules), and git's own lock files.
|
|
839
|
+
const onChange = (file: string) => {
|
|
840
|
+
if (file.includes(".agent-yes") || file.includes("node_modules") || file.endsWith(".lock"))
|
|
841
|
+
return;
|
|
842
|
+
recompute(root, rw);
|
|
843
|
+
};
|
|
844
|
+
try {
|
|
845
|
+
// macOS/Windows: one recursive watcher (FSEvents/ReadDirectoryChanges) covers
|
|
846
|
+
// the working tree (dirty) AND .git (branch/ahead-behind) cheaply.
|
|
847
|
+
watch(root, { recursive: true }, (_e, f) => onChange(String(f ?? "")));
|
|
699
848
|
} catch {
|
|
700
|
-
|
|
849
|
+
// Recursive watch unsupported (some Linux/Bun builds): watch .git only —
|
|
850
|
+
// catches commit/branch/stage instantly; dirty count rides the safety tick.
|
|
851
|
+
try {
|
|
852
|
+
watch(path.join(root, ".git"), (_e, f) => onChange(".git/" + String(f ?? "")));
|
|
853
|
+
} catch {
|
|
854
|
+
/* no watcher available — rely solely on the safety recompute */
|
|
855
|
+
}
|
|
701
856
|
}
|
|
702
|
-
|
|
703
|
-
return
|
|
857
|
+
setInterval(() => recompute(root, rw), GIT_SAFETY_MS);
|
|
858
|
+
return rw;
|
|
859
|
+
};
|
|
860
|
+
const gitStatus = async (cwd: string | null | undefined): Promise<GitInfo | null> => {
|
|
861
|
+
if (!cwd) return null;
|
|
862
|
+
const root = await resolveRoot(cwd);
|
|
863
|
+
if (!root) return null; // not a git repo
|
|
864
|
+
return ensureRepoWatch(root).val; // cached — the request path never spawns `git status`
|
|
704
865
|
};
|
|
705
866
|
|
|
706
867
|
// One agent record decorated for the console: the latest OSC title + a git
|
|
@@ -709,6 +870,9 @@ export async function cmdServe(rest: string[]): Promise<number> {
|
|
|
709
870
|
...r,
|
|
710
871
|
title: await logTitle(r.log_file),
|
|
711
872
|
git: r.status === "exited" ? null : await gitStatus(r.cwd),
|
|
873
|
+
// Task progress from the rendered todo block (null when none detected → no
|
|
874
|
+
// badge). Skipped for exited agents — their screen is no longer live.
|
|
875
|
+
tasks: r.status === "exited" ? null : await logTasks(r.log_file),
|
|
712
876
|
});
|
|
713
877
|
|
|
714
878
|
// The whole API as a plain handler: served over HTTP by Bun.serve (--http)
|
|
@@ -1056,6 +1220,59 @@ export async function cmdServe(rest: string[]): Promise<number> {
|
|
|
1056
1220
|
}
|
|
1057
1221
|
}
|
|
1058
1222
|
|
|
1223
|
+
// POST /api/kill body {keyword} — force-kill a stuck agent. The console can
|
|
1224
|
+
// already send keystrokes (Ctrl+C, /exit) via /api/send; this is the escalation
|
|
1225
|
+
// for an agent too wedged to respond to those: a real SIGKILL of its process
|
|
1226
|
+
// GROUP (wrapper + CLI + children), via the pgid the reaper recorded. The >1
|
|
1227
|
+
// guards are critical — process.kill(-1)/kill(0) would signal far too much.
|
|
1228
|
+
if (req.method === "POST" && p === "/api/kill") {
|
|
1229
|
+
let body: { keyword?: string };
|
|
1230
|
+
try {
|
|
1231
|
+
body = (await req.json()) as typeof body;
|
|
1232
|
+
} catch {
|
|
1233
|
+
return new Response("invalid JSON body", { status: 400 });
|
|
1234
|
+
}
|
|
1235
|
+
const keyword = body.keyword;
|
|
1236
|
+
if (!keyword || typeof keyword !== "string")
|
|
1237
|
+
return new Response("missing keyword", { status: 400 });
|
|
1238
|
+
if (process.platform === "win32")
|
|
1239
|
+
return new Response("force-kill unsupported on a Windows serve", { status: 501 });
|
|
1240
|
+
try {
|
|
1241
|
+
const record = await resolveOne(keyword, defaultOpts({ all: true }));
|
|
1242
|
+
const killed: string[] = [];
|
|
1243
|
+
const sig = (target: number, label: string) => {
|
|
1244
|
+
if (!target || target <= 1) return;
|
|
1245
|
+
try {
|
|
1246
|
+
process.kill(target, "SIGKILL");
|
|
1247
|
+
killed.push(label);
|
|
1248
|
+
} catch {
|
|
1249
|
+
/* ESRCH: already gone */
|
|
1250
|
+
}
|
|
1251
|
+
};
|
|
1252
|
+
// Whole process group first (kills children too), then the pids directly in
|
|
1253
|
+
// case they aren't group leaders.
|
|
1254
|
+
const pgid = await pgidForWrapper(record.wrapper_pid ?? 0);
|
|
1255
|
+
if (pgid && pgid > 1) {
|
|
1256
|
+
try {
|
|
1257
|
+
process.kill(-pgid, "SIGKILL");
|
|
1258
|
+
killed.push(`group ${pgid}`);
|
|
1259
|
+
} catch {
|
|
1260
|
+
/* group already gone */
|
|
1261
|
+
}
|
|
1262
|
+
}
|
|
1263
|
+
sig(record.pid, `pid ${record.pid}`);
|
|
1264
|
+
if (record.wrapper_pid && record.wrapper_pid !== record.pid)
|
|
1265
|
+
sig(record.wrapper_pid, `wrapper ${record.wrapper_pid}`);
|
|
1266
|
+
await updateGlobalPidStatus(record.pid, {
|
|
1267
|
+
status: "exited",
|
|
1268
|
+
exit_reason: "force-killed via console",
|
|
1269
|
+
}).catch(() => {});
|
|
1270
|
+
return Response.json({ ok: true, pid: record.pid, killed });
|
|
1271
|
+
} catch (e) {
|
|
1272
|
+
return new Response((e as Error).message, { status: 404 });
|
|
1273
|
+
}
|
|
1274
|
+
}
|
|
1275
|
+
|
|
1059
1276
|
// POST /api/resize/:keyword body {cols, rows} — drive the agent's PTY size.
|
|
1060
1277
|
// Mirrors `ay attach`: write ~/.agent-yes/winsize/<pid> then SIGWINCH; the
|
|
1061
1278
|
// agent's resize listener picks it up and reflows its TUI to that width.
|
|
@@ -1266,19 +1483,37 @@ export async function cmdServe(rest: string[]): Promise<number> {
|
|
|
1266
1483
|
}
|
|
1267
1484
|
}
|
|
1268
1485
|
|
|
1486
|
+
// Liveness heartbeat (WebRTC daemons only — that's where the native stack can
|
|
1487
|
+
// freeze the loop). If the event loop wedges, this interval stops firing, the
|
|
1488
|
+
// file goes stale, and oxmgr's --health-cmd (ay serve healthcheck) restarts us.
|
|
1489
|
+
let heartbeat: ReturnType<typeof setInterval> | undefined;
|
|
1490
|
+
if (wantWebrtc) {
|
|
1491
|
+
const stamp = () => {
|
|
1492
|
+
try {
|
|
1493
|
+
// Atomic: write a temp file then rename over the target, so a concurrent
|
|
1494
|
+
// `ay serve healthcheck` reader never sees a truncated/partial timestamp.
|
|
1495
|
+
const tmp = `${heartbeatPath()}.tmp`;
|
|
1496
|
+
writeFileSync(tmp, String(Date.now()));
|
|
1497
|
+
renameSync(tmp, heartbeatPath());
|
|
1498
|
+
} catch {
|
|
1499
|
+
/* best effort */
|
|
1500
|
+
}
|
|
1501
|
+
};
|
|
1502
|
+
stamp();
|
|
1503
|
+
heartbeat = setInterval(stamp, HEARTBEAT_WRITE_MS);
|
|
1504
|
+
}
|
|
1505
|
+
|
|
1269
1506
|
process.stdout.write(`(Ctrl-C to stop)\n`);
|
|
1270
1507
|
|
|
1508
|
+
const shutdown = (resolve: () => void) => {
|
|
1509
|
+
if (heartbeat) clearInterval(heartbeat);
|
|
1510
|
+
closeShare?.();
|
|
1511
|
+
server?.stop();
|
|
1512
|
+
resolve();
|
|
1513
|
+
};
|
|
1271
1514
|
await new Promise<void>((resolve) => {
|
|
1272
|
-
process.on("SIGINT", () =>
|
|
1273
|
-
|
|
1274
|
-
server?.stop();
|
|
1275
|
-
resolve();
|
|
1276
|
-
});
|
|
1277
|
-
process.on("SIGTERM", () => {
|
|
1278
|
-
closeShare?.();
|
|
1279
|
-
server?.stop();
|
|
1280
|
-
resolve();
|
|
1281
|
-
});
|
|
1515
|
+
process.on("SIGINT", () => shutdown(resolve));
|
|
1516
|
+
process.on("SIGTERM", () => shutdown(resolve));
|
|
1282
1517
|
});
|
|
1283
1518
|
|
|
1284
1519
|
return 0;
|
package/ts/share.ts
CHANGED
|
@@ -37,6 +37,35 @@ const HOST_HEARTBEAT_MS = 20000; // keepalive ping to the rendezvous + silent-dr
|
|
|
37
37
|
// heartbeat never trips. Re-running the hello on a timer forces the DO to
|
|
38
38
|
// re-register us, self-healing that state. Cheap: one reconnect per few minutes.
|
|
39
39
|
const SIG_REFRESH_MS = 4 * 60_000;
|
|
40
|
+
// If building a peer connection fails this many times in a row, the native
|
|
41
|
+
// WebRTC stack (node-datachannel) is wedged — observed after long daemon uptime:
|
|
42
|
+
// signaling stays connected and peer-joins arrive, but every createOffer fails,
|
|
43
|
+
// so the host silently answers nobody and the room looks "offline". Reconnecting
|
|
44
|
+
// the socket can't clear it; only a fresh process can. Exit so the service
|
|
45
|
+
// manager restarts us with a clean stack (a fresh process provably works).
|
|
46
|
+
const MAX_PEER_SETUP_FAILURES = 3;
|
|
47
|
+
// Serialize peer setup. The recurring live freeze is an upstream libdatachannel
|
|
48
|
+
// 0.24.2 deadlock tripped by a RECONNECT STORM — several browser tabs/devices all
|
|
49
|
+
// reconnecting at once (e.g. right after the daemon restarts) fire many concurrent
|
|
50
|
+
// real-DTLS handshakes, which wedges the native stack. Process peer-joins one at a
|
|
51
|
+
// time with a small gap so a burst is staggered instead of simultaneous.
|
|
52
|
+
const PEER_JOIN_GAP_MS = 300;
|
|
53
|
+
// Abandon a peer setup that doesn't settle in time (a wedged native createOffer
|
|
54
|
+
// would otherwise stall the whole serial queue) — counts as a setup failure so
|
|
55
|
+
// the self-heal can fire. And cap the queue so a pathological storm can't grow it
|
|
56
|
+
// unboundedly; dropped joins simply retry via the browser's reconnect.
|
|
57
|
+
const STARTPEER_TIMEOUT_MS = 10_000;
|
|
58
|
+
const MAX_PEER_JOIN_QUEUE = 50;
|
|
59
|
+
// Proactively recycle the process to PREVENT the node-datachannel freeze rather
|
|
60
|
+
// than just recover from it. The wedge has been observed at ~60-90min uptime, so
|
|
61
|
+
// restarting well before that keeps the native stack fresh and the freeze rare.
|
|
62
|
+
// Two thresholds (relies on the daemon's `--restart always` policy):
|
|
63
|
+
// - IDLE: when there are no active peers, refresh early during a quiet moment.
|
|
64
|
+
// - HARD: a ceiling that fires even mid-session (closing peers gracefully so
|
|
65
|
+
// browsers reconnect in ~1s) — a ~1s blip every ~45min beats a ~90s freeze.
|
|
66
|
+
const IDLE_RESTART_UPTIME_MS = 25 * 60_000;
|
|
67
|
+
const HARD_RESTART_UPTIME_MS = 45 * 60_000;
|
|
68
|
+
const IDLE_RESTART_CHECK_MS = 60_000;
|
|
40
69
|
|
|
41
70
|
type IceServer = { urls: string | string[]; username?: string; credential?: string };
|
|
42
71
|
const STUN: IceServer[] = [{ urls: "stun:stun.l.google.com:19302" }];
|
|
@@ -191,6 +220,14 @@ async function ensureAddon(ndDir: string): Promise<void> {
|
|
|
191
220
|
}
|
|
192
221
|
}
|
|
193
222
|
|
|
223
|
+
// NOTE on the node-datachannel freeze: holding libdatachannel's global init alive
|
|
224
|
+
// via preload() was tried here — it prevents a *loopback* churn freeze (rapid
|
|
225
|
+
// create/close, reproduced at ~1400 cycles) but did NOT fix the live freeze, which
|
|
226
|
+
// is a separate upstream libdatachannel 0.24.2 deadlock triggered by real browser
|
|
227
|
+
// DTLS connections (esp. reconnection storms across multiple tabs). preload() is
|
|
228
|
+
// unproven under that live load, so it's not used; the proactive recycle + oxmgr
|
|
229
|
+
// health watchdog remain the mitigation until node-datachannel ships libdatachannel
|
|
230
|
+
// >0.24.2 (cf upstream #1538/#1548).
|
|
194
231
|
async function importRTC(): Promise<any> {
|
|
195
232
|
// Ensure the native addon is on disk before the first import — a failed
|
|
196
233
|
// dynamic import is cached by Bun, so post-import healing can't recover it.
|
|
@@ -243,7 +280,8 @@ export async function startShare(
|
|
|
243
280
|
let S = firstS;
|
|
244
281
|
|
|
245
282
|
const wsScheme = host.startsWith("localhost") || host.startsWith("127.") ? "ws" : "wss";
|
|
246
|
-
|
|
283
|
+
// The console web-app is served under /w/ (landing page lives at /).
|
|
284
|
+
const ui = host === "s.agent-yes.com" ? "https://agent-yes.com/w" : "http://localhost:7778/w";
|
|
247
285
|
const suffix = host === "s.agent-yes.com" ? "" : "@" + host;
|
|
248
286
|
const mkLink = () => `${ui}/#${room}:${MARKER}${S}${suffix}`;
|
|
249
287
|
let authToken = await deriveAuthToken(S, room, host);
|
|
@@ -299,6 +337,61 @@ export async function startShare(
|
|
|
299
337
|
const peers = new Map<string, Peer>();
|
|
300
338
|
let closed = false; // set by close(); stops signaling reconnect + new peers
|
|
301
339
|
let currentWs: WebSocket | undefined; // the live rendezvous socket, for close()
|
|
340
|
+
let peerSetupFailures = 0; // consecutive startPeer() throws — see MAX_PEER_SETUP_FAILURES
|
|
341
|
+
|
|
342
|
+
// Serial peer-join queue (see PEER_JOIN_GAP_MS): drain one at a time so a
|
|
343
|
+
// reconnect storm can't fire many concurrent DTLS handshakes and wedge the
|
|
344
|
+
// native stack. startPeer() awaits its async steps, yielding the event loop
|
|
345
|
+
// between peers, so this staggers setup without blocking the loop / heartbeat.
|
|
346
|
+
const peerJoinQueue: string[] = [];
|
|
347
|
+
let drainingPeerJoins = false;
|
|
348
|
+
const drainPeerJoins = async () => {
|
|
349
|
+
if (drainingPeerJoins) return;
|
|
350
|
+
drainingPeerJoins = true;
|
|
351
|
+
try {
|
|
352
|
+
while (!closed && peerJoinQueue.length) {
|
|
353
|
+
const peerId = peerJoinQueue.shift()!;
|
|
354
|
+
const ws = currentWs; // send offer/candidates on the live socket
|
|
355
|
+
if (!ws) continue;
|
|
356
|
+
try {
|
|
357
|
+
// Bound it: a wedged native createOffer must not stall the queue forever.
|
|
358
|
+
let timer: ReturnType<typeof setTimeout>;
|
|
359
|
+
const setup = startPeer(ws, peerId);
|
|
360
|
+
// If it times out, startPeer keeps running (native calls can't be
|
|
361
|
+
// cancelled) and may settle later — swallow that so it isn't an
|
|
362
|
+
// unhandled rejection; startPeer itself no-ops a late offer (peer guard).
|
|
363
|
+
setup.catch(() => {});
|
|
364
|
+
await Promise.race([
|
|
365
|
+
setup,
|
|
366
|
+
new Promise((_, reject) => {
|
|
367
|
+
timer = setTimeout(
|
|
368
|
+
() => reject(new Error("startPeer timeout")),
|
|
369
|
+
STARTPEER_TIMEOUT_MS,
|
|
370
|
+
);
|
|
371
|
+
}),
|
|
372
|
+
]).finally(() => clearTimeout(timer!));
|
|
373
|
+
peerSetupFailures = 0; // a delivered offer proves the WebRTC stack works
|
|
374
|
+
} catch (err) {
|
|
375
|
+
// Don't swallow this: a failed createOffer is why a long-up host goes
|
|
376
|
+
// silently "offline". Surface it, and if it keeps failing, self-heal.
|
|
377
|
+
peerSetupFailures++;
|
|
378
|
+
process.stderr.write(
|
|
379
|
+
`[share] peer setup failed (${peerSetupFailures}/${MAX_PEER_SETUP_FAILURES}): ${(err as Error)?.message ?? err}\n`,
|
|
380
|
+
);
|
|
381
|
+
closePeer(peerId);
|
|
382
|
+
if (peerSetupFailures >= MAX_PEER_SETUP_FAILURES) {
|
|
383
|
+
process.stderr.write(
|
|
384
|
+
"[share] WebRTC stack wedged after repeated peer-setup failures — exiting so the service manager restarts with a fresh stack\n",
|
|
385
|
+
);
|
|
386
|
+
process.exit(1);
|
|
387
|
+
}
|
|
388
|
+
}
|
|
389
|
+
if (peerJoinQueue.length) await new Promise((r) => setTimeout(r, PEER_JOIN_GAP_MS));
|
|
390
|
+
}
|
|
391
|
+
} finally {
|
|
392
|
+
drainingPeerJoins = false;
|
|
393
|
+
}
|
|
394
|
+
};
|
|
302
395
|
|
|
303
396
|
const connectSignaling = (onReady: () => void) => {
|
|
304
397
|
if (closed) return; // a reconnect timer queued before close() must not revive it
|
|
@@ -355,8 +448,20 @@ export async function startShare(
|
|
|
355
448
|
lastRecv = Date.now();
|
|
356
449
|
const m = JSON.parse(ev.data as string);
|
|
357
450
|
if (m.type === "pong") return; // heartbeat ack — liveness already recorded
|
|
358
|
-
if (m.type === "peer-join")
|
|
359
|
-
|
|
451
|
+
if (m.type === "peer-join") {
|
|
452
|
+
// Serialized in drainPeerJoins() to avoid a storm. Skip dupes (already
|
|
453
|
+
// queued or already an active peer) and cap the queue so a pathological
|
|
454
|
+
// burst can't grow it unboundedly — dropped joins retry via the browser.
|
|
455
|
+
const pid = String(m.peer);
|
|
456
|
+
if (
|
|
457
|
+
!peers.has(pid) &&
|
|
458
|
+
!peerJoinQueue.includes(pid) &&
|
|
459
|
+
peerJoinQueue.length < MAX_PEER_JOIN_QUEUE
|
|
460
|
+
) {
|
|
461
|
+
peerJoinQueue.push(pid);
|
|
462
|
+
drainPeerJoins();
|
|
463
|
+
}
|
|
464
|
+
} else if (m.type === "answer") {
|
|
360
465
|
const peer = peers.get(m.from);
|
|
361
466
|
if (!peer) return;
|
|
362
467
|
try {
|
|
@@ -467,6 +572,22 @@ export async function startShare(
|
|
|
467
572
|
};
|
|
468
573
|
const offer = await pc.createOffer();
|
|
469
574
|
await pc.setLocalDescription(offer);
|
|
575
|
+
// The setup may have been abandoned while we built the offer. If THIS peer is
|
|
576
|
+
// no longer the map entry (serial-queue timeout closed it / a peer-leave
|
|
577
|
+
// arrived and its createOffer only just now resolved), close only this
|
|
578
|
+
// orphaned pc — don't closePeer(peerId) by id, which could hit a different
|
|
579
|
+
// entry. If it's still us but the socket was recycled (SIG_REFRESH_MS), drop
|
|
580
|
+
// cleanly; the browser re-joins on the fresh socket.
|
|
581
|
+
if (peers.get(peerId) !== peer) {
|
|
582
|
+
try {
|
|
583
|
+
peer.pc.close();
|
|
584
|
+
} catch {}
|
|
585
|
+
return;
|
|
586
|
+
}
|
|
587
|
+
if (ws.readyState !== WebSocket.OPEN) {
|
|
588
|
+
closePeer(peerId);
|
|
589
|
+
return;
|
|
590
|
+
}
|
|
470
591
|
// Hand the browser the same ICE servers (incl. the short-lived TURN creds)
|
|
471
592
|
// so it can relay too when there's no direct path.
|
|
472
593
|
ws.send(
|
|
@@ -605,12 +726,44 @@ export async function startShare(
|
|
|
605
726
|
await new Promise<void>((resolve) => connectSignaling(resolve));
|
|
606
727
|
void minted; // (informational) caller decides how to surface the link
|
|
607
728
|
|
|
729
|
+
// Proactive restart to PREVENT the node-datachannel freeze (~60-90min onset):
|
|
730
|
+
// refresh the native stack well before it can wedge. Idle path refreshes early
|
|
731
|
+
// during a quiet moment; the hard ceiling fires even mid-session, closing peers
|
|
732
|
+
// gracefully first so browsers reconnect in ~1s (a tiny blip beats a ~90s
|
|
733
|
+
// freeze). Daemon-only (non-TTY): a foreground `ay serve` has no restart
|
|
734
|
+
// manager, so exiting would just stop sharing — and the user is there to act.
|
|
735
|
+
const startedAt = Date.now();
|
|
736
|
+
const proactiveRestart = process.stdout.isTTY
|
|
737
|
+
? undefined
|
|
738
|
+
: setInterval(() => {
|
|
739
|
+
if (closed) return;
|
|
740
|
+
const up = Date.now() - startedAt;
|
|
741
|
+
if (peers.size === 0 && up > IDLE_RESTART_UPTIME_MS) {
|
|
742
|
+
process.stderr.write("[share] proactive restart (idle): refreshing the WebRTC stack\n");
|
|
743
|
+
process.exit(0); // `--restart always` brings us back with a fresh stack
|
|
744
|
+
} else if (up > HARD_RESTART_UPTIME_MS) {
|
|
745
|
+
process.stderr.write(
|
|
746
|
+
"[share] proactive restart (max uptime): closing peers, refreshing the WebRTC stack\n",
|
|
747
|
+
);
|
|
748
|
+
// graceful: DataChannel close → browsers reconnect to the fresh process.
|
|
749
|
+
// finally-guard the exit so a throw in close() can't leave us dead-but-
|
|
750
|
+
// not-respawned.
|
|
751
|
+
try {
|
|
752
|
+
close();
|
|
753
|
+
} finally {
|
|
754
|
+
setTimeout(() => process.exit(0), 250); // let close frames flush first
|
|
755
|
+
}
|
|
756
|
+
}
|
|
757
|
+
}, IDLE_RESTART_CHECK_MS);
|
|
758
|
+
proactiveRestart?.unref?.(); // don't keep the event loop alive on this timer alone
|
|
759
|
+
|
|
608
760
|
// Clean shutdown: stop the rendezvous (so it can't reconnect or accept new
|
|
609
761
|
// peers) and close every peer connection so browsers get an immediate
|
|
610
762
|
// DataChannel close and reconnect right away, instead of waiting out the
|
|
611
763
|
// ~15-30s ICE timeout that an abrupt process exit would otherwise force.
|
|
612
764
|
const close = () => {
|
|
613
765
|
closed = true;
|
|
766
|
+
if (proactiveRestart) clearInterval(proactiveRestart);
|
|
614
767
|
try {
|
|
615
768
|
currentWs?.close();
|
|
616
769
|
} catch {
|
package/ts/subcommands.ts
CHANGED
|
Binary file
|