agent-yes 1.121.0 → 1.122.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/default.config.yaml +27 -4
- package/dist/SUPPORTED_CLIS-DcWAr8NI.js +8 -0
- package/dist/{SUPPORTED_CLIS-O57LGUEG.js → SUPPORTED_CLIS-f50t1rrA.js} +2 -2
- package/dist/{agent-yes.config-kmtJKJHk.js → agent-yes.config-z-IPzH5U.js} +3 -2
- package/dist/cli.js +5 -5
- package/dist/index.js +2 -2
- package/dist/reaper-Dj8R7ltI.js +64 -0
- package/dist/reaper-HqcUms2d.js +3 -0
- package/dist/{remotes-DavR4Hca.js → remotes-CpGcTr7A.js} +1 -1
- package/dist/{remotes-BufkGk0e.js → remotes-D2fqaRU8.js} +1 -1
- package/dist/schedule-OJeQo0Da.js +144 -0
- package/dist/{serve-D2czcYNC.js → serve-O3e2YFfp.js} +137 -36
- package/dist/{setup-f1FIFcZm.js → setup-yKMfadhq.js} +5 -42
- package/dist/{share-B6QVr5D1.js → share-CksllWW-.js} +122 -16
- package/dist/{subcommands-DobVXouH.js → subcommands-BkR-nSAB.js} +2 -2
- package/dist/{subcommands-CzpZQHO6.js → subcommands-CT1z9Jl4.js} +15 -6
- package/dist/{tray-B8_rx1iu.js → tray-DjCIyakK.js} +22 -10
- package/dist/{ts-D91dm1E0.js → ts-DyDU_Dae.js} +76 -7
- package/dist/{versionChecker-CAtpgnoQ.js → versionChecker-DmCadDPY.js} +13 -19
- package/dist/workspaceConfig-XP2NEWmV.js +56 -0
- package/lab/ui/index.html +63 -32
- package/package.json +1 -1
- package/ts/autoRetry.spec.ts +19 -0
- package/ts/autoRetry.ts +16 -0
- package/ts/configShared.ts +4 -0
- package/ts/index.ts +102 -0
- package/ts/oxmgrService.ts +36 -0
- package/ts/pty.ts +19 -1
- package/ts/reaper.spec.ts +45 -0
- package/ts/reaper.ts +77 -0
- package/ts/schedule.spec.ts +30 -0
- package/ts/schedule.ts +161 -0
- package/ts/serve.ts +207 -44
- package/ts/share.ts +171 -22
- package/ts/subcommands.ts +0 -0
- package/ts/tray.spec.ts +9 -1
- package/ts/tray.ts +30 -14
- package/ts/versionChecker.ts +24 -27
- package/dist/SUPPORTED_CLIS-CegJgoEf.js +0 -8
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import { t as agentYesHome } from "./agentYesHome-BvaUOzCV.js";
|
|
2
|
+
import { mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
3
|
+
import { homedir } from "os";
|
|
4
|
+
import path from "path";
|
|
5
|
+
|
|
6
|
+
//#region ts/workspaceConfig.ts
|
|
7
|
+
/** Location of the agent-yes JSON config file: `config.json` inside the agent-yes home directory. */
function configPath() {
  const homeDir = agentYesHome();
  return path.join(homeDir, "config.json");
}
|
|
10
|
+
/**
 * Load the persisted config as a plain object.
 *
 * Returns `{}` when the file cannot be read, is not valid JSON, or holds a
 * JSON value that is not an object (`null`, an array, a number, ...).
 * Callers read and assign properties on the result, so anything other than a
 * plain object would either throw (`null`) or silently lose the assigned keys
 * when stringified again (arrays).
 */
function readConfig() {
  let parsed;
  try {
    parsed = JSON.parse(readFileSync(configPath(), "utf-8"));
  } catch {
    // Missing/unreadable file or malformed JSON: behave as "no config yet".
    return {};
  }
  const isPlainObject = typeof parsed === "object" && parsed !== null && !Array.isArray(parsed);
  return isPlainObject ? parsed : {};
}
|
|
17
|
+
/**
 * Replace a leading home shorthand with the current user's home directory.
 *
 * The input is trimmed first. A bare `~` becomes the home directory; `~/x`
 * or `~\x` becomes `<home>/x`. Anything else (including `~name`) is returned
 * trimmed but otherwise untouched.
 */
function expandTilde(p) {
  const trimmed = p.trim();
  if (trimmed === "~") {
    return homedir();
  }
  const prefix = trimmed.slice(0, 2);
  const isHomeRelative = prefix === "~/" || prefix === "~\\";
  return isHomeRelative ? path.join(homedir(), trimmed.slice(2)) : trimmed;
}
|
|
24
|
+
/**
 * The configured workspace root, or the user's home directory when unset.
 *
 * The stored value is returned exactly as it appears in the config file.
 * The config is a hand-editable JSON file, so `workspace` may be missing,
 * blank, or not a string at all; every such case falls back to the home
 * directory instead of throwing on `.trim()`.
 */
function getWorkspaceRoot() {
  const workspace = readConfig().workspace;
  if (typeof workspace === "string" && workspace.trim() !== "") {
    return workspace;
  }
  return homedir();
}
|
|
29
|
+
/**
 * Store a new workspace root in the config file and return it.
 *
 * The given directory is tilde-expanded and resolved to an absolute path,
 * merged into the existing config, and written back as pretty-printed JSON.
 * The agent-yes home directory is created first if it does not exist.
 */
function setWorkspaceRoot(dir) {
  const resolved = path.resolve(expandTilde(dir));
  const updated = readConfig();
  updated.workspace = resolved;
  mkdirSync(agentYesHome(), { recursive: true });
  const target = configPath();
  const serialized = JSON.stringify(updated, null, 2);
  writeFileSync(target, serialized);
  return resolved;
}
|
|
38
|
+
/**
 * Turn a user-supplied spawn location into a working directory:
 * - nullish / blank              → the workspace root
 * - starts with `~`              → tilde-expanded, then resolved
 * - contains `/` or `\`, or is
 *   absolute                     → resolved as given
 * - anything else (a bare name)  → `<workspace>/<name>`
 */
function resolveSpawnCwd(input) {
  const root = getWorkspaceRoot();
  const location = (input ?? "").trim();
  if (location === "") {
    return root;
  }
  if (location.startsWith("~")) {
    return path.resolve(expandTilde(location));
  }
  const looksLikePath = /[\\/]/.test(location) || path.isAbsolute(location);
  return looksLikePath ? path.resolve(location) : path.join(root, location);
}
|
|
53
|
+
|
|
54
|
+
//#endregion
|
|
55
|
+
export { resolveSpawnCwd as n, setWorkspaceRoot as r, getWorkspaceRoot as t };
|
|
56
|
+
//# sourceMappingURL=workspaceConfig-XP2NEWmV.js.map
|
package/lab/ui/index.html
CHANGED
|
@@ -990,38 +990,45 @@
|
|
|
990
990
|
if (m.type === "welcome") {
|
|
991
991
|
if (this._v2 && m.v !== 2)
|
|
992
992
|
return fail(new Error("host is running an old agent-yes — ask it to upgrade"));
|
|
993
|
-
pc
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
this.
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
await this._keysReady;
|
|
1008
|
-
// Open the bidirectional confirmation handshake.
|
|
1009
|
-
this._dcSend(FLAG_CONFIRM, { t: "confirm", nonce: this._myNonce });
|
|
1010
|
-
this._confirmTimer = setTimeout(() => {
|
|
1011
|
-
if (!this._confirmed) fail(new Error("key confirmation timed out"));
|
|
1012
|
-
}, CONFIRM_TIMEOUT_MS);
|
|
1013
|
-
} catch (err) {
|
|
1014
|
-
fail(err);
|
|
1015
|
-
}
|
|
993
|
+
// pc is created on the offer below so it can use the host-supplied
|
|
994
|
+
// iceServers (incl. short-lived TURN creds for relaying behind NAT).
|
|
995
|
+
} else if (m.type === "offer") {
|
|
996
|
+
if (!pc) {
|
|
997
|
+
pc = new RTCPeerConnection({
|
|
998
|
+
iceServers:
|
|
999
|
+
m.iceServers && m.iceServers.length
|
|
1000
|
+
? m.iceServers
|
|
1001
|
+
: [{ urls: "stun:stun.l.google.com:19302" }],
|
|
1002
|
+
});
|
|
1003
|
+
this.pc = pc;
|
|
1004
|
+
pc.onicecandidate = (e) => {
|
|
1005
|
+
if (e.candidate)
|
|
1006
|
+
ws.send(JSON.stringify({ type: "candidate", candidate: e.candidate }));
|
|
1016
1007
|
};
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1008
|
+
pc.onconnectionstatechange = () => this.onstate(pc.connectionState);
|
|
1009
|
+
pc.ondatachannel = (e) => {
|
|
1010
|
+
this.dc = e.channel;
|
|
1011
|
+
this.dc.binaryType = "arraybuffer";
|
|
1012
|
+
this.dc.onopen = async () => {
|
|
1013
|
+
try {
|
|
1014
|
+
await this._keysReady;
|
|
1015
|
+
// Open the bidirectional confirmation handshake.
|
|
1016
|
+
this._dcSend(FLAG_CONFIRM, { t: "confirm", nonce: this._myNonce });
|
|
1017
|
+
this._confirmTimer = setTimeout(() => {
|
|
1018
|
+
if (!this._confirmed) fail(new Error("key confirmation timed out"));
|
|
1019
|
+
}, CONFIRM_TIMEOUT_MS);
|
|
1020
|
+
} catch (err) {
|
|
1021
|
+
fail(err);
|
|
1022
|
+
}
|
|
1023
|
+
};
|
|
1024
|
+
this.dc.onmessage = (ev2) => {
|
|
1025
|
+
this._recvChain = this._recvChain
|
|
1026
|
+
.then(() => this._dcRecv(ev2.data, done))
|
|
1027
|
+
.catch(() => {});
|
|
1028
|
+
};
|
|
1029
|
+
this.dc.onclose = () => this.onstate("closed");
|
|
1021
1030
|
};
|
|
1022
|
-
|
|
1023
|
-
};
|
|
1024
|
-
} else if (m.type === "offer") {
|
|
1031
|
+
}
|
|
1025
1032
|
await pc.setRemoteDescription({ type: "offer", sdp: m.sdp });
|
|
1026
1033
|
await pc.setLocalDescription(await pc.createAnswer());
|
|
1027
1034
|
ws.send(JSON.stringify({ type: "answer", sdp: pc.localDescription.sdp }));
|
|
@@ -1040,7 +1047,7 @@
|
|
|
1040
1047
|
fail(err);
|
|
1041
1048
|
}
|
|
1042
1049
|
} else if (m.type === "candidate") {
|
|
1043
|
-
await pc.addIceCandidate(m.candidate).catch(() => {});
|
|
1050
|
+
if (pc) await pc.addIceCandidate(m.candidate).catch(() => {});
|
|
1044
1051
|
}
|
|
1045
1052
|
};
|
|
1046
1053
|
ws.onerror = () => fail(new Error("signaling error"));
|
|
@@ -2346,9 +2353,33 @@
|
|
|
2346
2353
|
cwd: $("nf-cwd").value.trim(),
|
|
2347
2354
|
prompt: $("nf-prompt").value,
|
|
2348
2355
|
};
|
|
2356
|
+
const room = $("newform").dataset.room || undefined;
|
|
2357
|
+
// Warn (but allow) when an agent is already running in this cwd on the
|
|
2358
|
+
// target fleet. A 2nd `claude -c` in the same repo fights over the same
|
|
2359
|
+
// session/files and usually exits on startup — which looks like "the new
|
|
2360
|
+
// agent never appears". Let the user pick a different dir or proceed.
|
|
2361
|
+
if (spec.cwd) {
|
|
2362
|
+
await loadList();
|
|
2363
|
+
const norm = (p) => (p || "").replace(/\/+$/, "");
|
|
2364
|
+
const busy = entries.some(
|
|
2365
|
+
(e) =>
|
|
2366
|
+
(room ? e._room === room : true) &&
|
|
2367
|
+
e.exit_code == null &&
|
|
2368
|
+
norm(e.cwd) === norm(spec.cwd),
|
|
2369
|
+
);
|
|
2370
|
+
if (
|
|
2371
|
+
busy &&
|
|
2372
|
+
!confirm(
|
|
2373
|
+
`An agent is already running in:\n${spec.cwd}\n\n` +
|
|
2374
|
+
`Launching another in the same repo can collide (shared session/files) ` +
|
|
2375
|
+
`and the new one may exit immediately. Launch anyway?`,
|
|
2376
|
+
)
|
|
2377
|
+
)
|
|
2378
|
+
return;
|
|
2379
|
+
}
|
|
2349
2380
|
go.disabled = true;
|
|
2350
2381
|
go.textContent = "launching…";
|
|
2351
|
-
const ok = await spawnAndSelect(spec,
|
|
2382
|
+
const ok = await spawnAndSelect(spec, room);
|
|
2352
2383
|
if (ok) {
|
|
2353
2384
|
$("newform").style.display = "none";
|
|
2354
2385
|
} else {
|
package/package.json
CHANGED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import { describe, expect, it } from "vitest";
|
|
2
|
+
import { AUTO_RETRY_MAX_DELAY_SECS, autoRetryBackoffMs } from "./autoRetry.ts";
|
|
3
|
+
|
|
4
|
+
describe("autoRetryBackoffMs", () => {
  // The ceiling every large streak must settle on, expressed in milliseconds.
  const capMs = AUTO_RETRY_MAX_DELAY_SECS * 1000;

  it("doubles 8,16,32,…,256 then caps", () => {
    // Index in the array == streak passed to the function.
    const expectedMs = [8_000, 16_000, 32_000, 64_000, 128_000, 256_000];
    expectedMs.forEach((ms, streak) => {
      expect(autoRetryBackoffMs(streak)).toBe(ms);
    });
  });

  it("caps at the max delay and never overflows for large streaks", () => {
    for (const streak of [6, 50]) {
      expect(autoRetryBackoffMs(streak)).toBe(capMs);
    }
    expect(Number.isFinite(autoRetryBackoffMs(1000))).toBe(true);
  });
});
|
package/ts/autoRetry.ts
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
// Auto-retry on recoverable API errors (overload / rate-limit / usage-limit):
|
|
2
|
+
// agent-yes types "retry" with exponential backoff instead of letting the run
|
|
3
|
+
// die. This module holds the backoff schedule shared by the heartbeat logic and
|
|
4
|
+
// its tests. It mirrors the Rust runtime — see rs/src/context.rs
|
|
5
|
+
// (retry_backoff_secs / RETRY_* constants) — keep the two in sync.
|
|
6
|
+
|
|
7
|
+
export const AUTO_RETRY_BASE_SECS = 8; // first backoff; doubles each consecutive failure
export const AUTO_RETRY_MAX_DELAY_SECS = 256; // cap: 8,16,32,…,256 then hold
export const AUTO_RETRY_GIVE_UP_MS = 8 * 3600 * 1000; // stop after 8h (claude's usage window is ~5h)

/**
 * Backoff (ms) before the next auto-retry, given the number of consecutive
 * retries already attempted.
 *
 * The delay starts at AUTO_RETRY_BASE_SECS, doubles per streak step and is
 * capped at AUTO_RETRY_MAX_DELAY_SECS.
 *
 * @param streak consecutive retries so far (0 → base delay). Negative or NaN
 *   values are treated as 0 so the result is always a finite delay of at
 *   least the base; a NaN delay would make any `now >= nextAt` comparison
 *   permanently false.
 * @returns the delay in milliseconds.
 */
export function autoRetryBackoffMs(streak: number): number {
  // Clamp to [0, 20]: the lower bound keeps the delay >= base, the upper bound
  // guards against absurd streaks blowing up 2 ** n.
  const exponent = Number.isNaN(streak) ? 0 : Math.min(Math.max(streak, 0), 20);
  const secs = Math.min(AUTO_RETRY_BASE_SECS * 2 ** exponent, AUTO_RETRY_MAX_DELAY_SECS);
  return secs * 1000;
}
|
package/ts/configShared.ts
CHANGED
|
@@ -17,6 +17,7 @@ type RawCliConfig = Omit<
|
|
|
17
17
|
| "restartWithoutContinueArg"
|
|
18
18
|
| "updateAvailable"
|
|
19
19
|
| "exitCommands"
|
|
20
|
+
| "autoRetry"
|
|
20
21
|
> & {
|
|
21
22
|
ready?: RegexSource[];
|
|
22
23
|
fatal?: RegexSource[];
|
|
@@ -26,6 +27,7 @@ type RawCliConfig = Omit<
|
|
|
26
27
|
typingRespond?: Record<string, RegexSource[]>;
|
|
27
28
|
restartWithoutContinueArg?: RegexSource[];
|
|
28
29
|
updateAvailable?: RegexSource[];
|
|
30
|
+
autoRetry?: RegexSource[];
|
|
29
31
|
exitCommands?: string[];
|
|
30
32
|
exitCommand?: string[];
|
|
31
33
|
};
|
|
@@ -78,6 +80,7 @@ export function normalizeCliConfig(raw: RawCliConfig): AgentCliConfig {
|
|
|
78
80
|
typingRespond,
|
|
79
81
|
restartWithoutContinueArg,
|
|
80
82
|
updateAvailable,
|
|
83
|
+
autoRetry,
|
|
81
84
|
exitCommands,
|
|
82
85
|
exitCommand,
|
|
83
86
|
...rest
|
|
@@ -93,6 +96,7 @@ export function normalizeCliConfig(raw: RawCliConfig): AgentCliConfig {
|
|
|
93
96
|
typingRespond: compileTypingRespond(typingRespond),
|
|
94
97
|
restartWithoutContinueArg: compileRegexList(restartWithoutContinueArg),
|
|
95
98
|
updateAvailable: compileRegexList(updateAvailable),
|
|
99
|
+
autoRetry: compileRegexList(autoRetry),
|
|
96
100
|
exitCommands: exitCommands ?? exitCommand,
|
|
97
101
|
};
|
|
98
102
|
}
|
package/ts/index.ts
CHANGED
|
@@ -18,6 +18,7 @@ import { logger } from "./logger.ts";
|
|
|
18
18
|
import { createFifoStream } from "./beta/fifo.ts";
|
|
19
19
|
import { PidStore } from "./pidStore.ts";
|
|
20
20
|
import { sendEnter, sendMessage } from "./core/messaging.ts";
|
|
21
|
+
import { AUTO_RETRY_GIVE_UP_MS, autoRetryBackoffMs } from "./autoRetry.ts";
|
|
21
22
|
import {
|
|
22
23
|
initializeLogPaths,
|
|
23
24
|
setupDebugLogging,
|
|
@@ -30,6 +31,7 @@ import { createTerminatorStream } from "./core/streamHelpers.ts";
|
|
|
30
31
|
import { globalAgentRegistry } from "./agentRegistry.ts";
|
|
31
32
|
import { notifyWebhook } from "./webhookNotifier.ts";
|
|
32
33
|
import { readGlobalPids } from "./globalPidIndex.ts";
|
|
34
|
+
import * as reaper from "./reaper.ts";
|
|
33
35
|
|
|
34
36
|
export { removeControlCharacters };
|
|
35
37
|
export { AgentContext };
|
|
@@ -62,6 +64,7 @@ export type AgentCliConfig = {
|
|
|
62
64
|
enter?: RegExp[]; // array of regex to match for sending Enter
|
|
63
65
|
enterExclude?: RegExp[]; // array of regex to exclude from auto-enter (even if enter matches)
|
|
64
66
|
typingRespond?: { [message: string]: RegExp[] }; // type specified message to a specified pattern
|
|
67
|
+
autoRetry?: RegExp[]; // recoverable API errors (overload/rate-limit/usage-limit): type "retry" with exponential backoff (up to 8h) instead of exiting
|
|
65
68
|
|
|
66
69
|
// crash/resuming-session behaviour
|
|
67
70
|
restoreArgs?: string[]; // arguments to continue the session when crashed
|
|
@@ -341,6 +344,10 @@ export default async function agentYes({
|
|
|
341
344
|
}
|
|
342
345
|
}
|
|
343
346
|
|
|
347
|
+
// Opportunistic sweep: reap any process group leaked by an agent whose wrapper
|
|
348
|
+
// died without cleanup, before we start a new one. See ts/reaper.ts.
|
|
349
|
+
reaper.sweep().catch(() => {});
|
|
350
|
+
|
|
344
351
|
// Spawn the agent CLI process
|
|
345
352
|
const ptyEnv = { ...(env ?? (process.env as Record<string, string>)) };
|
|
346
353
|
ptyEnv.AGENT_YES_PID = String(process.pid);
|
|
@@ -387,6 +394,10 @@ export default async function agentYes({
|
|
|
387
394
|
} catch (error) {
|
|
388
395
|
logger.warn(`[pidStore] Failed to register process ${shell.pid}:`, error);
|
|
389
396
|
}
|
|
397
|
+
// Defense-in-depth: record (this wrapper, the agent's process group) so a later
|
|
398
|
+
// sweep reaps the group if we're killed without running onExit cleanup. The PTY
|
|
399
|
+
// child is a session leader, so its pgid == shell.pid. See ts/reaper.ts.
|
|
400
|
+
reaper.register(process.pid, shell.pid).catch(() => {});
|
|
390
401
|
notifyWebhook("RUNNING", prompt ?? "", workingDir).catch(() => null);
|
|
391
402
|
|
|
392
403
|
// Initialize log paths (independent of registration)
|
|
@@ -449,6 +460,19 @@ export default async function agentYes({
|
|
|
449
460
|
|
|
450
461
|
shell.onExit(async function onExit({ exitCode }) {
|
|
451
462
|
const exitedPid = shell.pid; // Capture PID immediately before any shell reassignment
|
|
463
|
+
// Reap the exited agent's process group. The PTY child is a session/group
|
|
464
|
+
// leader, so a `yes | cmd` (or any descendant) it leaked shares its pgid even
|
|
465
|
+
// after it reparents to PID 1 — kill the group so orphans don't spin at ~100%
|
|
466
|
+
// CPU forever. Targeting the pgid (not ppid==1) is container-safe and never
|
|
467
|
+
// touches processes outside this agent's session. Runs on the final exit AND
|
|
468
|
+
// before each robust restart below.
|
|
469
|
+
if (process.platform !== "win32") {
|
|
470
|
+
try {
|
|
471
|
+
process.kill(-exitedPid, "SIGKILL");
|
|
472
|
+
} catch {
|
|
473
|
+
// ESRCH = no surviving group members left to reap; nothing to do.
|
|
474
|
+
}
|
|
475
|
+
}
|
|
452
476
|
// Unregister from agent registry
|
|
453
477
|
globalAgentRegistry.unregister(exitedPid);
|
|
454
478
|
ctx.stdinReady.unready(); // start buffer stdin
|
|
@@ -504,6 +528,10 @@ export default async function agentYes({
|
|
|
504
528
|
} catch (error) {
|
|
505
529
|
logger.warn(`[pidStore] Failed to register restarted process ${shell.pid}:`, error);
|
|
506
530
|
}
|
|
531
|
+
// Re-register the NEW process group with the reaper — the restart gave us a
|
|
532
|
+
// fresh pgid; without this the reaper would track the old (now-dead) group
|
|
533
|
+
// and the live one would leak if we're SIGKILLed. Mirrors the Rust loop.
|
|
534
|
+
reaper.register(process.pid, shell.pid).catch(() => {});
|
|
507
535
|
// Update context with new shell
|
|
508
536
|
ctx.shell = shell;
|
|
509
537
|
// Register new agent in registry (non-blocking)
|
|
@@ -612,6 +640,9 @@ export default async function agentYes({
|
|
|
612
640
|
} catch (error) {
|
|
613
641
|
logger.warn(`[pidStore] Failed to register restored process ${shell.pid}:`, error);
|
|
614
642
|
}
|
|
643
|
+
// Re-register the NEW process group with the reaper (fresh pgid after the
|
|
644
|
+
// restore) so the reaper tracks the live group, not the dead one. See above.
|
|
645
|
+
reaper.register(process.pid, shell.pid).catch(() => {});
|
|
615
646
|
// Update context with new shell
|
|
616
647
|
ctx.shell = shell;
|
|
617
648
|
// Register new agent in registry (non-blocking)
|
|
@@ -683,10 +714,52 @@ export default async function agentYes({
|
|
|
683
714
|
// Heartbeat for auto-response on rendered terminal output
|
|
684
715
|
// This catches patterns that appear via CSI positioning instead of newlines
|
|
685
716
|
let lastHeartbeatRendered = "";
|
|
717
|
+
// Auto-retry backoff state (mirrors rs/src/context.rs). `streak` doubles the
|
|
718
|
+
// backoff each consecutive failed retry; `startedAt` anchors the 8h give-up
|
|
719
|
+
// window; `nextAt` is non-null while a retry is scheduled. `autoRetryScreen`
|
|
720
|
+
// is the latest rendered screen captured by the stdout pipeline below — the
|
|
721
|
+
// heartbeat timer reads it (its own xtermProxy.tail() is empty for EOL CLIs).
|
|
722
|
+
let retryStreak = 0;
|
|
723
|
+
let retryStartedAt: number | null = null;
|
|
724
|
+
let retryNextAt: number | null = null;
|
|
725
|
+
let autoRetryScreen = "";
|
|
686
726
|
const heartbeatInterval = setInterval(async () => {
|
|
687
727
|
try {
|
|
688
728
|
const rendered = removeControlCharacters(xtermProxy.tail(12));
|
|
689
729
|
|
|
730
|
+
// Auto-retry backoff timer — fires the scheduled "retry" using the latest
|
|
731
|
+
// rendered screen captured by the stdout pipeline (consoleResponder). Runs
|
|
732
|
+
// every tick (independent of output) so it still fires while the agent sits
|
|
733
|
+
// idle on an error. Arming/reset lives in the stdout pipeline because this
|
|
734
|
+
// heartbeat's own xtermProxy.tail() is empty for newline (EOL) CLIs like
|
|
735
|
+
// claude. Only types "retry" when idle at a prompt (never mid-work).
|
|
736
|
+
if (retryNextAt !== null) {
|
|
737
|
+
const now = Date.now();
|
|
738
|
+
if (retryStartedAt !== null && now - retryStartedAt >= AUTO_RETRY_GIVE_UP_MS) {
|
|
739
|
+
logger.warn(`[${cli}-yes] auto-retry: giving up after 8h with no recovery`);
|
|
740
|
+
retryNextAt = null;
|
|
741
|
+
retryStartedAt = null;
|
|
742
|
+
retryStreak = 0;
|
|
743
|
+
} else if (now >= retryNextAt) {
|
|
744
|
+
const working = conf.working?.some((rx: RegExp) => rx.test(autoRetryScreen)) ?? false;
|
|
745
|
+
const readyNow = conf.ready?.some((rx: RegExp) => rx.test(autoRetryScreen)) ?? false;
|
|
746
|
+
if (working || !readyNow) {
|
|
747
|
+
retryNextAt = now + 500; // busy / not at prompt — re-check shortly
|
|
748
|
+
} else {
|
|
749
|
+
retryStreak += 1;
|
|
750
|
+
logger.warn(`[${cli}-yes] auto-retry: typing 'retry' (attempt ${retryStreak})`);
|
|
751
|
+
// Write "retry" + Enter atomically (mirrors rs do_send_retry); using
|
|
752
|
+
// sendMessage would split text/Enter across the fast heartbeat ticks.
|
|
753
|
+
ctx.messageContext.shell.write("retry\r");
|
|
754
|
+
ctx.idleWaiter.ping();
|
|
755
|
+
// Self-schedule the next retry with escalated backoff. (Leaving nextAt
|
|
756
|
+
// null and re-arming from the stdout pipeline would tight-loop while the
|
|
757
|
+
// error banner stays on screen.) Reset on recovery cancels this.
|
|
758
|
+
retryNextAt = now + autoRetryBackoffMs(retryStreak);
|
|
759
|
+
}
|
|
760
|
+
}
|
|
761
|
+
}
|
|
762
|
+
|
|
690
763
|
// Skip if output hasn't changed since last heartbeat
|
|
691
764
|
if (rendered === lastHeartbeatRendered) return;
|
|
692
765
|
lastHeartbeatRendered = rendered;
|
|
@@ -1012,6 +1085,35 @@ export default async function agentYes({
|
|
|
1012
1085
|
|
|
1013
1086
|
logger.debug(`stdout|${line}`);
|
|
1014
1087
|
|
|
1088
|
+
// Auto-retry on recoverable API errors (overload / rate-limit / usage-
|
|
1089
|
+
// limit): arm/reset the backoff on the whole rendered screen (the error
|
|
1090
|
+
// banner and the ready prompt are on different lines, so this can't be a
|
|
1091
|
+
// per-line check). The firing happens on the heartbeat timer, which
|
|
1092
|
+
// reads `autoRetryScreen`. Done here, before the `fatal` check below, so
|
|
1093
|
+
// these recoverable errors retry instead of exiting.
|
|
1094
|
+
if (conf.autoRetry?.length) {
|
|
1095
|
+
autoRetryScreen = rendered;
|
|
1096
|
+
const errVisible = conf.autoRetry.some((rx: RegExp) => rx.test(rendered));
|
|
1097
|
+
const readyVisible = conf.ready?.some((rx: RegExp) => rx.test(rendered)) ?? false;
|
|
1098
|
+
if (errVisible && readyVisible) {
|
|
1099
|
+
if (retryNextAt === null) {
|
|
1100
|
+
if (retryStartedAt === null) retryStartedAt = Date.now();
|
|
1101
|
+
const delayMs = autoRetryBackoffMs(retryStreak);
|
|
1102
|
+
retryNextAt = Date.now() + delayMs;
|
|
1103
|
+
logger.warn(
|
|
1104
|
+
`[${cli}-yes] auto-retry armed: recoverable error detected, retrying in ${
|
|
1105
|
+
delayMs / 1000
|
|
1106
|
+
}s (attempt ${retryStreak + 1})`,
|
|
1107
|
+
);
|
|
1108
|
+
}
|
|
1109
|
+
} else if (readyVisible && !errVisible && retryStartedAt !== null) {
|
|
1110
|
+
logger.debug(`[${cli}-yes] auto-retry: recovered, resetting backoff`);
|
|
1111
|
+
retryStreak = 0;
|
|
1112
|
+
retryStartedAt = null;
|
|
1113
|
+
retryNextAt = null;
|
|
1114
|
+
}
|
|
1115
|
+
}
|
|
1116
|
+
|
|
1015
1117
|
// ready matcher: if matched, mark stdin ready
|
|
1016
1118
|
if (conf.ready?.some((rx: RegExp) => line.match(rx))) {
|
|
1017
1119
|
logger.debug(`ready |${line}`);
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
// Register oxmgr's daemon with the platform init system (launchd on macOS,
|
|
2
|
+
// systemd on Linux, Task Scheduler on Windows) so managed processes survive a
|
|
3
|
+
// *reboot*, not just a crash.
|
|
4
|
+
//
|
|
5
|
+
// CHEAP + idempotent: it first checks `oxmgr service status` and SKIPS the
|
|
6
|
+
// install when the service is already registered. This matters a lot —
|
|
7
|
+
// re-running `oxmgr service install` re-bootstraps the launchd/systemd job,
|
|
8
|
+
// which restarts the oxmgr daemon itself, and a daemon restart kills and
|
|
9
|
+
// relaunches EVERY managed process (not just ours). Doing that on every
|
|
10
|
+
// `ay serve install` / `ay schedule` was bouncing unrelated daemons — e.g. a
|
|
11
|
+
// VS Code `serve-web` server running under another managed process, which took
|
|
12
|
+
// the user's editor (and any agent running inside it) down with it.
|
|
13
|
+
//
|
|
14
|
+
// Best-effort: returns false on any failure (e.g. a system-level systemd unit
|
|
15
|
+
// that needs sudo) without aborting the caller — the process is still managed,
|
|
16
|
+
// just not boot-persistent.
|
|
17
|
+
/**
 * Make sure oxmgr's service is registered, installing it only when
 * `oxmgr service status` does not exit 0.
 *
 * @param oxmgrBin path or name of the oxmgr executable to spawn.
 * @returns true when the status check or the install exits with code 0;
 *   false when the install exits non-zero or anything throws.
 */
export async function ensureBootAutostart(oxmgrBin: string): Promise<boolean> {
  // Run `oxmgr <args…>` with all stdio discarded; resolves to "exited with 0".
  const succeeds = async (...args: string[]): Promise<boolean> => {
    const proc = Bun.spawn([oxmgrBin, ...args], {
      stdio: ["ignore", "ignore", "ignore"],
    });
    const exitCode = await proc.exited;
    return exitCode === 0;
  };

  try {
    // Already registered? Stop here so the install step (which the file's
    // header comment says bounces the daemon) is not re-run needlessly.
    if (await succeeds("service", "status")) {
      return true;
    }
    // Not registered yet → install. No `--system` flag is passed after
    // `install`; per the original note it is a `service`-level flag.
    return await succeeds("service", "install");
  } catch {
    // Best-effort: any spawn failure just means "not boot-persistent".
    return false;
  }
}
|
package/ts/pty.ts
CHANGED
|
@@ -15,6 +15,24 @@ async function getPty(): Promise<typeof import("node-pty") | typeof import("bun-
|
|
|
15
15
|
});
|
|
16
16
|
}
|
|
17
17
|
export type IPty = IPtyNode | IPtyBun;
|
|
18
|
-
|
|
18
|
+
type PtyModule = typeof import("node-pty") | typeof import("bun-pty");
|
|
19
|
+
|
|
20
|
+
// Loading node-pty/bun-pty pulls in a native addon. Failing here at import time
|
|
21
|
+
// would crash anything that merely imports this module's graph — including unit
|
|
22
|
+
// tests that never spawn a PTY (e.g. on a machine where the prebuilt binary is
|
|
23
|
+
// missing). So if the load fails, defer the error to first actual use: hand back
|
|
24
|
+
// a proxy that re-throws the original load error the moment `pty.spawn` (or any
|
|
25
|
+
// member) is touched. Production paths that do spawn still fail loudly, with the
|
|
26
|
+
// same error and the same `logger.error` already emitted by getPty().
|
|
27
|
+
let pty: PtyModule;
|
|
28
|
+
try {
|
|
29
|
+
pty = await getPty();
|
|
30
|
+
} catch (error) {
|
|
31
|
+
pty = new Proxy({} as PtyModule, {
|
|
32
|
+
get() {
|
|
33
|
+
throw error;
|
|
34
|
+
},
|
|
35
|
+
});
|
|
36
|
+
}
|
|
19
37
|
export const ptyPackage = globalThis.Bun ? "bun-pty" : "node-pty";
|
|
20
38
|
export default pty;
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { afterEach, beforeEach, expect, test } from "vitest";
|
|
2
|
+
import { mkdtempSync, readFileSync } from "fs";
|
|
3
|
+
import { tmpdir } from "os";
|
|
4
|
+
import path from "path";
|
|
5
|
+
import { register, sweep } from "./reaper.ts";
|
|
6
|
+
|
|
7
|
+
// AGENT_YES_HOME as it was before the current test, restored afterwards.
let prevHome: string | undefined;

// Point AGENT_YES_HOME at a fresh temp directory for every test.
beforeEach(() => {
  prevHome = process.env.AGENT_YES_HOME;
  const scratchHome = mkdtempSync(path.join(tmpdir(), "ay-reaper-"));
  process.env.AGENT_YES_HOME = scratchHome;
});

// Put the environment back exactly as it was (unset stays unset).
afterEach(() => {
  if (prevHome !== undefined) {
    process.env.AGENT_YES_HOME = prevHome;
    return;
  }
  delete process.env.AGENT_YES_HOME;
});

/** Path of the registry file inside the per-test AGENT_YES_HOME. */
function registryFile(): string {
  return path.join(process.env.AGENT_YES_HOME!, "reaper.jsonl");
}

/** Non-blank, trimmed lines currently stored in the registry file. */
function liveLines(): string[] {
  const raw = readFileSync(registryFile(), "utf8");
  const trimmed = raw.split("\n").map((l) => l.trim());
  return trimmed.filter(Boolean);
}
|
|
25
|
+
|
|
26
|
+
test("sweep keeps live wrappers and drops dead ones", async () => {
  // A live wrapper (us) is kept; a dead wrapper is dropped. sweep() SIGKILLs
  // the recorded group of a dead wrapper, so the ids used here must be ones
  // that can never belong to a real process. Values near 1_000_000 are valid
  // pids on Linux hosts where kernel.pid_max is raised (up to 4_194_304), so
  // use ids close to INT32_MAX instead: above any platform pid limit, yet
  // still valid 32-bit arguments for process.kill. The kill is then a
  // harmless ESRCH no-op — we only exercise the bookkeeping here.
  const DEAD_WRAPPER_PID = 2_147_483_646;
  const UNUSED_PGID = 2_147_483_647;

  await register(process.pid, UNUSED_PGID);
  await register(DEAD_WRAPPER_PID, UNUSED_PGID);
  await sweep();

  const lines = liveLines();
  expect(lines.length).toBe(1);
  // Compare the parsed record rather than a substring, so a pid that merely
  // appears as digits inside another number cannot satisfy the check.
  expect(JSON.parse(lines[0])).toEqual({ wpid: process.pid, pgid: UNUSED_PGID });
});
|
|
38
|
+
|
|
39
|
+
test("register refuses to persist a pgid <= 1", async () => {
  for (const refusedPgid of [1, 0]) {
    await register(process.pid, refusedPgid);
  }
  // Nothing was written, so there is no registry file and sweep has nothing to do.
  await sweep();
  const readRegistry = () => readFileSync(registryFile(), "utf8");
  expect(readRegistry).toThrow();
});
|
package/ts/reaper.ts
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
// Defense-in-depth orphan reaper — mirrors rs/src/reaper.rs (see it for the full
|
|
2
|
+
// rationale). Records each running agent's (wrapper pid, agent pgid) so a later
|
|
3
|
+
// sweep kills the recorded process group of any agent whose wrapper died WITHOUT
|
|
4
|
+
// running its own group cleanup (SIGKILL by an OOM killer / oxmgr force-restart /
|
|
5
|
+
// a panic). It targets the recorded pgid of a CONFIRMED-DEAD wrapper — never
|
|
6
|
+
// ppid==1 — so it is container-safe and never touches an unrelated process.
|
|
7
|
+
|
|
8
|
+
import { appendFile, mkdir, readFile, rename, writeFile } from "fs/promises";
|
|
9
|
+
import path from "path";
|
|
10
|
+
import { agentYesHome } from "./agentYesHome.ts";
|
|
11
|
+
|
|
12
|
+
/** Path of the registry file: `reaper.jsonl` inside the agent-yes home directory. */
function registryPath(): string {
  return path.join(agentYesHome(), "reaper.jsonl");
}
|
|
13
|
+
|
|
14
|
+
/**
 * Report whether a process with the given pid currently exists.
 *
 * Pids <= 1 are always reported as not alive. Otherwise the pid is probed
 * with signal 0, which checks existence without affecting the target. A probe
 * that fails with EPERM still counts as alive (the process exists but may not
 * be signalled by us); any other failure counts as not alive.
 */
function isAlive(pid: number): boolean {
  if (pid <= 1) {
    return false;
  }
  try {
    process.kill(pid, 0);
  } catch (probeError) {
    const { code } = probeError as NodeJS.ErrnoException;
    return code === "EPERM";
  }
  return true;
}
|
|
23
|
+
|
|
24
|
+
/**
 * Append a `{ wpid, pgid }` record (one JSON object per line) to the registry
 * so a later sweep can find this wrapper's agent process group.
 *
 * A pgid <= 1 is never written. Failures to create the home directory or to
 * append are ignored: registration is best-effort.
 */
export async function register(wrapperPid: number, pgid: number): Promise<void> {
  // Never persist a group that sweep() would refuse to signal.
  if (pgid <= 1) {
    return;
  }
  const record = `${JSON.stringify({ wpid: wrapperPid, pgid })}\n`;
  try {
    await mkdir(agentYesHome(), { recursive: true });
    await appendFile(registryPath(), record);
  } catch {
    // best-effort
  }
}
|
|
34
|
+
|
|
35
|
+
/**
 * SIGKILL the recorded group of every agent whose wrapper has exited, and
 * rewrite the registry keeping only still-running agents. Best-effort.
 *
 * Registry lines that are blank, not valid JSON, not a JSON object, or
 * missing a numeric `wpid`/`pgid` are dropped. A missing registry file is a
 * no-op. Errors while rewriting the registry are ignored.
 */
export async function sweep(): Promise<void> {
  let content: string;
  try {
    content = await readFile(registryPath(), "utf8");
  } catch {
    return; // no registry yet
  }
  const keep: string[] = [];
  for (const line of content.split("\n")) {
    const t = line.trim();
    if (!t) continue;
    let entry: unknown;
    try {
      entry = JSON.parse(t);
    } catch {
      continue; // drop malformed lines
    }
    // `null` and bare primitives are valid JSON; reading `.wpid` off `null`
    // would throw and abort the whole sweep, so treat them as malformed too.
    if (typeof entry !== "object" || entry === null) continue;
    const { wpid, pgid } = entry as { wpid?: unknown; pgid?: unknown };
    if (typeof wpid !== "number" || typeof pgid !== "number") continue;
    if (isAlive(wpid)) {
      keep.push(t); // agent still running — keep watching it
      continue;
    }
    // Wrapper gone — reap its recorded group. The pgid outlives the leader, so
    // this catches descendants already reparented to PID 1. The `> 1` guard is
    // critical: process.kill(-1) would signal every process the user owns.
    if (process.platform !== "win32" && pgid > 1) {
      try {
        process.kill(-pgid, "SIGKILL");
      } catch {
        // ESRCH = nothing left alive in that group
      }
    }
  }
  try {
    const tmp = registryPath() + ".tmp";
    // Terminate EVERY kept line with "\n". register() appends `record + "\n"`
    // with no leading separator, so a file ending without a newline would
    // fuse its last record with the next appended one into a single
    // unparsable line, and both would be dropped by the following sweep.
    await writeFile(tmp, keep.map((kept) => `${kept}\n`).join(""));
    await rename(tmp, registryPath());
  } catch {
    // best-effort
  }
}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import { describe, expect, it } from "vitest";
|
|
2
|
+
import { shellQuote, toCron } from "./schedule.ts";
|
|
3
|
+
|
|
4
|
+
describe("toCron", () => {
  it("expands HH:MM to a daily cron", () => {
    // [input spec, expected cron expression]
    const cases: Array<[string, string]> = [
      ["10:00", "0 10 * * *"],
      ["9:05", "5 9 * * *"],
      ["23:59", "59 23 * * *"],
    ];
    for (const [spec, cron] of cases) {
      expect(toCron(spec)).toBe(cron);
    }
  });

  it("passes through a 5-field cron expression", () => {
    for (const cron of ["0 10 * * *", "*/15 * * * 1-5"]) {
      expect(toCron(cron)).toBe(cron);
    }
  });

  it("rejects out-of-range times and malformed specs", () => {
    const invalidSpecs = [
      "25:00",
      "10:75",
      "daily",
      "0 10 * *", // only 4 fields
      "",
    ];
    for (const spec of invalidSpecs) {
      expect(toCron(spec)).toBeNull();
    }
  });
});
|
|
22
|
+
|
|
23
|
+
describe("shellQuote", () => {
  it("wraps in single quotes for oxmgr's shell parsing", () => {
    const quoted = shellQuote("a b c");
    expect(quoted).toBe("'a b c'");
  });

  it("escapes embedded single quotes", () => {
    // Expected form: close the quote, emit an escaped quote, reopen the quote.
    const quoted = shellQuote("it's a test");
    expect(quoted).toBe(`'it'\\''s a test'`);
  });
});
|