agent-relay-orchestrator 0.119.11 → 0.120.0

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-relay-orchestrator",
3
- "version": "0.119.11",
3
+ "version": "0.120.0",
4
4
  "description": "Agent Relay orchestrator — manages agent lifecycle across hosts",
5
5
  "type": "module",
6
6
  "bin": {
@@ -17,7 +17,7 @@
17
17
  },
18
18
  "dependencies": {
19
19
  "agent-relay-providers": "0.104.3",
20
- "agent-relay-sdk": "0.2.106",
20
+ "agent-relay-sdk": "0.2.107",
21
21
  "callmux": "0.23.0"
22
22
  },
23
23
  "devDependencies": {
package/src/config.ts CHANGED
@@ -93,6 +93,8 @@ export const TERMINAL_COMMAND_TIMEOUT_MS = envNonNegativeMax("AGENT_RELAY_TERMIN
93
93
  export const TERMINAL_GROUND_WAIT_MAX_MS = envNonNegativeMax("AGENT_RELAY_TERMINAL_GROUND_WAIT_MAX_MS", 500, 0);
94
94
  export const TERMINAL_BACKFILL_SCROLLBACK_LINES = envNonNegativeMax("AGENT_RELAY_TERMINAL_BACKFILL_SCROLLBACK", 1000, 0);
95
95
  export const TERMINAL_DEBUG = process.env.AGENT_RELAY_TERMINAL_DEBUG === "1";
96
+ export const SCHEDULER_COMMAND_MAX_CONCURRENT = envNonNegativeMax("AGENT_RELAY_SCHEDULER_COMMAND_MAX_CONCURRENT", 4, 1);
97
+ export const SCHEDULER_COMMAND_MAX_OUTPUT_BYTES = envNonNegativeMax("AGENT_RELAY_SCHEDULER_COMMAND_MAX_OUTPUT_BYTES", 256 * 1024, 4096);
96
98
 
97
99
  interface RawConfig {
98
100
  id?: string;
package/src/control.ts CHANGED
@@ -1,6 +1,6 @@
1
- import { errMessage, isRecord, normalizeAgentLifecycle, normalizeWorkspaceMode, Semaphore } from "agent-relay-sdk";
1
+ import { errMessage, isPathWithinBase, isRecord, normalizeAgentLifecycle, normalizeWorkspaceMode, Semaphore } from "agent-relay-sdk";
2
2
  import { getAllManifests, getManifest } from "agent-relay-providers";
3
- import type { OrchestratorConfig } from "./config";
3
+ import { SCHEDULER_COMMAND_MAX_CONCURRENT, SCHEDULER_COMMAND_MAX_OUTPUT_BYTES, type OrchestratorConfig } from "./config";
4
4
  import type { ManagedAgentReport, RelayClient, RelayCommand } from "./relay";
5
5
  import { handleSelfUpgrade } from "./self-upgrade";
6
6
  import { readLocalProviderConfigs } from "./provider-config-migration";
@@ -9,6 +9,7 @@ import { cleanupWorkspace, discardRecoveryBranch, idleRefreshWorktree, mergeWork
9
9
  import { withMergePhaseTimeout } from "./workspace-probe/merge-timeouts";
10
10
  import { armWorkspacePrAutoMerge, mergeWorkspacePr, refreshWorkspacePrBranch } from "./workspace-pr";
11
11
  import type { WorkspaceMergeResult } from "agent-relay-sdk";
12
+ import { execProcess } from "./process";
12
13
 
13
14
  // #638 — settle a `workspace.merge` command on whether it made progress, not on whether
14
15
  // the merge function threw. Every no-progress outcome (origin moved ahead, an unpredicted
@@ -64,6 +65,22 @@ export function resolveSpawnRegistrationTimeoutMs(env: NodeJS.ProcessEnv = proce
64
65
  return parsed;
65
66
  }
66
67
 
68
+ export function resolveMaxConcurrentSchedulerCommands(env: NodeJS.ProcessEnv = process.env): number {
69
+ const raw = env.AGENT_RELAY_SCHEDULER_COMMAND_MAX_CONCURRENT;
70
+ if (raw === undefined || raw.trim() === "") return SCHEDULER_COMMAND_MAX_CONCURRENT;
71
+ const parsed = Number(raw);
72
+ if (!Number.isInteger(parsed) || parsed < 1) return SCHEDULER_COMMAND_MAX_CONCURRENT;
73
+ return parsed;
74
+ }
75
+
76
+ export function resolveSchedulerCommandMaxOutputBytes(env: NodeJS.ProcessEnv = process.env): number {
77
+ const raw = env.AGENT_RELAY_SCHEDULER_COMMAND_MAX_OUTPUT_BYTES;
78
+ if (raw === undefined || raw.trim() === "") return SCHEDULER_COMMAND_MAX_OUTPUT_BYTES;
79
+ const parsed = Number(raw);
80
+ if (!Number.isSafeInteger(parsed) || parsed < 4096) return SCHEDULER_COMMAND_MAX_OUTPUT_BYTES;
81
+ return parsed;
82
+ }
83
+
67
84
  /** Awaits a spawn slot until the child registers, or resolves false at the fail-safe deadline. */
68
85
  export type WaitForRegistration = (agent: ManagedAgentReport, timeoutMs: number) => Promise<boolean>;
69
86
 
@@ -98,6 +115,8 @@ interface ControlHandlerDeps {
98
115
  registrationTimeoutMs?: number;
99
116
  // Injectable stop dispatch — production uses tmux/system process teardown.
100
117
  stopSession?: typeof stopSession;
118
+ maxConcurrentSchedulerCommands?: number;
119
+ schedulerCommandMaxOutputBytes?: number;
101
120
  }
102
121
 
103
122
  export function createControlHandler(
@@ -113,10 +132,13 @@ export function createControlHandler(
113
132
  const spawnSlots = new Semaphore(deps.maxConcurrentSpawns ?? resolveMaxConcurrentSpawns());
114
133
  const waitForRegistration = deps.waitForRegistration ?? waitForManagedRegistration;
115
134
  const registrationTimeoutMs = deps.registrationTimeoutMs ?? resolveSpawnRegistrationTimeoutMs();
135
+ const schedulerCommandSlots = new Semaphore(deps.maxConcurrentSchedulerCommands ?? resolveMaxConcurrentSchedulerCommands());
136
+ const schedulerCommandMaxOutputBytes = deps.schedulerCommandMaxOutputBytes ?? resolveSchedulerCommandMaxOutputBytes();
116
137
  // Backgrounded spawn tasks: each is claimed synchronously (status → accepted)
117
138
  // then runs to completion off the poll tick, holding a slot across the child's
118
139
  // registration window. Tracked so tests/shutdown can observe and settle them.
119
140
  const backgroundSpawns = new Set<Promise<void>>();
141
+ const backgroundCommands = new Set<Promise<void>>();
120
142
 
121
143
  async function spawnManagedAgent(opts: SpawnOptions, action = "Spawned"): Promise<ManagedAgentReport> {
122
144
  const agent = await dispatchSpawn(opts, config);
@@ -272,9 +294,69 @@ export function createControlHandler(
272
294
  // per-host cap (see dispatchSpawnCommand). Every other command is handled
273
295
  // inline, synchronously with the poll tick, exactly as before.
274
296
  if (command.type === "agent.spawn") return dispatchSpawnCommand(command);
297
+ if (command.type === "scheduler.command") return dispatchSchedulerCommand(command);
275
298
  return handleNonSpawnCommand(command);
276
299
  }
277
300
 
301
+ async function dispatchSchedulerCommand(command: RelayCommand): Promise<boolean> {
302
+ await relay.updateCommand(command.id, "accepted");
303
+ const startedAt = Date.now();
304
+ const run = (async () => {
305
+ await schedulerCommandSlots.acquire();
306
+ try {
307
+ await relay.updateCommand(command.id, "running");
308
+ const commandText = typeof command.params.command === "string" ? command.params.command : "";
309
+ const cwd = typeof command.params.cwd === "string" ? command.params.cwd : config.baseDir;
310
+ if (!commandText.trim()) throw new Error("scheduler.command missing command");
311
+ if (!isPathWithinBase(cwd, config.baseDir)) throw new Error("scheduler.command cwd escapes orchestrator baseDir");
312
+ const timeoutMs = typeof command.params.timeoutMs === "number" && Number.isSafeInteger(command.params.timeoutMs) && command.params.timeoutMs > 0
313
+ ? command.params.timeoutMs
314
+ : undefined;
315
+ const env = isRecord(command.params.env)
316
+ ? Object.fromEntries(Object.entries(command.params.env).filter((entry): entry is [string, string] => typeof entry[1] === "string"))
317
+ : undefined;
318
+ const result = await execProcess(["bash", "-lc", commandText], {
319
+ cwd,
320
+ env: { ...process.env, ...(env ?? {}) },
321
+ timeoutMs,
322
+ timeoutLabel: `scheduler command ${command.id}`,
323
+ trimStdout: false,
324
+ trimStderr: false,
325
+ maxOutputBytes: schedulerCommandMaxOutputBytes,
326
+ // #968 hole 2 — reap the whole tree on every exit path so a backgrounded/daemonized
327
+ // child (e.g. `setsid … &`) can't survive after the command returns.
328
+ reapProcessGroup: true,
329
+ });
330
+ const finishedAt = Date.now();
331
+ await relay.updateCommand(command.id, result.ok ? "succeeded" : result.timedOut ? "timed_out" : "failed", {
332
+ command: commandText,
333
+ cwd,
334
+ startedAt,
335
+ finishedAt,
336
+ durationMs: finishedAt - startedAt,
337
+ exitCode: result.exitCode,
338
+ success: result.ok,
339
+ timedOut: result.timedOut === true,
340
+ stdoutTail: result.stdout,
341
+ stderrTail: result.stderr,
342
+ outputTruncated: result.outputTruncated === true,
343
+ outputLimitExceeded: result.outputLimitExceeded === true,
344
+ }, result.ok ? undefined : result.stderr || `command exited ${result.exitCode ?? "without code"}`);
345
+ } catch (error) {
346
+ await relay.updateCommand(command.id, "failed", {
347
+ startedAt,
348
+ finishedAt: Date.now(),
349
+ durationMs: Date.now() - startedAt,
350
+ }, errMessage(error));
351
+ } finally {
352
+ schedulerCommandSlots.release();
353
+ }
354
+ })();
355
+ backgroundCommands.add(run);
356
+ run.finally(() => backgroundCommands.delete(run));
357
+ return true;
358
+ }
359
+
278
360
  async function handleNonSpawnCommand(command: RelayCommand): Promise<boolean> {
279
361
  await relay.updateCommand(command.id, "accepted");
280
362
  await relay.updateCommand(command.id, "running");
package/src/process.ts CHANGED
@@ -1,9 +1,14 @@
1
+ import { mkdirSync, readFileSync, rmdirSync, writeFileSync } from "node:fs";
2
+ import { isPidAlive, killPid, processTreePids, waitForPidsExit } from "agent-relay-sdk/process-utils";
3
+
1
4
  export interface ExecResult {
2
5
  ok: boolean;
3
6
  exitCode: number | null;
4
7
  stdout: string;
5
8
  stderr: string;
6
9
  timedOut?: boolean;
10
+ outputTruncated?: boolean;
11
+ outputLimitExceeded?: boolean;
7
12
  }
8
13
 
9
14
  interface ExecOptions {
@@ -16,31 +21,117 @@ interface ExecOptions {
16
21
  timeoutMs?: number;
17
22
  timeoutLabel?: string;
18
23
  streamDrainGraceMs?: number;
24
+ maxOutputBytes?: number;
19
25
  signal?: AbortSignal;
26
+ /** Reap ALL descendants on EVERY exit path — normal exit included — not just on timeout/abort
27
+ * (#968 hole 2). Set for scheduler commands, which may background/daemonize children that would
28
+ * otherwise outlive the parent shell. On Linux this confines the command to a dedicated cgroup v2
29
+ * and kills the whole cgroup (bulletproof: a `setsid`-detached child cannot escape cgroup
30
+ * membership); elsewhere it falls back to a detached process-group + descendant-tree kill. */
31
+ reapProcessGroup?: boolean;
20
32
  }
21
33
 
22
34
  const DEFAULT_STREAM_DRAIN_GRACE_MS = 1_000;
35
+ const DEFAULT_KILL_GRACE_MS = 1_000;
36
+
37
+ let commandCgroupSeq = 0;
38
+
39
+ /** Create a dedicated child cgroup (v2) for a command so its entire process tree — including any
40
+ * `setsid`-detached grandchildren that reparent away and escape the process group — can be reaped
41
+ * in one shot via `cgroup.kill`. Returns the new cgroup's absolute path, or undefined when cgroup v2
42
+ * isn't available/writable (non-Linux, cgroup v1, or missing delegation) so callers fall back. */
43
+ function createCommandCgroup(): string | undefined {
44
+ if (process.platform !== "linux") return undefined;
45
+ try {
46
+ const self = readFileSync("/proc/self/cgroup", "utf8").trim();
47
+ const idx = self.indexOf("::"); // cgroup v2 line: "0::/rel/path"
48
+ if (idx < 0) return undefined;
49
+ const rel = self.slice(idx + 2).trim();
50
+ const path = `/sys/fs/cgroup${rel}/arcmd-${process.pid}-${Date.now()}-${commandCgroupSeq++}`;
51
+ mkdirSync(path);
52
+ return path;
53
+ } catch {
54
+ return undefined;
55
+ }
56
+ }
57
+
58
+ function cgroupPids(cgPath: string): number[] {
59
+ try {
60
+ return readFileSync(`${cgPath}/cgroup.procs`, "utf8")
61
+ .split("\n")
62
+ .map((line) => Number(line.trim()))
63
+ .filter((pid) => Number.isFinite(pid) && pid > 0);
64
+ } catch {
65
+ return [];
66
+ }
67
+ }
68
+
69
+ /** Kill every process in the command's cgroup and remove the cgroup. `cgroup.kill` SIGKILLs the
70
+ * whole subtree atomically; we then wait out any lingering PID-table entries and rmdir (retrying
71
+ * a few times, never hanging) so we don't leak an empty cgroup dir. */
72
+ async function reapCommandCgroup(cgPath: string): Promise<void> {
73
+ try { writeFileSync(`${cgPath}/cgroup.kill`, "1"); } catch {}
74
+ await waitForPidsExit(cgroupPids(cgPath), DEFAULT_KILL_GRACE_MS);
75
+ for (const pid of cgroupPids(cgPath)) killPid(pid, "SIGKILL");
76
+ for (let attempt = 0; attempt < 5; attempt++) {
77
+ try { rmdirSync(cgPath); return; } catch {}
78
+ await sleep(50);
79
+ }
80
+ }
81
+
82
+ /** Fallback reaper for hosts without cgroup v2: the command was spawned detached (its own process
83
+ * group, pgid === proc.pid), so signal the whole group plus any still-living descendants. Reuses the
84
+ * shared process helpers. NOTE: a `setsid`-detached child escapes the process group entirely — only
85
+ * the cgroup path above reaps those; this is the best achievable without cgroups. */
86
+ async function reapDetachedProcessGroup(proc: Bun.Subprocess): Promise<void> {
87
+ try { process.kill(-proc.pid, "SIGTERM"); } catch {}
88
+ const pids = await processTreePids([proc.pid]).catch(() => [proc.pid]);
89
+ for (const pid of pids) killPid(pid, "SIGTERM");
90
+ const all = [proc.pid, ...pids];
91
+ if (await waitForPidsExit(all, DEFAULT_KILL_GRACE_MS)) return;
92
+ try { process.kill(-proc.pid, "SIGKILL"); } catch {}
93
+ for (const pid of all.filter(isPidAlive)) killPid(pid, "SIGKILL");
94
+ }
23
95
 
24
96
  interface StreamCapture {
25
97
  done: Promise<void>;
26
98
  text(): string;
99
+ truncated(): boolean;
100
+ limitExceeded(): boolean;
27
101
  cancel(): void;
28
102
  }
29
103
 
30
- function captureStream(stream: ReadableStream<Uint8Array> | undefined): StreamCapture {
31
- if (!stream) return { done: Promise.resolve(), text: () => "", cancel: () => {} };
104
+ function captureStream(
105
+ stream: ReadableStream<Uint8Array> | undefined,
106
+ options: { maxOutputBytes?: number; onLimitExceeded?: () => void } = {},
107
+ ): StreamCapture {
108
+ if (!stream) return { done: Promise.resolve(), text: () => "", truncated: () => false, limitExceeded: () => false, cancel: () => {} };
32
109
  const reader = stream.getReader();
33
110
  const decoder = new TextDecoder();
34
111
  let output = "";
35
112
  let canceled = false;
113
+ let truncated = false;
114
+ let limitExceeded = false;
36
115
  const done = (async () => {
37
116
  try {
38
117
  while (!canceled) {
39
118
  const chunk = await reader.read();
40
119
  if (chunk.done) break;
41
120
  output += decoder.decode(chunk.value, { stream: true });
121
+ if (options.maxOutputBytes && output.length > options.maxOutputBytes) {
122
+ truncated = true;
123
+ output = output.slice(output.length - options.maxOutputBytes);
124
+ if (!limitExceeded) {
125
+ limitExceeded = true;
126
+ options.onLimitExceeded?.();
127
+ }
128
+ }
42
129
  }
43
130
  output += decoder.decode();
131
+ if (options.maxOutputBytes && output.length > options.maxOutputBytes) {
132
+ truncated = true;
133
+ output = output.slice(output.length - options.maxOutputBytes);
134
+ }
44
135
  } catch {
45
136
  // Intentional cancellation on process timeout or a stuck post-exit pipe.
46
137
  } finally {
@@ -50,6 +141,8 @@ function captureStream(stream: ReadableStream<Uint8Array> | undefined): StreamCa
50
141
  return {
51
142
  done,
52
143
  text: () => output,
144
+ truncated: () => truncated,
145
+ limitExceeded: () => limitExceeded,
53
146
  cancel: () => {
54
147
  canceled = true;
55
148
  void reader.cancel().catch(() => {});
@@ -74,35 +167,69 @@ function abortMessage(cmd: string[], options: ExecOptions): string {
74
167
  return `${label} aborted`;
75
168
  }
76
169
 
170
+ async function terminateProcessTree(proc: Bun.Subprocess): Promise<void> {
171
+ const pids = await processTreePids([proc.pid]).catch(() => [proc.pid]);
172
+ for (const target of pids) killPid(target, "SIGTERM");
173
+ try { proc.kill("SIGTERM"); } catch {}
174
+ const exited = await waitForPidsExit(pids, DEFAULT_KILL_GRACE_MS);
175
+ const alive = exited ? [] : pids.filter(isPidAlive);
176
+ for (const target of alive) killPid(target, "SIGKILL");
177
+ if (alive.includes(proc.pid)) {
178
+ try { proc.kill("SIGKILL"); } catch {}
179
+ }
180
+ if (alive.length > 0) await waitForPidsExit(alive, DEFAULT_KILL_GRACE_MS);
181
+ }
182
+
77
183
  export async function execProcess(cmd: string[], options: ExecOptions = {}): Promise<ExecResult> {
78
184
  if (options.signal?.aborted) {
79
185
  return { ok: false, exitCode: null, stdout: "", stderr: abortMessage(cmd, options) };
80
186
  }
81
- const proc = Bun.spawn(cmd, {
187
+ // #968 hole 2 — for reap-on-every-exit commands, confine the whole tree to a dedicated cgroup so a
188
+ // `setsid`-detached child can't survive the parent. The wrapper joins the cgroup (writes its own
189
+ // pid to cgroup.procs) BEFORE exec-ing the real command, so every descendant is a member from birth
190
+ // (race-free); `exec "$0" "$@"` passes the original argv through untouched.
191
+ const cgroup = options.reapProcessGroup ? createCommandCgroup() : undefined;
192
+ const spawnCmd = cgroup
193
+ ? ["sh", "-c", `echo $$ > ${cgroup}/cgroup.procs 2>/dev/null || true; exec "$0" "$@"`, ...cmd]
194
+ : cmd;
195
+ const proc = Bun.spawn(spawnCmd, {
82
196
  cwd: options.cwd,
83
197
  env: options.env,
84
198
  stdin: "ignore",
85
199
  stdout: options.stdout ?? "pipe",
86
200
  stderr: options.stderr ?? "pipe",
201
+ // Own process group so the fallback reaper can group-signal without touching the orchestrator.
202
+ ...(options.reapProcessGroup ? { detached: true } : {}),
87
203
  });
88
- const stdoutCapture = options.stdout === "ignore" ? captureStream(undefined) : captureStream(proc.stdout);
89
- const stderrCapture = options.stderr === "ignore" ? captureStream(undefined) : captureStream(proc.stderr);
90
204
  let timedOut = false;
91
205
  let aborted = false;
206
+ let outputLimitExceeded = false;
207
+ let abortStarted = false;
92
208
  let timeout: ReturnType<typeof setTimeout> | undefined;
93
- let killTimeout: ReturnType<typeof setTimeout> | undefined;
94
209
  let timeoutResolve: ((value: null) => void) | undefined;
95
210
  const timeoutPromise = new Promise<null>((resolve) => { timeoutResolve = resolve; });
96
- const abortProcess = () => {
211
+ let abortDone: Promise<void> = Promise.resolve();
212
+ const abortProcess = (killParentImmediately = false) => {
213
+ if (abortStarted) return;
214
+ abortStarted = true;
97
215
  aborted = true;
98
- try { proc.kill("SIGTERM"); } catch {}
99
- killTimeout = setTimeout(() => {
100
- try { proc.kill("SIGKILL"); } catch {}
101
- }, 1_000);
102
- killTimeout.unref?.();
103
- timeoutResolve?.(null);
216
+ if (killParentImmediately) {
217
+ try { proc.kill("SIGTERM"); } catch {}
218
+ }
219
+ abortDone = terminateProcessTree(proc).finally(() => timeoutResolve?.(null));
220
+ };
221
+ const abortForOutputLimit = () => {
222
+ outputLimitExceeded = true;
223
+ abortProcess();
104
224
  };
105
- options.signal?.addEventListener("abort", abortProcess, { once: true });
225
+ const stdoutCapture = options.stdout === "ignore"
226
+ ? captureStream(undefined)
227
+ : captureStream(proc.stdout, { maxOutputBytes: options.maxOutputBytes, onLimitExceeded: abortForOutputLimit });
228
+ const stderrCapture = options.stderr === "ignore"
229
+ ? captureStream(undefined)
230
+ : captureStream(proc.stderr, { maxOutputBytes: options.maxOutputBytes, onLimitExceeded: abortForOutputLimit });
231
+ const abortFromSignal = () => abortProcess(true);
232
+ options.signal?.addEventListener("abort", abortFromSignal, { once: true });
106
233
  if (options.timeoutMs && options.timeoutMs > 0) {
107
234
  timeout = setTimeout(() => {
108
235
  timedOut = true;
@@ -112,10 +239,10 @@ export async function execProcess(cmd: string[], options: ExecOptions = {}): Pro
112
239
  }
113
240
 
114
241
  const exitCode = await (timeout || options.signal ? Promise.race([proc.exited, timeoutPromise]) : proc.exited);
115
- options.signal?.removeEventListener("abort", abortProcess);
242
+ options.signal?.removeEventListener("abort", abortFromSignal);
116
243
  if (timeout) clearTimeout(timeout);
117
- if (!timedOut && !aborted && killTimeout) clearTimeout(killTimeout);
118
244
  if (timedOut || aborted) {
245
+ await abortDone;
119
246
  stdoutCapture.cancel();
120
247
  stderrCapture.cancel();
121
248
  }
@@ -125,20 +252,32 @@ export async function execProcess(cmd: string[], options: ExecOptions = {}): Pro
125
252
  stdoutCapture.cancel();
126
253
  stderrCapture.cancel();
127
254
 
255
+ // #968 hole 2 — reap the command's whole process tree on EVERY exit path (normal exit included),
256
+ // so a backgrounded/daemonized child can't outlive the command. Runs regardless of exit reason.
257
+ if (options.reapProcessGroup) {
258
+ if (cgroup) await reapCommandCgroup(cgroup);
259
+ else await reapDetachedProcessGroup(proc);
260
+ }
261
+
128
262
  const stdout = stdoutCapture.text();
129
263
  let stderr = stderrCapture.text();
130
264
  if (timedOut) {
131
265
  const msg = timeoutMessage(cmd, options);
132
266
  stderr = stderr ? `${stderr}\n${msg}` : msg;
267
+ } else if (outputLimitExceeded) {
268
+ const msg = `output exceeded ${options.maxOutputBytes} bytes`;
269
+ stderr = stderr ? `${stderr}\n${msg}` : msg;
133
270
  } else if (aborted) {
134
271
  const msg = abortMessage(cmd, options);
135
272
  stderr = stderr ? `${stderr}\n${msg}` : msg;
136
273
  }
137
274
  return {
138
- ok: !timedOut && !aborted && exitCode === 0,
275
+ ok: !timedOut && !aborted && !outputLimitExceeded && exitCode === 0,
139
276
  exitCode,
140
277
  stdout: options.trimStdout === false ? stdout : stdout.trim(),
141
278
  stderr: options.trimStderr === false ? stderr : stderr.trim(),
142
279
  ...(timedOut ? { timedOut } : {}),
280
+ ...(stdoutCapture.truncated() || stderrCapture.truncated() ? { outputTruncated: true } : {}),
281
+ ...(outputLimitExceeded ? { outputLimitExceeded: true } : {}),
143
282
  };
144
283
  }
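(file header missing from this extract: the hunks below restart at line 1 and belong to a different file than package/src/process.ts — the module that defines writeSharedCallmuxConfig; its path is not shown here)
@@ -1,7 +1,7 @@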
1
1
  import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
2
2
  import { homedir } from "node:os";
3
3
  import { dirname, join } from "node:path";
4
- import { errMessage, isRecord } from "agent-relay-sdk";
4
+ import { errMessage, isRecord, SHARED_CALLMUX_TOOL_CALL_TIMEOUT_MS } from "agent-relay-sdk";
5
5
  import { createListener, type CallmuxConfig, type CreateListenerOptions, type ListenerHealthSnapshot, type ProgrammaticListener } from "callmux";
6
6
  import type { OrchestratorConfig } from "./config";
7
7
  import { agentRelayHome } from "./config";
@@ -78,14 +78,18 @@ export function sharedMcpListenerUrl(): string {
78
78
  export function writeSharedCallmuxConfig(opts: Pick<SharedCallmuxOptions, "configPath" | "sourceConfigPath"> & { registryServers?: CallmuxConfig["servers"] }): CallmuxConfig {
79
79
  const source = readJsonObject(opts.sourceConfigPath);
80
80
  const sourceServers = isRecord(source.servers) ? source.servers : {};
81
- const servers = opts.registryServers && Object.keys(opts.registryServers).length > 0
81
+ const rawServers = opts.registryServers && Object.keys(opts.registryServers).length > 0
82
82
  ? cloneServers(opts.registryServers)
83
83
  : fallbackSharedServers(sourceServers);
84
+ const servers = withSharedCallTimeoutFloor(rawServers);
84
85
  const generated: CallmuxConfig = {
85
86
  servers,
86
87
  cacheTtlSeconds: numberFromRecord(source, "cacheTtlSeconds") ?? 10,
87
88
  maxConcurrency: numberFromRecord(source, "maxConcurrency") ?? 20,
88
- callTimeoutMs: numberFromRecord(source, "callTimeoutMs") ?? 180_000,
89
+ callTimeoutMs: Math.max(
90
+ numberFromRecord(source, "callTimeoutMs") ?? SHARED_CALLMUX_TOOL_CALL_TIMEOUT_MS,
91
+ SHARED_CALLMUX_TOOL_CALL_TIMEOUT_MS,
92
+ ),
89
93
  outputFormat: outputFormatFromRecord(source, "outputFormat") ?? "auto",
90
94
  // Relay workers consume only proxied tokenlean+github tools; suppress callmux meta-tools.
91
95
  exposeMetaTools: false,
@@ -305,6 +309,16 @@ function fallbackSharedServers(sourceServers: Record<string, unknown>): CallmuxC
305
309
  };
306
310
  }
307
311
 
312
+ function withSharedCallTimeoutFloor(servers: CallmuxConfig["servers"]): CallmuxConfig["servers"] {
313
+ return Object.fromEntries(Object.entries(servers).map(([name, server]) => {
314
+ const callTimeoutMs = Math.max(
315
+ server.callTimeoutMs ?? SHARED_CALLMUX_TOOL_CALL_TIMEOUT_MS,
316
+ SHARED_CALLMUX_TOOL_CALL_TIMEOUT_MS,
317
+ );
318
+ return [name, { ...server, callTimeoutMs }];
319
+ }));
320
+ }
321
+
308
322
  function callmuxServerFromProvisioningVariant(value: unknown): { name: string; server: CallmuxConfig["servers"][string] } | null {
309
323
  if (!isRecord(value) || value.enabled === false || value.approvalStatus === "pending" || value.validationStatus === "invalid") return null;
310
324
  if (typeof value.name !== "string" || !value.name) return null;
@@ -383,7 +397,7 @@ function defaultGithubServer(): CallmuxConfig["servers"][string] {
383
397
  command: "github-mcp-server",
384
398
  args: ["stdio"],
385
399
  prefix: "gh",
386
- callTimeoutMs: 60_000,
400
+ callTimeoutMs: SHARED_CALLMUX_TOOL_CALL_TIMEOUT_MS,
387
401
  tools: GITHUB_TOOLS,
388
402
  cachePolicy: { allowTools: ["issue_read", "list_issues", "search_issues", "search_code", "get_file_contents"] },
389
403
  };