agent-relay-orchestrator 0.119.10 → 0.120.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-relay-orchestrator",
3
- "version": "0.119.10",
3
+ "version": "0.120.0",
4
4
  "description": "Agent Relay orchestrator — manages agent lifecycle across hosts",
5
5
  "type": "module",
6
6
  "bin": {
@@ -17,7 +17,7 @@
17
17
  },
18
18
  "dependencies": {
19
19
  "agent-relay-providers": "0.104.3",
20
- "agent-relay-sdk": "0.2.106",
20
+ "agent-relay-sdk": "0.2.107",
21
21
  "callmux": "0.23.0"
22
22
  },
23
23
  "devDependencies": {
package/src/config.ts CHANGED
@@ -93,6 +93,8 @@ export const TERMINAL_COMMAND_TIMEOUT_MS = envNonNegativeMax("AGENT_RELAY_TERMIN
93
93
  export const TERMINAL_GROUND_WAIT_MAX_MS = envNonNegativeMax("AGENT_RELAY_TERMINAL_GROUND_WAIT_MAX_MS", 500, 0);
94
94
  export const TERMINAL_BACKFILL_SCROLLBACK_LINES = envNonNegativeMax("AGENT_RELAY_TERMINAL_BACKFILL_SCROLLBACK", 1000, 0);
95
95
  export const TERMINAL_DEBUG = process.env.AGENT_RELAY_TERMINAL_DEBUG === "1";
96
+ export const SCHEDULER_COMMAND_MAX_CONCURRENT = envNonNegativeMax("AGENT_RELAY_SCHEDULER_COMMAND_MAX_CONCURRENT", 4, 1);
97
+ export const SCHEDULER_COMMAND_MAX_OUTPUT_BYTES = envNonNegativeMax("AGENT_RELAY_SCHEDULER_COMMAND_MAX_OUTPUT_BYTES", 256 * 1024, 4096);
96
98
 
97
99
  interface RawConfig {
98
100
  id?: string;
package/src/control.ts CHANGED
@@ -1,6 +1,6 @@
1
- import { errMessage, isRecord, normalizeAgentLifecycle, normalizeWorkspaceMode, Semaphore } from "agent-relay-sdk";
1
+ import { errMessage, isPathWithinBase, isRecord, normalizeAgentLifecycle, normalizeWorkspaceMode, Semaphore } from "agent-relay-sdk";
2
2
  import { getAllManifests, getManifest } from "agent-relay-providers";
3
- import type { OrchestratorConfig } from "./config";
3
+ import { SCHEDULER_COMMAND_MAX_CONCURRENT, SCHEDULER_COMMAND_MAX_OUTPUT_BYTES, type OrchestratorConfig } from "./config";
4
4
  import type { ManagedAgentReport, RelayClient, RelayCommand } from "./relay";
5
5
  import { handleSelfUpgrade } from "./self-upgrade";
6
6
  import { readLocalProviderConfigs } from "./provider-config-migration";
@@ -9,6 +9,7 @@ import { cleanupWorkspace, discardRecoveryBranch, idleRefreshWorktree, mergeWork
9
9
  import { withMergePhaseTimeout } from "./workspace-probe/merge-timeouts";
10
10
  import { armWorkspacePrAutoMerge, mergeWorkspacePr, refreshWorkspacePrBranch } from "./workspace-pr";
11
11
  import type { WorkspaceMergeResult } from "agent-relay-sdk";
12
+ import { execProcess } from "./process";
12
13
 
13
14
  // #638 — settle a `workspace.merge` command on whether it made progress, not on whether
14
15
  // the merge function threw. Every no-progress outcome (origin moved ahead, an unpredicted
@@ -64,6 +65,22 @@ export function resolveSpawnRegistrationTimeoutMs(env: NodeJS.ProcessEnv = proce
64
65
  return parsed;
65
66
  }
66
67
 
68
/**
 * Resolves the cap on concurrently running scheduler commands.
 *
 * @param env Environment to read `AGENT_RELAY_SCHEDULER_COMMAND_MAX_CONCURRENT` from.
 * @returns The configured value when it parses to an integer >= 1; otherwise
 *   `SCHEDULER_COMMAND_MAX_CONCURRENT` (also used when the variable is unset or blank).
 */
export function resolveMaxConcurrentSchedulerCommands(env: NodeJS.ProcessEnv = process.env): number {
  const configured = env.AGENT_RELAY_SCHEDULER_COMMAND_MAX_CONCURRENT;
  const isBlank = configured === undefined || configured.trim() === "";
  // NaN fails the integer check below, so a blank value lands on the default.
  const limit = isBlank ? Number.NaN : Number(configured);
  return Number.isInteger(limit) && limit >= 1 ? limit : SCHEDULER_COMMAND_MAX_CONCURRENT;
}
75
+
76
/**
 * Resolves the per-stream output cap applied to scheduler commands.
 *
 * @param env Environment to read `AGENT_RELAY_SCHEDULER_COMMAND_MAX_OUTPUT_BYTES` from.
 * @returns The configured value when it parses to a safe integer >= 4096; otherwise
 *   `SCHEDULER_COMMAND_MAX_OUTPUT_BYTES` (also used when the variable is unset or blank).
 */
export function resolveSchedulerCommandMaxOutputBytes(env: NodeJS.ProcessEnv = process.env): number {
  const configured = env.AGENT_RELAY_SCHEDULER_COMMAND_MAX_OUTPUT_BYTES;
  const isBlank = configured === undefined || configured.trim() === "";
  // NaN fails the safe-integer check below, so a blank value lands on the default.
  const limit = isBlank ? Number.NaN : Number(configured);
  return Number.isSafeInteger(limit) && limit >= 4096 ? limit : SCHEDULER_COMMAND_MAX_OUTPUT_BYTES;
}
83
+
67
84
  /** Awaits a spawn slot until the child registers, or resolves false at the fail-safe deadline. */
68
85
  export type WaitForRegistration = (agent: ManagedAgentReport, timeoutMs: number) => Promise<boolean>;
69
86
 
@@ -98,6 +115,8 @@ interface ControlHandlerDeps {
98
115
  registrationTimeoutMs?: number;
99
116
  // Injectable stop dispatch — production uses tmux/system process teardown.
100
117
  stopSession?: typeof stopSession;
118
+ maxConcurrentSchedulerCommands?: number;
119
+ schedulerCommandMaxOutputBytes?: number;
101
120
  }
102
121
 
103
122
  export function createControlHandler(
@@ -113,10 +132,13 @@ export function createControlHandler(
113
132
  const spawnSlots = new Semaphore(deps.maxConcurrentSpawns ?? resolveMaxConcurrentSpawns());
114
133
  const waitForRegistration = deps.waitForRegistration ?? waitForManagedRegistration;
115
134
  const registrationTimeoutMs = deps.registrationTimeoutMs ?? resolveSpawnRegistrationTimeoutMs();
135
+ const schedulerCommandSlots = new Semaphore(deps.maxConcurrentSchedulerCommands ?? resolveMaxConcurrentSchedulerCommands());
136
+ const schedulerCommandMaxOutputBytes = deps.schedulerCommandMaxOutputBytes ?? resolveSchedulerCommandMaxOutputBytes();
116
137
  // Backgrounded spawn tasks: each is claimed synchronously (status → accepted)
117
138
  // then runs to completion off the poll tick, holding a slot across the child's
118
139
  // registration window. Tracked so tests/shutdown can observe and settle them.
119
140
  const backgroundSpawns = new Set<Promise<void>>();
141
+ const backgroundCommands = new Set<Promise<void>>();
120
142
 
121
143
  async function spawnManagedAgent(opts: SpawnOptions, action = "Spawned"): Promise<ManagedAgentReport> {
122
144
  const agent = await dispatchSpawn(opts, config);
@@ -272,9 +294,69 @@ export function createControlHandler(
272
294
  // per-host cap (see dispatchSpawnCommand). Every other command is handled
273
295
  // inline, synchronously with the poll tick, exactly as before.
274
296
  if (command.type === "agent.spawn") return dispatchSpawnCommand(command);
297
+ if (command.type === "scheduler.command") return dispatchSchedulerCommand(command);
275
298
  return handleNonSpawnCommand(command);
276
299
  }
277
300
 
301
/**
 * Claims a `scheduler.command` relay command and runs it in the background.
 *
 * The command is marked `accepted` synchronously; the rest runs off the caller's await chain:
 * wait for a scheduler-command slot, mark `running`, execute `bash -lc <command>` and report
 * `succeeded` / `timed_out` / `failed` with timing and output tails.
 *
 * Validation failures (missing command, cwd outside `config.baseDir`) and any thrown error
 * are reported as `failed`. If even that report cannot be delivered, the error is logged
 * rather than left as an unhandled rejection on the background promise.
 *
 * @param command Relay command whose `params` may carry `command`, `cwd`, `timeoutMs`, `env`.
 * @returns Always `true` once the command has been accepted and the background run started.
 */
async function dispatchSchedulerCommand(command: RelayCommand): Promise<boolean> {
  await relay.updateCommand(command.id, "accepted");
  // Captured before the slot wait, so durationMs includes time spent queued behind the cap.
  const startedAt = Date.now();
  const run = (async () => {
    await schedulerCommandSlots.acquire();
    try {
      await relay.updateCommand(command.id, "running");
      const commandText = typeof command.params.command === "string" ? command.params.command : "";
      const cwd = typeof command.params.cwd === "string" ? command.params.cwd : config.baseDir;
      if (!commandText.trim()) throw new Error("scheduler.command missing command");
      if (!isPathWithinBase(cwd, config.baseDir)) throw new Error("scheduler.command cwd escapes orchestrator baseDir");
      // Only a positive safe integer is honoured; anything else means "no timeout".
      const timeoutMs = typeof command.params.timeoutMs === "number" && Number.isSafeInteger(command.params.timeoutMs) && command.params.timeoutMs > 0
        ? command.params.timeoutMs
        : undefined;
      // Non-string env values are dropped; the remainder is layered over the orchestrator's env.
      const env = isRecord(command.params.env)
        ? Object.fromEntries(Object.entries(command.params.env).filter((entry): entry is [string, string] => typeof entry[1] === "string"))
        : undefined;
      const result = await execProcess(["bash", "-lc", commandText], {
        cwd,
        env: { ...process.env, ...(env ?? {}) },
        timeoutMs,
        timeoutLabel: `scheduler command ${command.id}`,
        trimStdout: false,
        trimStderr: false,
        maxOutputBytes: schedulerCommandMaxOutputBytes,
        // #968 hole 2 — reap the whole tree on every exit path so a backgrounded/daemonized
        // child (e.g. `setsid … &`) can't survive after the command returns.
        reapProcessGroup: true,
      });
      const finishedAt = Date.now();
      await relay.updateCommand(command.id, result.ok ? "succeeded" : result.timedOut ? "timed_out" : "failed", {
        command: commandText,
        cwd,
        startedAt,
        finishedAt,
        durationMs: finishedAt - startedAt,
        exitCode: result.exitCode,
        success: result.ok,
        timedOut: result.timedOut === true,
        stdoutTail: result.stdout,
        stderrTail: result.stderr,
        outputTruncated: result.outputTruncated === true,
        outputLimitExceeded: result.outputLimitExceeded === true,
      }, result.ok ? undefined : result.stderr || `command exited ${result.exitCode ?? "without code"}`);
    } catch (error) {
      // One timestamp so finishedAt and durationMs always agree.
      const finishedAt = Date.now();
      try {
        await relay.updateCommand(command.id, "failed", {
          startedAt,
          finishedAt,
          durationMs: finishedAt - startedAt,
        }, errMessage(error));
      } catch (reportError) {
        // Nothing is awaiting this task's outcome; log instead of rejecting in the background.
        console.error(`scheduler command ${command.id}: could not report failure (${errMessage(error)}): ${errMessage(reportError)}`);
      }
    } finally {
      schedulerCommandSlots.release();
    }
  })();
  backgroundCommands.add(run);
  // `.finally()` returns a new promise; give it a handler so a rejected run (e.g. a failed slot
  // acquire) cannot surface as an unhandled rejection.
  void run
    .finally(() => backgroundCommands.delete(run))
    .catch((error: unknown) => {
      console.error(`scheduler command ${command.id}: background run rejected: ${errMessage(error)}`);
    });
  return true;
}
359
+
278
360
  async function handleNonSpawnCommand(command: RelayCommand): Promise<boolean> {
279
361
  await relay.updateCommand(command.id, "accepted");
280
362
  await relay.updateCommand(command.id, "running");
package/src/process.ts CHANGED
@@ -1,9 +1,14 @@
1
+ import { mkdirSync, readFileSync, rmdirSync, writeFileSync } from "node:fs";
2
+ import { isPidAlive, killPid, processTreePids, waitForPidsExit } from "agent-relay-sdk/process-utils";
3
+
1
4
  export interface ExecResult {
2
5
  ok: boolean;
3
6
  exitCode: number | null;
4
7
  stdout: string;
5
8
  stderr: string;
6
9
  timedOut?: boolean;
10
+ outputTruncated?: boolean;
11
+ outputLimitExceeded?: boolean;
7
12
  }
8
13
 
9
14
  interface ExecOptions {
@@ -16,31 +21,117 @@ interface ExecOptions {
16
21
  timeoutMs?: number;
17
22
  timeoutLabel?: string;
18
23
  streamDrainGraceMs?: number;
24
+ maxOutputBytes?: number;
19
25
  signal?: AbortSignal;
26
+ /** Reap ALL descendants on EVERY exit path — normal exit included — not just on timeout/abort
27
+ * (#968 hole 2). Set for scheduler commands, which may background/daemonize children that would
28
+ * otherwise outlive the parent shell. On Linux this confines the command to a dedicated cgroup v2
29
+ * and kills the whole cgroup (bulletproof: a `setsid`-detached child cannot escape cgroup
30
+ * membership); elsewhere it falls back to a detached process-group + descendant-tree kill. */
31
+ reapProcessGroup?: boolean;
20
32
  }
21
33
 
22
34
  const DEFAULT_STREAM_DRAIN_GRACE_MS = 1_000;
35
+ const DEFAULT_KILL_GRACE_MS = 1_000;
36
+
37
+ let commandCgroupSeq = 0;
38
+
39
/**
 * Creates a dedicated child cgroup (v2) for one command, so its whole process tree can later
 * be killed through the cgroup even if children detach with `setsid`.
 *
 * The parent cgroup is taken from the `0::<path>` entry of `/proc/self/cgroup`, matched line by
 * line so other hierarchy lines in the file can never leak into the path.
 *
 * `execProcess` splices the returned path unquoted into an `sh -c` script. A path containing
 * anything outside a conservative shell-safe character set is therefore rejected, so the caller
 * takes its fallback reaper instead of silently failing to join the cgroup.
 *
 * @returns Absolute path of the new cgroup directory, or `undefined` when not on Linux, no v2
 *   entry exists, the path is not shell-safe, or the directory cannot be created.
 */
function createCommandCgroup(): string | undefined {
  if (process.platform !== "linux") return undefined;
  // Characters that survive unquoted interpolation into a POSIX shell word unchanged.
  const shellSafePath = /^[A-Za-z0-9_@%+=:,.\/-]+$/;
  try {
    const v2Entry = readFileSync("/proc/self/cgroup", "utf8")
      .split("\n")
      .map((line) => line.trim())
      .find((line) => line.startsWith("0::")); // cgroup v2 line: "0::/rel/path"
    if (v2Entry === undefined) return undefined;
    const rel = v2Entry.slice("0::".length);
    const path = `/sys/fs/cgroup${rel}/arcmd-${process.pid}-${Date.now()}-${commandCgroupSeq++}`;
    if (!shellSafePath.test(path)) return undefined;
    mkdirSync(path);
    return path;
  } catch {
    // Unreadable /proc entry or mkdir refused: report "no cgroup" so callers fall back.
    return undefined;
  }
}
57
+
58
/**
 * Parses the PIDs listed in `<cgPath>/cgroup.procs`.
 *
 * @param cgPath Directory of the cgroup to inspect.
 * @returns Every line that parses to a finite number greater than zero, in file order;
 *   an empty array when the file cannot be read.
 */
function cgroupPids(cgPath: string): number[] {
  let listing: string;
  try {
    listing = readFileSync(`${cgPath}/cgroup.procs`, "utf8");
  } catch {
    return [];
  }
  const pids: number[] = [];
  for (const entry of listing.split("\n")) {
    const pid = Number(entry.trim());
    // Blank lines parse to 0 and junk to NaN; both are skipped here.
    if (Number.isFinite(pid) && pid > 0) pids.push(pid);
  }
  return pids;
}
68
+
69
/**
 * Kills everything left in a command's cgroup, then removes the cgroup directory.
 *
 * Steps, all best-effort: write `1` to `cgroup.kill`, wait up to `DEFAULT_KILL_GRACE_MS` for the
 * listed PIDs to exit, SIGKILL any PID still listed, then try `rmdir` up to five times with a
 * 50 ms pause after each failed attempt. Never throws and never waits indefinitely.
 *
 * @param cgPath Directory of the cgroup to reap.
 */
async function reapCommandCgroup(cgPath: string): Promise<void> {
  try {
    writeFileSync(`${cgPath}/cgroup.kill`, "1");
  } catch {
    // Ignored on purpose: the per-PID SIGKILL sweep below still runs.
  }
  const lingering = cgroupPids(cgPath);
  await waitForPidsExit(lingering, DEFAULT_KILL_GRACE_MS);
  // Re-read the membership: anything still listed after the grace period is killed directly.
  cgroupPids(cgPath).forEach((pid) => killPid(pid, "SIGKILL"));
  let attemptsLeft = 5;
  while (attemptsLeft-- > 0) {
    try {
      rmdirSync(cgPath);
      return;
    } catch {
      // rmdir failed; pause and retry until the attempts run out.
    }
    await sleep(50);
  }
}
81
+
82
/**
 * Fallback reaper used when no command cgroup is available.
 *
 * The command was spawned detached, so `-proc.pid` addresses its process group. This sends
 * SIGTERM to the group and to every PID `processTreePids` reports, waits up to
 * `DEFAULT_KILL_GRACE_MS`, and escalates to SIGKILL for the group and for whatever is still alive.
 *
 * NOTE: a child that detached with `setsid` has left the process group, so this path cannot
 * reach it through the group signal; only the cgroup-based reaper covers that case.
 *
 * @param proc The spawned command (process-group leader).
 */
async function reapDetachedProcessGroup(proc: Bun.Subprocess): Promise<void> {
  const leader = proc.pid;
  const signalGroup = (signal: "SIGTERM" | "SIGKILL"): void => {
    try {
      process.kill(-leader, signal);
    } catch {
      // Best effort: signalling the group may fail, e.g. when it no longer exists.
    }
  };

  signalGroup("SIGTERM");
  const descendants = await processTreePids([leader]).catch(() => [leader]);
  for (const pid of descendants) killPid(pid, "SIGTERM");

  const targets = [leader, ...descendants];
  const allExited = await waitForPidsExit(targets, DEFAULT_KILL_GRACE_MS);
  if (allExited) return;

  signalGroup("SIGKILL");
  const survivors = targets.filter(isPidAlive);
  for (const pid of survivors) killPid(pid, "SIGKILL");
}
23
95
 
24
96
  interface StreamCapture {
25
97
  done: Promise<void>;
26
98
  text(): string;
99
+ truncated(): boolean;
100
+ limitExceeded(): boolean;
27
101
  cancel(): void;
28
102
  }
29
103
 
30
- function captureStream(stream: ReadableStream<Uint8Array> | undefined): StreamCapture {
31
- if (!stream) return { done: Promise.resolve(), text: () => "", cancel: () => {} };
104
+ function captureStream(
105
+ stream: ReadableStream<Uint8Array> | undefined,
106
+ options: { maxOutputBytes?: number; onLimitExceeded?: () => void } = {},
107
+ ): StreamCapture {
108
+ if (!stream) return { done: Promise.resolve(), text: () => "", truncated: () => false, limitExceeded: () => false, cancel: () => {} };
32
109
  const reader = stream.getReader();
33
110
  const decoder = new TextDecoder();
34
111
  let output = "";
35
112
  let canceled = false;
113
+ let truncated = false;
114
+ let limitExceeded = false;
36
115
  const done = (async () => {
37
116
  try {
38
117
  while (!canceled) {
39
118
  const chunk = await reader.read();
40
119
  if (chunk.done) break;
41
120
  output += decoder.decode(chunk.value, { stream: true });
121
+ if (options.maxOutputBytes && output.length > options.maxOutputBytes) {
122
+ truncated = true;
123
+ output = output.slice(output.length - options.maxOutputBytes);
124
+ if (!limitExceeded) {
125
+ limitExceeded = true;
126
+ options.onLimitExceeded?.();
127
+ }
128
+ }
42
129
  }
43
130
  output += decoder.decode();
131
+ if (options.maxOutputBytes && output.length > options.maxOutputBytes) {
132
+ truncated = true;
133
+ output = output.slice(output.length - options.maxOutputBytes);
134
+ }
44
135
  } catch {
45
136
  // Intentional cancellation on process timeout or a stuck post-exit pipe.
46
137
  } finally {
@@ -50,6 +141,8 @@ function captureStream(stream: ReadableStream<Uint8Array> | undefined): StreamCa
50
141
  return {
51
142
  done,
52
143
  text: () => output,
144
+ truncated: () => truncated,
145
+ limitExceeded: () => limitExceeded,
53
146
  cancel: () => {
54
147
  canceled = true;
55
148
  void reader.cancel().catch(() => {});
@@ -74,35 +167,69 @@ function abortMessage(cmd: string[], options: ExecOptions): string {
74
167
  return `${label} aborted`;
75
168
  }
76
169
 
170
/**
 * Terminates a spawned process and the descendants reported by `processTreePids`.
 *
 * Sends SIGTERM to every PID in the tree (and to `proc` itself), waits up to
 * `DEFAULT_KILL_GRACE_MS`, then sends SIGKILL to whatever is still alive and waits one more
 * grace period for those to go away.
 *
 * @param proc The spawned process at the root of the tree.
 */
async function terminateProcessTree(proc: Bun.Subprocess): Promise<void> {
  // If the tree cannot be enumerated, fall back to just the root PID.
  const tree = await processTreePids([proc.pid]).catch(() => [proc.pid]);
  tree.forEach((pid) => killPid(pid, "SIGTERM"));
  try {
    proc.kill("SIGTERM");
  } catch {
    // Ignored: the kill call can throw, and teardown must continue regardless.
  }

  if (await waitForPidsExit(tree, DEFAULT_KILL_GRACE_MS)) return;

  const survivors = tree.filter(isPidAlive);
  for (const pid of survivors) killPid(pid, "SIGKILL");
  if (survivors.includes(proc.pid)) {
    try {
      proc.kill("SIGKILL");
    } catch {
      // Ignored for the same reason as above.
    }
  }
  if (survivors.length > 0) await waitForPidsExit(survivors, DEFAULT_KILL_GRACE_MS);
}
182
+
77
183
  export async function execProcess(cmd: string[], options: ExecOptions = {}): Promise<ExecResult> {
78
184
  if (options.signal?.aborted) {
79
185
  return { ok: false, exitCode: null, stdout: "", stderr: abortMessage(cmd, options) };
80
186
  }
81
- const proc = Bun.spawn(cmd, {
187
+ // #968 hole 2 — for reap-on-every-exit commands, confine the whole tree to a dedicated cgroup so a
188
+ // `setsid`-detached child can't survive the parent. The wrapper joins the cgroup (writes its own
189
+ // pid to cgroup.procs) BEFORE exec-ing the real command, so every descendant is a member from birth
190
+ // (race-free); `exec "$0" "$@"` passes the original argv through untouched.
191
+ const cgroup = options.reapProcessGroup ? createCommandCgroup() : undefined;
192
+ const spawnCmd = cgroup
193
+ ? ["sh", "-c", `echo $$ > ${cgroup}/cgroup.procs 2>/dev/null || true; exec "$0" "$@"`, ...cmd]
194
+ : cmd;
195
+ const proc = Bun.spawn(spawnCmd, {
82
196
  cwd: options.cwd,
83
197
  env: options.env,
84
198
  stdin: "ignore",
85
199
  stdout: options.stdout ?? "pipe",
86
200
  stderr: options.stderr ?? "pipe",
201
+ // Own process group so the fallback reaper can group-signal without touching the orchestrator.
202
+ ...(options.reapProcessGroup ? { detached: true } : {}),
87
203
  });
88
- const stdoutCapture = options.stdout === "ignore" ? captureStream(undefined) : captureStream(proc.stdout);
89
- const stderrCapture = options.stderr === "ignore" ? captureStream(undefined) : captureStream(proc.stderr);
90
204
  let timedOut = false;
91
205
  let aborted = false;
206
+ let outputLimitExceeded = false;
207
+ let abortStarted = false;
92
208
  let timeout: ReturnType<typeof setTimeout> | undefined;
93
- let killTimeout: ReturnType<typeof setTimeout> | undefined;
94
209
  let timeoutResolve: ((value: null) => void) | undefined;
95
210
  const timeoutPromise = new Promise<null>((resolve) => { timeoutResolve = resolve; });
96
- const abortProcess = () => {
211
+ let abortDone: Promise<void> = Promise.resolve();
212
+ const abortProcess = (killParentImmediately = false) => {
213
+ if (abortStarted) return;
214
+ abortStarted = true;
97
215
  aborted = true;
98
- try { proc.kill("SIGTERM"); } catch {}
99
- killTimeout = setTimeout(() => {
100
- try { proc.kill("SIGKILL"); } catch {}
101
- }, 1_000);
102
- killTimeout.unref?.();
103
- timeoutResolve?.(null);
216
+ if (killParentImmediately) {
217
+ try { proc.kill("SIGTERM"); } catch {}
218
+ }
219
+ abortDone = terminateProcessTree(proc).finally(() => timeoutResolve?.(null));
220
+ };
221
+ const abortForOutputLimit = () => {
222
+ outputLimitExceeded = true;
223
+ abortProcess();
104
224
  };
105
- options.signal?.addEventListener("abort", abortProcess, { once: true });
225
+ const stdoutCapture = options.stdout === "ignore"
226
+ ? captureStream(undefined)
227
+ : captureStream(proc.stdout, { maxOutputBytes: options.maxOutputBytes, onLimitExceeded: abortForOutputLimit });
228
+ const stderrCapture = options.stderr === "ignore"
229
+ ? captureStream(undefined)
230
+ : captureStream(proc.stderr, { maxOutputBytes: options.maxOutputBytes, onLimitExceeded: abortForOutputLimit });
231
+ const abortFromSignal = () => abortProcess(true);
232
+ options.signal?.addEventListener("abort", abortFromSignal, { once: true });
106
233
  if (options.timeoutMs && options.timeoutMs > 0) {
107
234
  timeout = setTimeout(() => {
108
235
  timedOut = true;
@@ -112,10 +239,10 @@ export async function execProcess(cmd: string[], options: ExecOptions = {}): Pro
112
239
  }
113
240
 
114
241
  const exitCode = await (timeout || options.signal ? Promise.race([proc.exited, timeoutPromise]) : proc.exited);
115
- options.signal?.removeEventListener("abort", abortProcess);
242
+ options.signal?.removeEventListener("abort", abortFromSignal);
116
243
  if (timeout) clearTimeout(timeout);
117
- if (!timedOut && !aborted && killTimeout) clearTimeout(killTimeout);
118
244
  if (timedOut || aborted) {
245
+ await abortDone;
119
246
  stdoutCapture.cancel();
120
247
  stderrCapture.cancel();
121
248
  }
@@ -125,20 +252,32 @@ export async function execProcess(cmd: string[], options: ExecOptions = {}): Pro
125
252
  stdoutCapture.cancel();
126
253
  stderrCapture.cancel();
127
254
 
255
+ // #968 hole 2 — reap the command's whole process tree on EVERY exit path (normal exit included),
256
+ // so a backgrounded/daemonized child can't outlive the command. Runs regardless of exit reason.
257
+ if (options.reapProcessGroup) {
258
+ if (cgroup) await reapCommandCgroup(cgroup);
259
+ else await reapDetachedProcessGroup(proc);
260
+ }
261
+
128
262
  const stdout = stdoutCapture.text();
129
263
  let stderr = stderrCapture.text();
130
264
  if (timedOut) {
131
265
  const msg = timeoutMessage(cmd, options);
132
266
  stderr = stderr ? `${stderr}\n${msg}` : msg;
267
+ } else if (outputLimitExceeded) {
268
+ const msg = `output exceeded ${options.maxOutputBytes} bytes`;
269
+ stderr = stderr ? `${stderr}\n${msg}` : msg;
133
270
  } else if (aborted) {
134
271
  const msg = abortMessage(cmd, options);
135
272
  stderr = stderr ? `${stderr}\n${msg}` : msg;
136
273
  }
137
274
  return {
138
- ok: !timedOut && !aborted && exitCode === 0,
275
+ ok: !timedOut && !aborted && !outputLimitExceeded && exitCode === 0,
139
276
  exitCode,
140
277
  stdout: options.trimStdout === false ? stdout : stdout.trim(),
141
278
  stderr: options.trimStderr === false ? stderr : stderr.trim(),
142
279
  ...(timedOut ? { timedOut } : {}),
280
+ ...(stdoutCapture.truncated() || stderrCapture.truncated() ? { outputTruncated: true } : {}),
281
+ ...(outputLimitExceeded ? { outputLimitExceeded: true } : {}),
143
282
  };
144
283
  }
@@ -1,7 +1,7 @@
1
1
  import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
2
2
  import { homedir } from "node:os";
3
3
  import { dirname, join } from "node:path";
4
- import { errMessage, isRecord } from "agent-relay-sdk";
4
+ import { errMessage, isRecord, SHARED_CALLMUX_TOOL_CALL_TIMEOUT_MS } from "agent-relay-sdk";
5
5
  import { createListener, type CallmuxConfig, type CreateListenerOptions, type ListenerHealthSnapshot, type ProgrammaticListener } from "callmux";
6
6
  import type { OrchestratorConfig } from "./config";
7
7
  import { agentRelayHome } from "./config";
@@ -78,14 +78,18 @@ export function sharedMcpListenerUrl(): string {
78
78
  export function writeSharedCallmuxConfig(opts: Pick<SharedCallmuxOptions, "configPath" | "sourceConfigPath"> & { registryServers?: CallmuxConfig["servers"] }): CallmuxConfig {
79
79
  const source = readJsonObject(opts.sourceConfigPath);
80
80
  const sourceServers = isRecord(source.servers) ? source.servers : {};
81
- const servers = opts.registryServers && Object.keys(opts.registryServers).length > 0
81
+ const rawServers = opts.registryServers && Object.keys(opts.registryServers).length > 0
82
82
  ? cloneServers(opts.registryServers)
83
83
  : fallbackSharedServers(sourceServers);
84
+ const servers = withSharedCallTimeoutFloor(rawServers);
84
85
  const generated: CallmuxConfig = {
85
86
  servers,
86
87
  cacheTtlSeconds: numberFromRecord(source, "cacheTtlSeconds") ?? 10,
87
88
  maxConcurrency: numberFromRecord(source, "maxConcurrency") ?? 20,
88
- callTimeoutMs: numberFromRecord(source, "callTimeoutMs") ?? 180_000,
89
+ callTimeoutMs: Math.max(
90
+ numberFromRecord(source, "callTimeoutMs") ?? SHARED_CALLMUX_TOOL_CALL_TIMEOUT_MS,
91
+ SHARED_CALLMUX_TOOL_CALL_TIMEOUT_MS,
92
+ ),
89
93
  outputFormat: outputFormatFromRecord(source, "outputFormat") ?? "auto",
90
94
  // Relay workers consume only proxied tokenlean+github tools; suppress callmux meta-tools.
91
95
  exposeMetaTools: false,
@@ -305,6 +309,16 @@ function fallbackSharedServers(sourceServers: Record<string, unknown>): CallmuxC
305
309
  };
306
310
  }
307
311
 
312
/**
 * Returns a copy of `servers` in which every entry's `callTimeoutMs` is at least
 * `SHARED_CALLMUX_TOOL_CALL_TIMEOUT_MS`. Entries without a timeout get exactly that floor;
 * all other server fields are copied through unchanged.
 */
function withSharedCallTimeoutFloor(servers: CallmuxConfig["servers"]): CallmuxConfig["servers"] {
  const floored: Array<[string, CallmuxConfig["servers"][string]]> = [];
  for (const [name, server] of Object.entries(servers)) {
    const requested = server.callTimeoutMs ?? SHARED_CALLMUX_TOOL_CALL_TIMEOUT_MS;
    const callTimeoutMs = Math.max(requested, SHARED_CALLMUX_TOOL_CALL_TIMEOUT_MS);
    floored.push([name, { ...server, callTimeoutMs }]);
  }
  return Object.fromEntries(floored);
}
321
+
308
322
  function callmuxServerFromProvisioningVariant(value: unknown): { name: string; server: CallmuxConfig["servers"][string] } | null {
309
323
  if (!isRecord(value) || value.enabled === false || value.approvalStatus === "pending" || value.validationStatus === "invalid") return null;
310
324
  if (typeof value.name !== "string" || !value.name) return null;
@@ -383,7 +397,7 @@ function defaultGithubServer(): CallmuxConfig["servers"][string] {
383
397
  command: "github-mcp-server",
384
398
  args: ["stdio"],
385
399
  prefix: "gh",
386
- callTimeoutMs: 60_000,
400
+ callTimeoutMs: SHARED_CALLMUX_TOOL_CALL_TIMEOUT_MS,
387
401
  tools: GITHUB_TOOLS,
388
402
  cachePolicy: { allowTools: ["issue_read", "list_issues", "search_issues", "search_code", "get_file_contents"] },
389
403
  };
@@ -7,7 +7,7 @@ import { prMergedState } from "../workspace-pr";
7
7
  import { deleteBranchIfSafe, owningRepoRoot } from "./cleanup";
8
8
  import { refreshWorkspaceDeps } from "./deps";
9
9
  import { type LandGatesResult } from "./land-gates-runner";
10
- import { populateMergeState, resolveBranchRef, syncBaseFromOrigin, workspaceGitState } from "./git-state";
10
+ import { populateMergeState, resolveBranchRef, syncBaseFromOrigin, upstreamRef, workspaceGitState } from "./git-state";
11
11
  import { type MergePhase, mergePhaseTimeoutMs, throwIfMergeAborted, withMergePhaseTimeout } from "./merge-timeouts";
12
12
  import { nextBranchName } from "./names";
13
13
  import { parseWorktrees, shortBranch } from "./parse";
@@ -438,6 +438,66 @@ export async function mergeWorkspace(input: WorkspaceMergeInput): Promise<Worksp
438
438
  * agent with no checkout once cleanup reclaims the worktree (#327).
439
439
  * - Gone owner: reclaim the spent worktree/branch and go terminal `merged`.
440
440
  */
441
/**
 * #950 — guard for no-op lands: before a `noop` preview is finalized, make sure the work sitting
 * on LOCAL base has actually reached the upstream. A lost push race (see {@link mergeRebaseFf})
 * can leave a merge commit on local base that origin never received; the branch is then an
 * ancestor of local base, preview says `noop`, and finalizing would bless unpushed work.
 *
 * Result:
 *  - "clean": nothing to publish — no base, push disabled, no upstream/remote, or local base is
 *    equal to / contained in the freshly fetched upstream.
 *  - "published": local base carried unpushed commits and they were pushed, either as a plain
 *    fast-forward or after replaying them onto the upstream (see below).
 *  - "refuse": local base diverged and could not be reconciled, or a publish push failed — the
 *    caller must not finalize.
 *
 * Divergence handling (#950 follow-up SHOULD-FIX 3): a diverged base left alone would make every
 * later land hit the same refusal. The first-parent commits unique to local base are therefore
 * replayed onto the upstream in a clean checkout of base and pushed. Commit SHAs change but the
 * content is kept. Any step that cannot be done cleanly yields "refuse".
 */
async function publishNoopBaseIfStranded(
  input: WorkspaceMergeInput,
  worktreePath: string,
  repoRoot: string,
  base: string | undefined,
  signal?: AbortSignal,
): Promise<"clean" | "published" | "refuse"> {
  if (!base) return "clean";
  if (input.push === false || !workspacePushEnabled()) return "clean";

  const upstream = await upstreamRef(worktreePath, base, signal);
  if (!upstream) return "clean";
  // The upstream ref is "<remote>/<branch>"; without a non-empty remote part there is nowhere to push.
  const slash = upstream.indexOf("/");
  if (slash <= 0) return "clean";
  const remote = upstream.slice(0, slash);

  // Every git call here runs under the "rebase" merge phase; only the cwd and label vary.
  const git = (args: string[], label: string, cwd: string = worktreePath) =>
    mergeGit(args, cwd, "rebase", label, signal);

  throwIfMergeAborted(signal);
  await git(["fetch", remote, base], `workspace merge fetch ${remote}/${base} before noop finalize`);
  const upstreamResolved = await git(["rev-parse", "--verify", upstream], `workspace merge resolve ${upstream} before noop finalize`);
  const upstreamSha = upstreamResolved.stdout;
  const baseResolved = await git(["rev-parse", "--verify", base], `workspace merge resolve ${base} before noop finalize`);
  const baseSha = baseResolved.stdout;
  if (!upstreamSha || !baseSha || baseSha === upstreamSha) return "clean";

  // Local base fully contained in origin (behind/equal) — the land is already published.
  const contained = await git(["merge-base", "--is-ancestor", baseSha, upstreamSha], `workspace merge check ${base} contained in ${upstream}`);
  if (contained.ok) return "clean";

  // Local base strictly ahead of origin — publish the stranded land with a fast-forward push.
  const ahead = await git(["merge-base", "--is-ancestor", upstreamSha, baseSha], `workspace merge check ${base} ahead of ${upstream}`);
  if (ahead.ok) {
    throwIfMergeAborted(signal);
    const ffPush = await git(["push", remote, `${base}:${base}`], `workspace merge publish stranded ${base} to ${remote}`);
    return ffPush.ok ? "published" : "refuse";
  }

  // Diverged: local base has unpushed commits AND origin has commits we lack. Try to reconcile by
  // replaying the local-only first-parent commits onto the upstream, then publishing the result.
  const scan = await git(["rev-list", "--reverse", "--first-parent", `${upstreamSha}..${baseSha}`], `workspace merge scan unpushed ${base} commits before reconcile`);
  const localOnly = scan.stdout.split("\n").filter(Boolean);
  if (localOnly.length === 0) return "refuse";

  // The replay needs a clean checkout of base to work in.
  const baseWorktree = await worktreeForBranch(repoRoot, base, signal);
  if (!baseWorktree || baseWorktree.dirty) return "refuse";

  const replay = await replayCommitsOntoUpstream(baseWorktree.path, upstreamSha, localOnly, baseSha, base, signal);
  if (!replay.ok) return "refuse";

  throwIfMergeAborted(signal);
  // This push runs from the base checkout rather than the agent's worktree.
  const reconciledPush = await git(["push", remote, `${base}:${base}`], `workspace merge publish reconciled ${base} to ${remote}`, baseWorktree.path);
  return reconciledPush.ok ? "published" : "refuse";
}
500
+
441
501
  async function resolveNoopMerge(
442
502
  input: WorkspaceMergeInput,
443
503
  worktreePath: string,
@@ -453,6 +513,16 @@ async function resolveNoopMerge(
453
513
  signal?: AbortSignal,
454
514
  ): Promise<WorkspaceMergeResult> {
455
515
  throwIfMergeAborted(signal);
516
+ // #950 — a `noop` preview means the branch is already an ancestor of LOCAL base, but that base
517
+ // may carry a merge commit a lost push race never published. Verify it's on origin (publishing
518
+ // it if we cleanly can) before going terminal — never bless unpushed work as `merged`.
519
+ const publishState = await publishNoopBaseIfStranded(input, worktreePath, repoRoot, preview.baseRef, signal);
520
+ if (publishState === "refuse") {
521
+ // Diverged and could not auto-reconcile (SHOULD-FIX 3): escalate to a CLEAR steward-actionable
522
+ // state instead of a benign review_requested that would perpetually re-refuse and wedge the repo.
523
+ return head({ conflict: true, status: "conflict", error: `local ${preview.baseRef ?? "base"} carries unpushed landed work that diverged from origin and could not be auto-reconciled; a steward must reconcile and land it (#950)` });
524
+ }
525
+ const pushedStranded = publishState === "published" ? { pushed: true } : {};
456
526
  const ownerRepo = branch ? await owningRepoRoot(worktreePath, repoRoot) : repoRoot;
457
527
  // Live owner (#327): recycle-to-continue instead of bricking the session.
458
528
  if (input.deleteBranch === false) {
@@ -474,11 +544,11 @@ async function resolveNoopMerge(
474
544
  const reportDeps = depsRefresh.refreshed || depsRefresh.stale || depsRefresh.error;
475
545
  // merged:false → no `branch.landed` notice (nothing landed); newBranch makes the
476
546
  // relay repoint the row and return it to `active` rather than terminal `merged`.
477
- return head({ merged: false, noop: true, status: "active", baseSha, worktreeRemoved: false, branch: fresh, newBranch: fresh, ...deleteResult, ...(reportDeps ? { depsRefresh } : {}), error: undefined });
547
+ return head({ merged: false, noop: true, status: "active", baseSha, worktreeRemoved: false, branch: fresh, newBranch: fresh, ...deleteResult, ...(reportDeps ? { depsRefresh } : {}), ...pushedStranded, error: undefined });
478
548
  }
479
549
  }
480
550
  // No base or checkout failed — stay live on the current branch, don't strand at `merged`.
481
- return head({ merged: false, noop: true, status: "active", worktreeRemoved: false, branchDeleted: false, error: undefined });
551
+ return head({ merged: false, noop: true, status: "active", worktreeRemoved: false, branchDeleted: false, ...pushedStranded, error: undefined });
482
552
  }
483
553
  // Owner is gone — reclaim the spent worktree/branch and go terminal.
484
554
  if (branch) {
@@ -486,9 +556,9 @@ async function resolveNoopMerge(
486
556
  const removed = await mergeGit(["worktree", "remove", "--force", worktreePath], ownerRepo, "cleanup", "workspace merge remove noop worktree", signal);
487
557
  const worktreeRemoved = removed.ok;
488
558
  const deleteResult = worktreeRemoved ? await deleteBranchIfSafe(ownerRepo, branch, preview.baseRef, undefined, signal) : { branchDeleted: false };
489
- return head({ status: "merged", noop: true, worktreeRemoved, ...deleteResult, error: undefined });
559
+ return head({ status: "merged", noop: true, worktreeRemoved, ...deleteResult, ...pushedStranded, error: undefined });
490
560
  }
491
- return head({ status: "merged", noop: true, error: undefined });
561
+ return head({ status: "merged", noop: true, ...pushedStranded, error: undefined });
492
562
  }
493
563
 
494
564
  async function mergePr(
@@ -660,6 +730,145 @@ async function syncLocalBaseToUpstream(
660
730
  return { ok: true, baseSync };
661
731
  }
662
732
 
733
+ /**
734
+ * Restore `baseWorktreePath` to `restoreSha` — the PRE-replay snapshot that still carries the
735
+ * preserved local-only commits — after an aborted/failed replay (#950). Runs its git ops WITHOUT
736
+ * the merge signal on purpose: the replay reset already moved base to origin, so the preserved work
737
+ * survives ONLY in `restoreSha`; if this cleanup reused the (possibly-aborted) merge signal,
738
+ * `mergeGit` would rethrow immediately on the aborted signal (see its `throwIfMergeAborted` guard)
739
+ * and SKIP the restoration, stranding base half-reset at origin with the work dropped — the exact
740
+ * data loss a cancellation must not cause. Clears any in-progress cherry-pick, hard-resets to
741
+ * `restoreSha`, then VERIFIES HEAD actually landed on it. Returns whether the restore is PROVEN, so
742
+ * the caller can escalate a hard conflict when it cannot confirm the base is safe (never assume). Never throws: any error raised by the git calls is caught and reported as `false`. `base` is only interpolated into the label strings handed to `mergeGit`.
743
+ */
744
+ async function restoreBaseWorktreeToSnapshot(baseWorktreePath: string, restoreSha: string, base: string): Promise<boolean> {
745
+ try {
746
+ // No signal: this must run to completion even under an aborted/timed-out merge. The result of the cherry-pick abort is intentionally not inspected; only the reset and the HEAD comparison below decide the outcome.
747
+ await mergeGit(["cherry-pick", "--abort"], baseWorktreePath, "cleanup", `workspace merge abort in-progress replay of ${base}`);
748
+ const reset = await mergeGit(["reset", "--hard", restoreSha], baseWorktreePath, "cleanup", `workspace merge restore ${base} to pre-replay snapshot`);
749
+ if (!reset.ok) return false;
750
+ const head = (await mergeGit(["rev-parse", "HEAD"], baseWorktreePath, "cleanup", `workspace merge verify ${base} restored to snapshot`)).stdout.trim();
751
+ return head !== "" && head === restoreSha.trim();
752
+ } catch {
753
+ return false;
754
+ }
755
+ }
756
+
757
+ /**
758
+ * Replay `commits` (oldest-first) onto `upstreamSha` in a CLEAN base worktree via cherry-pick, so
759
+ * unpushed local commits that can no longer fast-forward are PRESERVED (their content re-lands on
760
+ * fresh origin) instead of being discarded by a hard reset (#950). Merge commits replay as their
761
+ * first-parent delta (`-m 1`; a merge is any commit for which `rev-list --parents` lists more than one parent). Commits are attributed to the relay identity like the no-ff land, so
762
+ * a base checkout without a configured git identity can still replay.
763
+ *
764
+ * ABORT-SAFE (#950 review): the reset-to-origin runs BEFORE the replay, so a merge cancellation /
765
+ * total-timeout firing after it would leave base at `upstreamSha` with the preserved commits gone.
766
+ * The whole replay therefore runs inside a try; on ANY failure OR abort we restore `restoreSha` with
767
+ * NO signal at all (so an already-aborted merge signal cannot skip it) and VERIFY the restore landed. If restoration cannot be PROVEN we
768
+ * return an explicit `conflict` so the caller escalates to a steward rather than reporting a benign
769
+ * state over a half-reset base. The caller escalates for a manual/steward reconcile either way.
770
+ */
771
+ async function replayCommitsOntoUpstream(
772
+ baseWorktreePath: string,
773
+ upstreamSha: string,
774
+ commits: string[],
775
+ restoreSha: string,
776
+ base: string,
777
+ signal?: AbortSignal,
778
+ ): Promise<{ ok: true } | { ok: false; conflict?: boolean; error: string }> {
779
+ let failure: string | undefined;
780
+ try {
781
+ const reset = await mergeGit(["reset", "--hard", upstreamSha], baseWorktreePath, "rebase", `workspace merge rewind ${base} to origin before replay`, signal);
782
+ if (!reset.ok) {
783
+ failure = reset.stderr || `failed to rewind ${base} to origin before replaying unpushed commits`;
784
+ } else {
785
+ for (const sha of commits) {
786
+ throwIfMergeAborted(signal);
787
+ const parents = (await mergeGit(["rev-list", "--parents", "-n", "1", sha], baseWorktreePath, "rebase", `workspace merge inspect ${sha} parents before replay`, signal)).stdout.split(/\s+/).filter(Boolean);
788
+ const pickArgs = parents.length > 2
789
+ ? ["-c", `user.name=${LAND_COMMITTER.name}`, "-c", `user.email=${LAND_COMMITTER.email}`, "cherry-pick", "-m", "1", sha]
790
+ : ["-c", `user.name=${LAND_COMMITTER.name}`, "-c", `user.email=${LAND_COMMITTER.email}`, "cherry-pick", sha];
791
+ const pick = await mergeGit(pickArgs, baseWorktreePath, "rebase", `workspace merge replay unpushed ${sha} onto origin`, signal);
792
+ if (!pick.ok) {
793
+ failure = `cannot cleanly replay unpushed ${base} commit ${sha.slice(0, 9)} onto origin; needs manual reconcile (#950)`;
794
+ break;
795
+ }
796
+ }
797
+ }
798
+ } catch (err) {
799
+ // Abort / total-timeout / unexpected throw mid-replay — base may be half-reset at origin.
800
+ failure = `replay of unpushed ${base} commits interrupted before completion (${errMessage(err)}); restoring ${base} (#950)`;
801
+ }
802
+
803
+ if (!failure) return { ok: true };
804
+
805
+ // Restore the pre-replay snapshot (still holding the preserved commits) without the merge signal, then
806
+ // PROVE it landed. If we cannot, the base may be stranded at origin with the work dropped — a hard
807
+ // conflict for a steward, never a silent half-reset.
808
+ const restored = await restoreBaseWorktreeToSnapshot(baseWorktreePath, restoreSha, base);
809
+ if (!restored) {
810
+ return { ok: false, conflict: true, error: `${failure}; and could not restore ${base} to its pre-replay snapshot ${restoreSha.slice(0, 9)} — base may be left reset to origin with unpushed work dropped; escalating to steward (#950)` };
811
+ }
812
+ return { ok: false, error: failure };
813
+ }
814
+
815
+ /**
816
+ * Undo a base-ref advance whose publish lost a push race (#950). The merge commit is only on the
817
+ * LOCAL base and origin moved past us, so it can no longer fast-forward — stranding it would
818
+ * diverge local base from origin and wedge every later land (preview would see the branch as an
819
+ * ancestor of local base → noop → terminal `merged` for work never published). Rewind local base
820
+ * to the fetched origin tip so local == origin: no phantom merge, no divergence.
821
+ *
822
+ * MUST-FIX (#950 review): local base may carry PRE-EXISTING unpushed commits that are NOT the merge
823
+ * we just created (e.g. work stranded by an EARLIER push race). Committed work isn't dirty, so the
824
+ * clean-worktree guard doesn't catch it — a blind hard reset to origin would DISCARD it. So compute
825
+ * the local-only commits on the PRE-land base tip (`preLandBaseSha`, which excludes the merge we're
826
+ * intentionally dropping); if there are none, rewind to origin as before; if there are some, PRESERVE
827
+ * them by replaying onto fresh origin (a clean worktree is required to do this safely — otherwise
828
+ * REFUSE and surface for a steward, never reset over the unpushed work). Returns an explicit
829
+ * success/failure so the caller can VERIFY the base actually healed before reporting a recoverable
830
+ * state (a failed rollback must not masquerade as healed).
831
+ */
832
+ async function rewindBaseAfterPushRace(
833
+ repoRoot: string,
834
+ base: string,
835
+ advancedBaseTip: string,
836
+ preLandBaseSha: string | undefined,
837
+ upstreamSha: string,
838
+ baseWorktree: { path: string; dirty: boolean } | undefined,
839
+ dirtyBefore: Set<string> | undefined,
840
+ signal?: AbortSignal,
841
+ ): Promise<{ ok: true } | { ok: false; conflict?: boolean; error: string }> {
842
+ // Commits on the PRE-land base tip that origin lacks — pre-existing unpushed work, NOT the merge
843
+ // we just made (that lives only on advancedBaseTip). These must survive the rewind.
844
+ // `--first-parent` walks base's MAINLINE (the sequence of lands): each merge replays once as its
845
+ // first-parent delta (`-m 1`) instead of also re-applying the branch commits it already contains.
846
+ const localOnly = preLandBaseSha
847
+ ? (await mergeGit(["rev-list", "--reverse", "--first-parent", `${upstreamSha}..${preLandBaseSha}`], repoRoot, "rebase", `workspace merge scan unpushed ${base} commits before rewind`, signal)).stdout.split("\n").filter(Boolean)
848
+ : [];
849
+
850
+ if (localOnly.length === 0) {
851
+ // Nothing but the phantom merge to drop — safe to rewind straight to origin. A clean checkout is hard-reset in place; otherwise only the ref is moved via `update-ref`, passing `advancedBaseTip` as the expected old value, and a dirty checkout is then synced with `syncDirtyBaseWorktreeAfterRefAdvance`.
852
+ if (baseWorktree && !baseWorktree.dirty) {
853
+ const reset = await mergeGit(["reset", "--hard", upstreamSha], baseWorktree.path, "rebase", `workspace merge rewind ${base} to ${upstreamSha} after push race`, signal);
854
+ if (!reset.ok) return { ok: false, error: reset.stderr || `failed to rewind ${base} worktree to origin after push race` };
855
+ return { ok: true };
856
+ }
857
+ const update = await mergeGit(["update-ref", `refs/heads/${base}`, upstreamSha, advancedBaseTip], repoRoot, "rebase", `workspace merge rewind ${base} ref after push race`, signal);
858
+ if (!update.ok) return { ok: false, error: update.stderr || `failed to rewind ${base} ref to origin after push race` };
859
+ if (baseWorktree?.dirty && dirtyBefore) await syncDirtyBaseWorktreeAfterRefAdvance(base, baseWorktree, advancedBaseTip, upstreamSha, dirtyBefore, signal);
860
+ return { ok: true };
861
+ }
862
+
863
+ // Pre-existing unpushed commits present. Replaying requires a CLEAN base checkout; without one we
864
+ // refuse rather than risk discarding committed work — the branch and the unpushed commits both
865
+ // stay put and a steward reconciles. With a clean checkout the commits are replayed onto origin; the restore snapshot passed is `advancedBaseTip`, so a failed replay puts base back on the unpublished merge (not on `preLandBaseSha`).
866
+ if (!baseWorktree || baseWorktree.dirty) {
867
+ return { ok: false, error: `local ${base} carries ${localOnly.length} unpushed commit(s) not on origin and its checkout is ${baseWorktree ? "dirty" : "absent"}; refusing to rewind (would discard committed work) — needs manual/steward reconcile (#950)` };
868
+ }
869
+ return await replayCommitsOntoUpstream(baseWorktree.path, upstreamSha, localOnly, advancedBaseTip, base, signal);
870
+ }
871
+
663
872
  async function mergeRebaseFf(
664
873
  input: WorkspaceMergeInput,
665
874
  worktreePath: string,
@@ -782,6 +991,10 @@ async function mergeRebaseFf(
782
991
  // land: fall through to ref-plumbing (update-ref / synthesized no-ff merge) below, then
783
992
  // best-effort sync clean landed paths in the dirty checkout while preserving WIP (#681).
784
993
  let baseTip = headSha;
994
+ // Snapshot base's PRE-advance tip so a lost-push-race rewind can tell the merge we're about to
995
+ // create (which it should drop) apart from any PRE-EXISTING unpushed commits on base (which it
996
+ // must PRESERVE, not hard-reset over) — #950 data-loss guard.
997
+ const preLandBaseSha = (await mergeGit(["rev-parse", base], repoRoot, "rebase", `workspace merge snapshot ${base} tip before advance`, signal)).stdout || undefined;
785
998
  throwIfMergeAborted(signal);
786
999
  const baseWorktree = await worktreeForBranch(repoRoot, base, signal);
787
1000
  const dirtyBasePathsBefore = baseWorktree?.dirty ? await dirtyPathSet(baseWorktree.path, signal) : undefined;
@@ -830,14 +1043,54 @@ async function mergeRebaseFf(
830
1043
  const baseWorktreeSyncField = baseSync && !baseSync.reconciled ? { baseWorktreeSync: baseSync } : {};
831
1044
  if (baseSync && !baseSync.reconciled && baseWorktree) logBaseWorktreeStrand(base, baseWorktree.path, baseSync);
832
1045
 
833
- // Publish the advanced base so local and origin converge (#190). We verified
834
- // origin was an ancestor of base above, so this is a fast-forward; a failure
835
- // means origin raced us surface it instead of claiming an unpublished land.
1046
+ // Publish the advanced base so local and origin converge (#190). We verified origin was an
1047
+ // ancestor of base above, so this is a fast-forward — but that check ran BEFORE the base
1048
+ // advance, and origin can move in the fetch→push window (#950: a sibling host landing the same
1049
+ // origin, CI, a human push). If it did, this push is a non-ff and is rejected AFTER local base
1050
+ // already carries the merge commit. Leaving the merge stranded on local base is the historical
1051
+ // multi-host land wedge: local base diverges from origin, later previews see the branch as an
1052
+ // ancestor of local base → noop → terminal `merged` for work never published, and every land
1053
+ // after refuses on divergence. So on a push failure, re-fetch and try to publish in the SAME
1054
+ // execution; if origin genuinely moved past us, rewind local base to the fresh origin tip
1055
+ // (local == origin, no phantom, no divergence) and bounce to review_requested for the next scan.
836
1056
  let pushed = false;
837
1057
  if (upstream && remote && pushEnabled) {
838
1058
  throwIfMergeAborted(signal);
839
- const push = await mergeGit(["push", remote, `${base}:${base}`], worktreePath, "rebase", `workspace merge push ${base} to ${remote}`, signal);
840
- if (!push.ok) return head({ status: "review_requested", mergedSha: headSha, error: push.stderr || `git push to ${remote}/${base} failed` });
1059
+ let push = await mergeGit(["push", remote, `${base}:${base}`], worktreePath, "rebase", `workspace merge push ${base} to ${remote}`, signal);
1060
+ if (!push.ok) {
1061
+ throwIfMergeAborted(signal);
1062
+ const refetch = await mergeGit(["fetch", remote, base], worktreePath, "rebase", `workspace merge re-fetch ${remote}/${base} after push race`, signal);
1063
+ const upstreamSha = refetch.ok ? (await mergeGit(["rev-parse", "--verify", upstream], worktreePath, "rebase", `workspace merge resolve ${upstream} after push race`, signal)).stdout : "";
1064
+ // Transient loss (a ref lock, or a compatible parallel push): origin is still an ancestor
1065
+ // of our advanced base, so the merge is a clean fast-forward — just re-push it.
1066
+ if (upstreamSha && (await mergeGit(["merge-base", "--is-ancestor", upstreamSha, baseTip], worktreePath, "rebase", "workspace merge check base still ff after push race", signal)).ok) {
1067
+ throwIfMergeAborted(signal);
1068
+ push = await mergeGit(["push", remote, `${base}:${base}`], worktreePath, "rebase", `workspace merge re-push ${base} to ${remote} after race`, signal);
1069
+ }
1070
+ if (!push.ok) {
1071
+ // Origin moved past us — the advanced merge can't fast-forward. Rewind local base to the
1072
+ // fresh origin tip so nothing is stranded (no mergedSha: nothing landed), PRESERVING any
1073
+ // pre-existing unpushed commits (#950 MUST-FIX 1). Then VERIFY the base actually healed
1074
+ // before reporting the retryable review_requested — a failed rollback must surface a HARD
1075
+ // state (conflict → steward), never masquerade as healed (#950 MUST-FIX 2).
1076
+ if (!upstreamSha) {
1077
+ return head({ conflict: true, status: "conflict", error: `push to ${remote}/${base} failed and the fresh origin tip is unresolvable; cannot safely rewind ${base} — escalating (#950)` });
1078
+ }
1079
+ const rewind = await rewindBaseAfterPushRace(repoRoot, base, baseTip, preLandBaseSha, upstreamSha, baseWorktree, dirtyBasePathsBefore, signal);
1080
+ if (!rewind.ok) {
1081
+ return head({ conflict: true, status: "conflict", error: rewind.error });
1082
+ }
1083
+ // Verify, don't assume: local base must now contain the fresh origin tip AND must NOT still
1084
+ // carry the phantom merge we failed to publish. Anything else means the base is still wedged.
1085
+ const healedBaseSha = (await mergeGit(["rev-parse", base], repoRoot, "rebase", `workspace merge verify ${base} healed after push race`, signal)).stdout;
1086
+ const originContained = Boolean(healedBaseSha) && (await mergeGit(["merge-base", "--is-ancestor", upstreamSha, healedBaseSha], repoRoot, "rebase", `workspace merge verify origin contained in ${base} after rewind`, signal)).ok;
1087
+ const phantomDropped = Boolean(healedBaseSha) && !(await mergeGit(["merge-base", "--is-ancestor", baseTip, healedBaseSha], repoRoot, "rebase", `workspace merge verify phantom dropped from ${base} after rewind`, signal)).ok;
1088
+ if (!originContained || !phantomDropped) {
1089
+ return head({ conflict: true, status: "conflict", error: `rewind of ${base} after push race did not heal (origin ${originContained ? "contained" : "MISSING"}, phantom ${phantomDropped ? "dropped" : "STILL PRESENT"}); escalating rather than reporting healed (#950)` });
1090
+ }
1091
+ return head({ status: "review_requested", error: push.stderr || `git push to ${remote}/${base} failed` });
1092
+ }
1093
+ }
841
1094
  pushed = true;
842
1095
  }
843
1096