agent-relay-orchestrator 0.84.0 → 0.86.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-relay-orchestrator",
3
- "version": "0.84.0",
3
+ "version": "0.86.0",
4
4
  "description": "Agent Relay orchestrator — manages agent lifecycle across hosts",
5
5
  "type": "module",
6
6
  "bin": {
@@ -16,7 +16,7 @@
16
16
  "test": "bun test"
17
17
  },
18
18
  "dependencies": {
19
- "agent-relay-sdk": "0.2.64"
19
+ "agent-relay-sdk": "0.2.66"
20
20
  },
21
21
  "devDependencies": {
22
22
  "@types/bun": "latest",
package/src/control.ts CHANGED
@@ -6,6 +6,17 @@ import { readLocalProviderConfigs } from "./provider-config-migration";
6
6
  import { spawnAgent, stopSession, type SpawnOptions } from "./spawn";
7
7
  import { cleanupWorkspace, discardRecoveryBranch, mergeWorkspace, pruneWorktrees, reconcileWorkspace, refreshWorkspaceDeps, workspacesRoot } from "./workspace-probe";
8
8
  import { armWorkspacePrAutoMerge, mergeWorkspacePr, refreshWorkspacePrBranch } from "./workspace-pr";
9
+ import type { WorkspaceMergeResult } from "agent-relay-sdk";
10
+
11
+ // #638 — settle a `workspace.merge` command on whether it made progress, not on whether
12
+ // the merge function threw. Every no-progress outcome (origin moved ahead, an unpredicted
13
+ // replay conflict, a push race) returns merged:false with an `error` set; every genuine
14
+ // land / recycle / noop-resolve / PR-opened leaves `error` undefined. Reporting `succeeded`
15
+ // for an unlanded branch is what let the auto-merge job believe it was progressing and
16
+ // busy-loop the repo's merge lease. Mirrors the pr-merge / pr-arm settle contract.
17
+ export function mergeCommandStatus(result: WorkspaceMergeResult): "succeeded" | "failed" {
18
+ return result.error ? "failed" : "succeeded";
19
+ }
9
20
 
10
21
  interface ControlHandler {
11
22
  handleCommand(command: RelayCommand): Promise<boolean>;
@@ -132,7 +143,9 @@ export function createControlHandler(
132
143
  }
133
144
  : undefined,
134
145
  });
135
- await relay.updateCommand(command.id, "succeeded", result as unknown as Record<string, unknown>);
146
+ // #638 — settle `failed` (carrying the error) for a no-progress merge instead of
147
+ // `succeeded`; see mergeCommandStatus.
148
+ await relay.updateCommand(command.id, mergeCommandStatus(result), result as unknown as Record<string, unknown>, result.error);
136
149
  } else if (command.type === "workspace.pr-arm-auto-merge") {
137
150
  const rawPrNumber = command.params.prNumber;
138
151
  const result = armWorkspacePrAutoMerge({
@@ -178,6 +178,37 @@ function logFileDiagnostics(logFile: string): Pick<ManagedSessionExitDiagnostics
178
178
  }
179
179
  }
180
180
 
181
+ /** Map systemd ExecMain* / Result + a best-effort journal OOM probe into signal/exitCode/oom (#636). */
182
+ function terminationDiagnostics(systemd: ManagedSessionExitDiagnostics["systemd"], pid?: number): Pick<ManagedSessionExitDiagnostics, "signal" | "exitCode" | "oom"> {
183
+ const out: Pick<ManagedSessionExitDiagnostics, "signal" | "exitCode" | "oom"> = {};
184
+ if (systemd?.result === "oom-kill") {
185
+ out.oom = { source: "systemd", detail: "Result=oom-kill" };
186
+ }
187
+ if (systemd?.execMainCode && /killed|dumped/i.test(systemd.execMainCode) && systemd.execMainStatus) {
188
+ out.signal = /^\d+$/.test(systemd.execMainStatus) ? `signal ${systemd.execMainStatus}` : `SIG${systemd.execMainStatus.replace(/^SIG/i, "")}`;
189
+ } else if (systemd?.execMainCode === "exited" && systemd.execMainStatus && /^\d+$/.test(systemd.execMainStatus)) {
190
+ out.exitCode = Number(systemd.execMainStatus);
191
+ }
192
+ // Best-effort kernel OOM-killer probe by pid — wrapped so a missing/permission-denied journal
193
+ // never breaks diagnosis. Only consulted when systemd didn't already flag the OOM.
194
+ if (!out.oom && pid && process.platform === "linux") {
195
+ try {
196
+ const result = Bun.spawnSync(["journalctl", "--user", "-k", "--no-pager", "-n", "200", "--grep", `Killed process ${pid}|Out of memory`], {
197
+ stdin: "ignore",
198
+ stdout: "pipe",
199
+ stderr: "ignore",
200
+ });
201
+ const text = result.exitCode === 0 ? result.stdout.toString() : "";
202
+ if (new RegExp(`Killed process ${pid}\\b|Out of memory: Killed process ${pid}\\b`).test(text)) {
203
+ out.oom = { source: "journal", detail: `OOM-killer hit pid ${pid}` };
204
+ }
205
+ } catch {
206
+ // journal unavailable — leave OOM unset.
207
+ }
208
+ }
209
+ return out;
210
+ }
211
+
181
212
  function describeSessionExit(record: SessionRecord, diagnostics: Omit<ManagedSessionExitDiagnostics, "lastError">): string {
182
213
  if (record.provider === "claude") {
183
214
  const modelUnavailable = extractClaudeModelUnavailableMessage((diagnostics.logTail ?? []).join("\n"));
@@ -211,6 +242,8 @@ export function diagnoseSessionExit(input: { agentId?: string; policyName?: stri
211
242
  const terminalAvailable = tmuxHasSession(record.name, readRunnerInfo(record)?.tmuxSocket);
212
243
  const log = logFileDiagnostics(record.logFile);
213
244
  const runnerInfoPresent = record.runnerInfoFile ? existsSync(record.runnerInfoFile) : false;
245
+ const systemd = supervisor.type === "systemd" && supervisor.unit ? systemdUnitDiagnostics(supervisor.unit) : undefined;
246
+ const termination = terminationDiagnostics(systemd, record.pid ?? currentPid);
214
247
  const unavailable = [
215
248
  ...(log.logUnavailable ? [`stdout/stderr log unavailable: ${log.logUnavailable}`] : []),
216
249
  ...(log.logEmpty ? ["stdout/stderr log empty"] : []),
@@ -243,7 +276,10 @@ export function diagnoseSessionExit(input: { agentId?: string; policyName?: stri
243
276
  logTail: log.logTail,
244
277
  runnerInfoFile: record.runnerInfoFile,
245
278
  runnerInfoPresent,
246
- ...(supervisor.type === "systemd" && supervisor.unit ? { systemd: systemdUnitDiagnostics(supervisor.unit) } : {}),
279
+ ...(systemd ? { systemd } : {}),
280
+ ...(termination.signal ? { signal: termination.signal } : {}),
281
+ ...(termination.exitCode !== undefined ? { exitCode: termination.exitCode } : {}),
282
+ ...(termination.oom ? { oom: termination.oom } : {}),
247
283
  ...(unavailable.length ? { unavailable } : {}),
248
284
  };
249
285
  return {
@@ -421,6 +421,39 @@ function recordNoFfMerge(
421
421
  return { ok: true, mergeSha };
422
422
  }
423
423
 
424
+ /**
425
+ * Fast-forward the local `base` ref to its fetched `upstream` tip (#638 concurrent-lane
426
+ * recovery). The caller has verified base is a strict ancestor of upstream — a clean ff,
427
+ * no divergence, nothing to lose. When base is checked out in a clean worktree, ff it there
428
+ * so that working tree stays consistent; when there is no base worktree, or it is dirty
429
+ * (#644), advance the ref directly. Returns an error string only when the sync genuinely can't be
430
+ * performed, so the merge path can surface it as a no-progress failure.
431
+ */
432
+ function syncLocalBaseToUpstream(
433
+ repoRoot: string,
434
+ worktreePath: string,
435
+ base: string,
436
+ upstream: string,
437
+ ): { ok: true } | { ok: false; error: string } {
438
+ const upstreamSha = git(["rev-parse", "--verify", upstream], worktreePath).stdout;
439
+ if (!upstreamSha) return { ok: false, error: `cannot resolve ${upstream} to sync ${base}` };
440
+ const baseWorktree = worktreeForBranch(repoRoot, base);
441
+ // Sync IN the base worktree only when it's clean — that keeps its working tree consistent
442
+ // with the advanced ref (the pristine home-repo checkout). When the base worktree is dirty
443
+ // (#644: a human's WIP in the shared checkout) we must NOT refuse and stall the whole repo's
444
+ // lands: advance the ref directly with update-ref instead. This is immune to the checkout's
445
+ // state and never reads or writes the human's working tree — their uncommitted work is left
446
+ // exactly as-is (the ref moves underneath; their files on disk are untouched).
447
+ if (baseWorktree && !baseWorktree.dirty) {
448
+ const ff = git(["merge", "--ff-only", upstream], baseWorktree.path);
449
+ if (!ff.ok) return { ok: false, error: ff.stderr || `failed to fast-forward ${base} to ${upstream}` };
450
+ return { ok: true };
451
+ }
452
+ const update = git(["update-ref", `refs/heads/${base}`, upstreamSha], repoRoot);
453
+ if (!update.ok) return { ok: false, error: update.stderr || `failed to advance ${base} to ${upstream}` };
454
+ return { ok: true };
455
+ }
456
+
424
457
  function mergeRebaseFf(
425
458
  input: WorkspaceMergeInput,
426
459
  worktreePath: string,
@@ -433,11 +466,20 @@ function mergeRebaseFf(
433
466
  if (!base) return head({ status: "review_requested", error: "no base branch to merge into" });
434
467
  if (!branch) return head({ status: "review_requested", error: "cannot determine agent branch" });
435
468
 
436
- // Reconcile with origin before landing (#190/#203). When base tracks an
437
- // upstream (e.g. main -> origin/main) and we'll push, fetch it and refuse if
438
- // origin has moved ahead of local base: pushing would then be a non-fast-forward,
439
- // and we won't rewrite published history or strand a local-only land. The
440
- // refusal happens BEFORE we mutate anything, so a diverged base is a clean no-op.
469
+ // Reconcile with origin before landing (#190/#203/#638). When base tracks an
470
+ // upstream (e.g. main -> origin/main) and we'll push, fetch it and check whether
471
+ // origin has moved ahead of local base.
472
+ //
473
+ // Origin-ahead is the COMMON case under concurrency (#638): a sibling lane lands
474
+ // and advances origin/<base> while this lane still sits on a stale local base.
475
+ // Refusing here floods the steward on every normal multi-lane batch. Instead:
476
+ // - If local base carries nothing origin lacks (it's a strict ancestor of the
477
+ // fetched upstream), fast-forward local base to the upstream and land onto the
478
+ // FRESH origin (the no-ff merge below ties the branch onto the advanced base).
479
+ // This is a clean recovery — no steward escalation.
480
+ // - Refuse ONLY on genuine divergence: local base has commits not on origin, so
481
+ // a sync would rewrite/discard published-or-local history. A real failure must
482
+ // still set `error` so the relay's no-progress backoff (#638) can engage.
441
483
  const upstream = upstreamRef(worktreePath, base);
442
484
  const slash = upstream ? upstream.indexOf("/") : -1;
443
485
  const remote = slash > 0 ? upstream!.slice(0, slash) : undefined; // remote of a `remote/branch` upstream
@@ -445,7 +487,13 @@ function mergeRebaseFf(
445
487
  if (upstream && remote && pushEnabled) {
446
488
  git(["fetch", remote, base], worktreePath); // best-effort freshness; a stale ref can only under-detect divergence
447
489
  if (!git(["merge-base", "--is-ancestor", upstream, base], worktreePath).ok) {
448
- return head({ status: "review_requested", error: `origin moved ahead of local ${base}; sync ${base} with ${upstream} before landing` });
490
+ // Origin moved ahead. Sync-then-land iff local base is cleanly behind (ancestor
491
+ // of upstream); otherwise it's genuine divergence — refuse without mutating.
492
+ if (!git(["merge-base", "--is-ancestor", base, upstream], worktreePath).ok) {
493
+ return head({ status: "review_requested", error: `local ${base} has diverged from ${upstream} (commits not on origin); sync before landing` });
494
+ }
495
+ const synced = syncLocalBaseToUpstream(repoRoot, worktreePath, base, upstream);
496
+ if (!synced.ok) return head({ status: "review_requested", error: synced.error });
449
497
  }
450
498
  }
451
499
 
@@ -460,13 +508,16 @@ function mergeRebaseFf(
460
508
  const behind = countBehind(worktreePath, base);
461
509
 
462
510
  // Advance base. `baseTip` is base's new tip after the land: it equals headSha on a
463
- // clean fast-forward, or the merge commit on a no-ff merge. If base is checked out
464
- // somewhere, operate in that worktree so its working tree stays consistent; otherwise
465
- // move/synthesize the ref directly. Refuse if the base worktree has uncommitted changes.
511
+ // clean fast-forward, or the merge commit on a no-ff merge. Operate IN the base worktree
512
+ // only when it exists AND is clean — that keeps its working tree consistent with the
513
+ // advanced ref (the pristine home-repo checkout). When the base worktree is dirty
514
+ // (#644: a human's WIP in the shared checkout) we must NOT refuse and stall every lane's
515
+ // land: fall through to ref-plumbing (update-ref / synthesized no-ff merge) below, which
516
+ // advances refs/heads/<base> without reading or touching that working tree. The human's
517
+ // uncommitted work is left exactly as-is — the ref moves underneath, their files untouched.
466
518
  let baseTip = headSha;
467
519
  const baseWorktree = worktreeForBranch(repoRoot, base);
468
- if (baseWorktree) {
469
- if (baseWorktree.dirty) return head({ status: "review_requested", error: `base branch '${base}' has uncommitted changes in ${baseWorktree.path}` });
520
+ if (baseWorktree && !baseWorktree.dirty) {
470
521
  if (behind === 0) {
471
522
  const ff = git(["merge", "--ff-only", branch], baseWorktree.path);
472
523
  if (!ff.ok) return head({ status: "review_requested", error: ff.stderr || "fast-forward into base failed" });