agent-relay-orchestrator 0.84.0 → 0.85.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agent-relay-orchestrator",
3
- "version": "0.84.0",
3
+ "version": "0.85.0",
4
4
  "description": "Agent Relay orchestrator — manages agent lifecycle across hosts",
5
5
  "type": "module",
6
6
  "bin": {
@@ -16,7 +16,7 @@
16
16
  "test": "bun test"
17
17
  },
18
18
  "dependencies": {
19
- "agent-relay-sdk": "0.2.64"
19
+ "agent-relay-sdk": "0.2.65"
20
20
  },
21
21
  "devDependencies": {
22
22
  "@types/bun": "latest",
package/src/control.ts CHANGED
@@ -6,6 +6,17 @@ import { readLocalProviderConfigs } from "./provider-config-migration";
6
6
  import { spawnAgent, stopSession, type SpawnOptions } from "./spawn";
7
7
  import { cleanupWorkspace, discardRecoveryBranch, mergeWorkspace, pruneWorktrees, reconcileWorkspace, refreshWorkspaceDeps, workspacesRoot } from "./workspace-probe";
8
8
  import { armWorkspacePrAutoMerge, mergeWorkspacePr, refreshWorkspacePrBranch } from "./workspace-pr";
9
+ import type { WorkspaceMergeResult } from "agent-relay-sdk";
10
+
11
+ // #638 — settle a `workspace.merge` command on whether it made progress, not on whether
12
+ // the merge function threw. Every no-progress outcome (origin moved ahead, an unpredicted
13
+ // replay conflict, a push race) returns merged:false with an `error` set; every genuine
14
+ // land / recycle / noop-resolve / PR-opened leaves `error` undefined. Reporting `succeeded`
15
+ // for an unlanded branch is what let the auto-merge job believe it was progressing and
16
+ // busy-loop the repo's merge lease. Mirrors the pr-merge / pr-arm settle contract.
17
+ export function mergeCommandStatus(result: WorkspaceMergeResult): "succeeded" | "failed" {
18
+ return result.error ? "failed" : "succeeded";
19
+ }
9
20
 
10
21
  interface ControlHandler {
11
22
  handleCommand(command: RelayCommand): Promise<boolean>;
@@ -132,7 +143,9 @@ export function createControlHandler(
132
143
  }
133
144
  : undefined,
134
145
  });
135
- await relay.updateCommand(command.id, "succeeded", result as unknown as Record<string, unknown>);
146
+ // #638 — settle `failed` (carrying the error) for a no-progress merge (one that
147
+ // returned an `error`) instead of `succeeded`; see mergeCommandStatus.
148
+ await relay.updateCommand(command.id, mergeCommandStatus(result), result as unknown as Record<string, unknown>, result.error);
136
149
  } else if (command.type === "workspace.pr-arm-auto-merge") {
137
150
  const rawPrNumber = command.params.prNumber;
138
151
  const result = armWorkspacePrAutoMerge({
@@ -178,6 +178,37 @@ function logFileDiagnostics(logFile: string): Pick<ManagedSessionExitDiagnostics
178
178
  }
179
179
  }
180
180
 
181
+ /** Map systemd ExecMain* / Result + a best-effort journal OOM probe into signal/exitCode/oom (#636). */
182
+ function terminationDiagnostics(systemd: ManagedSessionExitDiagnostics["systemd"], pid?: number): Pick<ManagedSessionExitDiagnostics, "signal" | "exitCode" | "oom"> {
183
+ const out: Pick<ManagedSessionExitDiagnostics, "signal" | "exitCode" | "oom"> = {};
184
+ if (systemd?.result === "oom-kill") {
185
+ out.oom = { source: "systemd", detail: "Result=oom-kill" };
186
+ }
187
+ if (systemd?.execMainCode && /killed|dumped/i.test(systemd.execMainCode) && systemd.execMainStatus) {
188
+ out.signal = /^\d+$/.test(systemd.execMainStatus) ? `signal ${systemd.execMainStatus}` : `SIG${systemd.execMainStatus.replace(/^SIG/i, "")}`;
189
+ } else if (systemd?.execMainCode === "exited" && systemd.execMainStatus && /^\d+$/.test(systemd.execMainStatus)) {
190
+ out.exitCode = Number(systemd.execMainStatus);
191
+ }
192
+ // Best-effort kernel OOM-killer probe by pid — wrapped so a missing/permission-denied journal
193
+ // never breaks diagnosis. Only consulted when systemd didn't already flag the OOM.
194
+ if (!out.oom && pid && process.platform === "linux") {
195
+ try {
196
+ const result = Bun.spawnSync(["journalctl", "--user", "-k", "--no-pager", "-n", "200", "--grep", `Killed process ${pid}|Out of memory`], {
197
+ stdin: "ignore",
198
+ stdout: "pipe",
199
+ stderr: "ignore",
200
+ });
201
+ const text = result.exitCode === 0 ? result.stdout.toString() : "";
202
+ if (new RegExp(`Killed process ${pid}\\b|Out of memory: Killed process ${pid}\\b`).test(text)) {
203
+ out.oom = { source: "journal", detail: `OOM-killer hit pid ${pid}` };
204
+ }
205
+ } catch {
206
+ // journal unavailable — leave OOM unset.
207
+ }
208
+ }
209
+ return out;
210
+ }
211
+
181
212
  function describeSessionExit(record: SessionRecord, diagnostics: Omit<ManagedSessionExitDiagnostics, "lastError">): string {
182
213
  if (record.provider === "claude") {
183
214
  const modelUnavailable = extractClaudeModelUnavailableMessage((diagnostics.logTail ?? []).join("\n"));
@@ -211,6 +242,8 @@ export function diagnoseSessionExit(input: { agentId?: string; policyName?: stri
211
242
  const terminalAvailable = tmuxHasSession(record.name, readRunnerInfo(record)?.tmuxSocket);
212
243
  const log = logFileDiagnostics(record.logFile);
213
244
  const runnerInfoPresent = record.runnerInfoFile ? existsSync(record.runnerInfoFile) : false;
245
+ const systemd = supervisor.type === "systemd" && supervisor.unit ? systemdUnitDiagnostics(supervisor.unit) : undefined;
246
+ const termination = terminationDiagnostics(systemd, record.pid ?? currentPid);
214
247
  const unavailable = [
215
248
  ...(log.logUnavailable ? [`stdout/stderr log unavailable: ${log.logUnavailable}`] : []),
216
249
  ...(log.logEmpty ? ["stdout/stderr log empty"] : []),
@@ -243,7 +276,10 @@ export function diagnoseSessionExit(input: { agentId?: string; policyName?: stri
243
276
  logTail: log.logTail,
244
277
  runnerInfoFile: record.runnerInfoFile,
245
278
  runnerInfoPresent,
246
- ...(supervisor.type === "systemd" && supervisor.unit ? { systemd: systemdUnitDiagnostics(supervisor.unit) } : {}),
279
+ ...(systemd ? { systemd } : {}),
280
+ ...(termination.signal ? { signal: termination.signal } : {}),
281
+ ...(termination.exitCode !== undefined ? { exitCode: termination.exitCode } : {}),
282
+ ...(termination.oom ? { oom: termination.oom } : {}),
247
283
  ...(unavailable.length ? { unavailable } : {}),
248
284
  };
249
285
  return {
@@ -421,6 +421,34 @@ function recordNoFfMerge(
421
421
  return { ok: true, mergeSha };
422
422
  }
423
423
 
424
+ /**
425
+ * Fast-forward the local `base` ref to its fetched `upstream` tip (#638 concurrent-lane
426
+ * recovery). The caller has verified base is a strict ancestor of upstream — a clean ff,
427
+ * no divergence, nothing to lose. When base is checked out in a worktree, ff it there so
428
+ * that working tree stays consistent (refuse if it's dirty — can't ff cleanly); otherwise
429
+ * advance the ref directly. Returns an error string only when the sync genuinely can't be
430
+ * performed, so the merge path can surface it as a no-progress failure.
431
+ */
432
+ function syncLocalBaseToUpstream(
433
+ repoRoot: string,
434
+ worktreePath: string,
435
+ base: string,
436
+ upstream: string,
437
+ ): { ok: true } | { ok: false; error: string } {
438
+ const upstreamSha = git(["rev-parse", "--verify", upstream], worktreePath).stdout;
439
+ if (!upstreamSha) return { ok: false, error: `cannot resolve ${upstream} to sync ${base}` };
440
+ const baseWorktree = worktreeForBranch(repoRoot, base);
441
+ if (baseWorktree) {
442
+ if (baseWorktree.dirty) return { ok: false, error: `base branch '${base}' has uncommitted changes in ${baseWorktree.path}; cannot sync to ${upstream}` };
443
+ const ff = git(["merge", "--ff-only", upstream], baseWorktree.path);
444
+ if (!ff.ok) return { ok: false, error: ff.stderr || `failed to fast-forward ${base} to ${upstream}` };
445
+ return { ok: true };
446
+ }
447
+ const update = git(["update-ref", `refs/heads/${base}`, upstreamSha], repoRoot);
448
+ if (!update.ok) return { ok: false, error: update.stderr || `failed to advance ${base} to ${upstream}` };
449
+ return { ok: true };
450
+ }
451
+
424
452
  function mergeRebaseFf(
425
453
  input: WorkspaceMergeInput,
426
454
  worktreePath: string,
@@ -433,11 +461,20 @@ function mergeRebaseFf(
433
461
  if (!base) return head({ status: "review_requested", error: "no base branch to merge into" });
434
462
  if (!branch) return head({ status: "review_requested", error: "cannot determine agent branch" });
435
463
 
436
- // Reconcile with origin before landing (#190/#203). When base tracks an
437
- // upstream (e.g. main -> origin/main) and we'll push, fetch it and refuse if
438
- // origin has moved ahead of local base: pushing would then be a non-fast-forward,
439
- // and we won't rewrite published history or strand a local-only land. The
440
- // refusal happens BEFORE we mutate anything, so a diverged base is a clean no-op.
464
+ // Reconcile with origin before landing (#190/#203/#638). When base tracks an
465
+ // upstream (e.g. main -> origin/main) and we'll push, fetch it and check whether
466
+ // origin has moved ahead of local base.
467
+ //
468
+ // Origin-ahead is the COMMON case under concurrency (#638): a sibling lane lands
469
+ // and advances origin/<base> while this lane still sits on a stale local base.
470
+ // Refusing here floods the steward on every normal multi-lane batch. Instead:
471
+ // - If local base carries nothing origin lacks (it's a strict ancestor of the
472
+ // fetched upstream), fast-forward local base to the upstream and land onto the
473
+ // FRESH origin (the no-ff merge below ties the branch onto the advanced base).
474
+ // This is a clean recovery — no steward escalation.
475
+ // - Refuse ONLY on genuine divergence: local base has commits not on origin, so
476
+ // a sync would rewrite/discard published-or-local history. A real failure must
477
+ // still set `error` so the relay's no-progress backoff (#638) can engage.
441
478
  const upstream = upstreamRef(worktreePath, base);
442
479
  const slash = upstream ? upstream.indexOf("/") : -1;
443
480
  const remote = slash > 0 ? upstream!.slice(0, slash) : undefined; // remote of a `remote/branch` upstream
@@ -445,7 +482,13 @@ function mergeRebaseFf(
445
482
  if (upstream && remote && pushEnabled) {
446
483
  git(["fetch", remote, base], worktreePath); // best-effort freshness; a stale ref can only under-detect divergence
447
484
  if (!git(["merge-base", "--is-ancestor", upstream, base], worktreePath).ok) {
448
- return head({ status: "review_requested", error: `origin moved ahead of local ${base}; sync ${base} with ${upstream} before landing` });
485
+ // Origin moved ahead. Sync-then-land iff local base is cleanly behind (ancestor
486
+ // of upstream); otherwise it's genuine divergence — refuse without mutating.
487
+ if (!git(["merge-base", "--is-ancestor", base, upstream], worktreePath).ok) {
488
+ return head({ status: "review_requested", error: `local ${base} has diverged from ${upstream} (commits not on origin); sync before landing` });
489
+ }
490
+ const synced = syncLocalBaseToUpstream(repoRoot, worktreePath, base, upstream);
491
+ if (!synced.ok) return head({ status: "review_requested", error: synced.error });
449
492
  }
450
493
  }
451
494