agent-relay-server 0.27.1 → 0.27.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,11 +5,15 @@
5
5
  // visible to the agent or the dashboard, so unlanded work can sit stranded for
6
6
  // weeks (one real casualty: a CI-guard test, recovered by hand).
7
7
  //
8
- // THE invariant (single home, see `worktreeReapable`): reaping is gated on
9
- // "nothing would be lost" — landed or empty — NEVER on session liveness or a
10
- // timer. A worktree holding un-landed commits is flagged for attention, never
11
- // force-removed. This module is the disk⇄DB reconciler the GC's `git worktree
12
- // prune` (a no-op while the directory exists) never was.
8
+ // THE invariant (single home, see `worktreeReapable`): a worktree is reaped only
9
+ // when BOTH hold — "nothing would be lost" (landed or empty; un-landed commits are
10
+ // flagged, never force-removed) AND "nobody is using it" (the owner is dead, and no
11
+ // live row claims the path). #278 proved the first half alone is not enough: a
12
+ // post-land recycled worktree (ahead 0) owned by a LIVE session is reap-safe by
13
+ // land-state, yet destroying it kills that session's toolchain mid-flight. Path-keyed
14
+ // row matching (never repoRoot-scoped) plus an owner-liveness guard enforce the
15
+ // second half. This module is the disk⇄DB reconciler the GC's `git worktree prune`
16
+ // (a no-op while the directory exists) never was.
13
17
 
14
18
  import { resolve } from "node:path";
15
19
  import { RELAY_TOKEN_HEADER } from "agent-relay-sdk";
@@ -20,6 +24,7 @@ import { emitRelayEvent } from "./events";
20
24
  import { isPathWithinBase } from "./utils";
21
25
  import { TERMINAL_WORKSPACE_STATUSES, worktreeReapable, type WorktreeReapState } from "./workspace-phase";
22
26
  import { isOwnerAlive } from "./workspace-merge";
27
+ import { applyWorkspaceAction } from "./workspace-actions";
23
28
 
24
29
  // Don't re-flag the same un-landed orphan every sweep — surface it once, then
25
30
  // stay quiet for this window. In-memory (keyed by worktree path) like the
@@ -29,10 +34,21 @@ const UNLANDED_FLAG_COOLDOWN_MS = Number(process.env.AGENT_RELAY_ORPHAN_FLAG_COO
29
34
  // remove them (parity with the session reaper's detect-only switch).
30
35
  const orphanWorktreeReapEnabled = () => process.env.AGENT_RELAY_ORPHAN_WORKTREE_REAP !== "0";
31
36
  const flaggedAt = new Map<string, number>();
32
- const IN_FLIGHT_MISSING_WORKTREE_STATUSES = new Set<WorkspaceStatus>(["merge_planned", "cleanup_requested"]);
37
+ const IN_FLIGHT_WORKSPACE_STATUSES = new Set<WorkspaceStatus>(["merge_planned", "cleanup_requested"]);
38
+
39
+ // #279 dead-owner grace window: a tracked worktree whose owner has died is only
40
+ // reaped after the owner is observed dead continuously for this window, so a
41
+ // reconnecting agent isn't raced. Keyed by resolved worktree path; in-memory like
42
+ // the session reaper's tracker (a restart re-arms the clock — conservative).
43
+ const orphanGraceMs = (): number => {
44
+ const v = Number(process.env.AGENT_RELAY_ORPHAN_GRACE_MS);
45
+ return Number.isFinite(v) && v >= 0 ? v : 30 * 60 * 1000;
46
+ };
47
+ const deadOwnerTracker = new Map<string, { firstSeenDeadAt: number }>();
33
48
 
34
49
  export function resetOrphanWorktreeStateForTests(): void {
35
50
  flaggedAt.clear();
51
+ deadOwnerTracker.clear();
36
52
  }
37
53
 
38
54
  interface OnlineOrchestrator {
@@ -138,6 +154,17 @@ export interface CollectOrphansResult {
138
154
  baseSha?: string;
139
155
  ownerAgentId?: string;
140
156
  }>;
157
+ /** Non-terminal isolated rows whose worktree IS present on disk (#279). Tracked,
158
+ * not orphaned — the reaper decides on owner-liveness + grace + land-state. */
159
+ deadOwnerWorkspaces: Array<{
160
+ workspaceId: string;
161
+ worktreePath: string;
162
+ repoRoot: string;
163
+ branch?: string;
164
+ baseRef?: string;
165
+ baseSha?: string;
166
+ ownerAgentId?: string;
167
+ }>;
141
168
  reason?: string;
142
169
  }
143
170
 
@@ -150,11 +177,27 @@ export interface CollectOrphansResult {
150
177
  */
151
178
  export async function collectWorkspaceOrphans(): Promise<CollectOrphansResult> {
152
179
  const orchestrators = onlineOrchestrators();
153
- if (!orchestrators.length) return { orphans: [], missingWorktrees: [], reason: "no online orchestrators" };
180
+ if (!orchestrators.length) return { orphans: [], missingWorktrees: [], deadOwnerWorkspaces: [], reason: "no online orchestrators" };
154
181
 
155
182
  const all = listWorkspaces();
156
183
  const orphans: WorkspaceOrphan[] = [];
157
184
  const missingWorktrees: CollectOrphansResult["missingWorktrees"] = [];
185
+ const deadOwnerWorkspaces: CollectOrphansResult["deadOwnerWorkspaces"] = [];
186
+
187
+ // Worktree paths are globally unique, so a row from ANY repoRoot that records a
188
+ // path is the authoritative claim on it. Match disk→DB across ALL rows, NEVER
189
+ // scoped to one repoRoot: the scoped version false-orphaned chained workspaces
190
+ // (managed session → base checkout → isolated worktree, whose row records the base
191
+ // checkout as repoRoot, not the main repo the probe is seeded from), so the reaper
192
+ // destroyed a live session's worktree (#278).
193
+ const rowsByPath = new Map(all.filter((ws) => ws.worktreePath).map((ws) => [resolve(ws.worktreePath), ws]));
194
+
195
+ // Union of every probed repo's on-disk worktrees + the repoRoots we actually
196
+ // reached. The DB→disk (missing-worktree) pass runs ONCE, globally, after the loop
197
+ // against these — per-repo scoping there falsely flagged chained rows as missing
198
+ // (their worktree lives under a different repo's probe than the one iterating). #278
199
+ const onDiskAll = new Set<string>();
200
+ const probedRepoRoots = new Set<string>();
158
201
 
159
202
  for (const repoRoot of knownRepoRoots(all)) {
160
203
  const orch = orchestrators.find((candidate) => candidate.apiUrl && isPathWithinBase(repoRoot, candidate.baseDir));
@@ -162,40 +205,32 @@ export async function collectWorkspaceOrphans(): Promise<CollectOrphansResult> {
162
205
  const probe = await fetchHostProbe(orch.apiUrl, repoRoot);
163
206
  if (!probe?.worktrees) continue;
164
207
 
165
- const liveRowsByPath = new Map(
166
- all
167
- .filter((ws) => ws.repoRoot === repoRoot && ws.worktreePath && !TERMINAL_WORKSPACE_STATUSES.has(ws.status))
168
- .map((ws) => [resolve(ws.worktreePath), ws]),
169
- );
170
- const onDisk = new Set(probe.worktrees.map((wt) => (wt.path ? resolve(wt.path) : "")).filter(Boolean));
171
-
172
- // DB→disk drift: a live isolated row whose worktree is no longer on disk.
173
- for (const [path, ws] of liveRowsByPath) {
174
- if (ws.mode === "isolated" && !onDisk.has(path)) {
175
- missingWorktrees.push({
176
- workspaceId: ws.id,
177
- worktreePath: ws.worktreePath,
178
- repoRoot,
179
- status: ws.status,
180
- branch: ws.branch,
181
- baseRef: ws.baseRef,
182
- baseSha: ws.baseSha,
183
- ownerAgentId: ws.ownerAgentId,
184
- });
185
- }
186
- }
208
+ probedRepoRoots.add(resolve(repoRoot));
209
+ for (const wt of probe.worktrees) if (wt.path) onDiskAll.add(resolve(wt.path));
187
210
 
188
- // disk→DB drift: a worktree on disk with no live row.
189
- const rowsByPath = new Map(
190
- all.filter((ws) => ws.repoRoot === repoRoot && ws.worktreePath).map((ws) => [resolve(ws.worktreePath), ws]),
191
- );
211
+ // disk→DB drift: a worktree on disk with no live row → orphan. A worktree with a
212
+ // live isolated row is NOT an orphan, but if its owner has died it's the #279
213
+ // dead-owner case — surface it for the reaper to evaluate (liveness + grace).
192
214
  for (const worktree of probe.worktrees) {
193
215
  if (!worktree.path || resolve(worktree.path) === resolve(repoRoot)) continue;
194
216
  // Only agent-relay-created worktrees (agent/* branches) are reclaimable —
195
217
  // never touch a user's own linked worktrees.
196
218
  if (!worktree.branch?.startsWith("agent/")) continue;
197
219
  const row = rowsByPath.get(resolve(worktree.path));
198
- if (row && !TERMINAL_WORKSPACE_STATUSES.has(row.status)) continue; // tracked & live
220
+ if (row && !TERMINAL_WORKSPACE_STATUSES.has(row.status)) {
221
+ if (row.mode === "isolated") {
222
+ deadOwnerWorkspaces.push({
223
+ workspaceId: row.id,
224
+ worktreePath: worktree.path,
225
+ repoRoot: row.repoRoot,
226
+ branch: row.branch ?? worktree.branch,
227
+ baseRef: row.baseRef,
228
+ baseSha: row.baseSha,
229
+ ownerAgentId: row.ownerAgentId,
230
+ });
231
+ }
232
+ continue; // tracked & live — not an orphan
233
+ }
199
234
 
200
235
  const orphan: WorkspaceOrphan = {
201
236
  worktreePath: worktree.path,
@@ -223,7 +258,32 @@ export async function collectWorkspaceOrphans(): Promise<CollectOrphansResult> {
223
258
  }
224
259
  }
225
260
 
226
- return { orphans, missingWorktrees };
261
+ // DB→disk drift: a non-terminal isolated row whose worktree is gone from disk. Run
262
+ // once, globally, against the union of probed repos. Flag "missing" only when the
263
+ // path is absent from EVERY probe AND the row was COVERABLE — its repoRoot was
264
+ // probed, OR its repoRoot is itself a worktree we saw on disk (the chained case: an
265
+ // isolated row's repoRoot is its session base checkout, a linked worktree of a
266
+ // probed repo). The coverability gate is what stops rows under an un-probeable host
267
+ // from being falsely flagged missing. #278
268
+ for (const ws of all) {
269
+ if (ws.mode !== "isolated" || !ws.worktreePath) continue;
270
+ if (TERMINAL_WORKSPACE_STATUSES.has(ws.status)) continue;
271
+ if (onDiskAll.has(resolve(ws.worktreePath))) continue;
272
+ const coverable = probedRepoRoots.has(resolve(ws.repoRoot)) || onDiskAll.has(resolve(ws.repoRoot));
273
+ if (!coverable) continue;
274
+ missingWorktrees.push({
275
+ workspaceId: ws.id,
276
+ worktreePath: ws.worktreePath,
277
+ repoRoot: ws.repoRoot,
278
+ status: ws.status,
279
+ branch: ws.branch,
280
+ baseRef: ws.baseRef,
281
+ baseSha: ws.baseSha,
282
+ ownerAgentId: ws.ownerAgentId,
283
+ });
284
+ }
285
+
286
+ return { orphans, missingWorktrees, deadOwnerWorkspaces };
227
287
  }
228
288
 
229
289
  function dispatchCleanup(orch: OnlineOrchestrator, orphan: WorkspaceOrphan): string {
@@ -254,7 +314,7 @@ function dispatchCleanup(orch: OnlineOrchestrator, orphan: WorkspaceOrphan): str
254
314
  * directions surface. Never removes on uncertainty.
255
315
  */
256
316
  export async function reapOrphanedWorktrees(): Promise<Record<string, unknown>> {
257
- const { orphans, missingWorktrees, reason } = await collectWorkspaceOrphans();
317
+ const { orphans, missingWorktrees, deadOwnerWorkspaces, reason } = await collectWorkspaceOrphans();
258
318
  if (reason) return { skipped: reason };
259
319
 
260
320
  const orchestrators = onlineOrchestrators();
@@ -263,14 +323,27 @@ export async function reapOrphanedWorktrees(): Promise<Record<string, unknown>>
263
323
  const flagged: string[] = [];
264
324
  const autoAbandoned: string[] = [];
265
325
  const flaggedMissingWorktrees: string[] = [];
326
+ const deadOwnerReaped: string[] = [];
327
+ const deadOwnerFlagged: string[] = [];
266
328
  const now = Date.now();
267
329
 
330
+ // Defense in depth (#278): the matching fix in collectWorkspaceOrphans should keep
331
+ // any path with a live, non-terminal row out of `orphans`. This is the belt to that
332
+ // suspenders — re-read rows fresh and never reap a path one still claims, even if a
333
+ // future refactor reintroduces a lookup gap. False-reaping a live session is fatal.
334
+ const nonTerminalRowsByPath = new Map(
335
+ listWorkspaces()
336
+ .filter((ws) => ws.worktreePath && !TERMINAL_WORKSPACE_STATUSES.has(ws.status))
337
+ .map((ws) => [resolve(ws.worktreePath), ws]),
338
+ );
339
+
268
340
  for (const orphan of orphans) {
269
341
  const orch = orchestrators.find((candidate) => isPathWithinBase(orphan.repoRoot, candidate.baseDir));
270
342
  if (!orch) continue;
271
343
 
272
344
  if (orphan.safeToReap === true) {
273
345
  if (!reapEnabled) continue; // detect-only mode
346
+ if (nonTerminalRowsByPath.has(resolve(orphan.worktreePath))) continue; // still claimed — never reap (#278)
274
347
  const commandId = dispatchCleanup(orch, orphan);
275
348
  reaped.push(orphan.worktreePath);
276
349
  flaggedAt.delete(orphan.worktreePath);
@@ -315,7 +388,7 @@ export async function reapOrphanedWorktrees(): Promise<Record<string, unknown>>
315
388
  if (!workspace || TERMINAL_WORKSPACE_STATUSES.has(workspace.status) || workspace.mode !== "isolated" || !workspace.worktreePath) continue;
316
389
  const key = `missing:${workspace.worktreePath}`;
317
390
  const ownerAlive = isOwnerAlive(workspace.ownerAgentId);
318
- const inFlight = IN_FLIGHT_MISSING_WORKTREE_STATUSES.has(workspace.status);
391
+ const inFlight = IN_FLIGHT_WORKSPACE_STATUSES.has(workspace.status);
319
392
  const last = flaggedAt.get(key) ?? 0;
320
393
  if (ownerAlive || inFlight) {
321
394
  if (now - last < UNLANDED_FLAG_COOLDOWN_MS) continue;
@@ -410,9 +483,95 @@ export async function reapOrphanedWorktrees(): Promise<Record<string, unknown>>
410
483
  });
411
484
  }
412
485
 
486
+ // #279: a tracked worktree whose owner died — the common stale case the reaper
487
+ // never handled (inverse of #278's false kill). Gate on owner-liveness + a grace
488
+ // window + "nothing would be lost", then dispatch through the GUARDED action (the
489
+ // #254 isOwnerAlive re-check at dispatch is the safety net against a stale read).
490
+ const graceMs = orphanGraceMs();
491
+ const liveDeadOwnerKeys = new Set<string>();
492
+ for (const cand of deadOwnerWorkspaces) {
493
+ const workspace = getWorkspace(cand.workspaceId);
494
+ if (!workspace || workspace.mode !== "isolated" || !workspace.worktreePath) continue;
495
+ if (TERMINAL_WORKSPACE_STATUSES.has(workspace.status)) continue;
496
+ if (IN_FLIGHT_WORKSPACE_STATUSES.has(workspace.status)) continue; // cleanup/merge already dispatched
497
+ const key = `dead-owner:${resolve(workspace.worktreePath)}`;
498
+ liveDeadOwnerKeys.add(key);
499
+
500
+ if (isOwnerAlive(workspace.ownerAgentId)) {
501
+ deadOwnerTracker.delete(key); // owner alive — reset the grace window (regression vs #278)
502
+ continue;
503
+ }
504
+
505
+ // Owner observed dead. Require continuous dead-ness for the grace window so a
506
+ // reconnecting agent isn't raced; the in-memory tracker re-arms on restart. A
507
+ // grace of 0 reaps on the first dead observation (the env's escape hatch).
508
+ const tracked = deadOwnerTracker.get(key);
509
+ const firstSeenDeadAt = tracked?.firstSeenDeadAt ?? now;
510
+ if (!tracked) deadOwnerTracker.set(key, { firstSeenDeadAt });
511
+ if (now - firstSeenDeadAt < graceMs) continue;
512
+
513
+ // Grace elapsed + owner dead. Gate on land-state — "nothing would be lost".
514
+ const orch = orchestrators.find((candidate) => candidate.apiUrl && isPathWithinBase(workspace.repoRoot, candidate.baseDir));
515
+ const preview = orch?.apiUrl
516
+ ? await fetchWorktreeReapState(orch.apiUrl, workspace.worktreePath, workspace.baseRef)
517
+ : null;
518
+ const safeToReap = preview && !preview.missing && !preview.error
519
+ ? worktreeReapable({ landed: preview.landed, ahead: preview.ahead, unmergedAhead: preview.unmergedAhead, dirtyCount: preview.dirtyCount })
520
+ : undefined;
521
+
522
+ if (safeToReap === true) {
523
+ if (!reapEnabled) continue; // detect-only mode
524
+ // Route through the guarded action (#279): no `force`, so a flipped-alive owner
525
+ // is rejected (409) — our safety net. Also does the cleanup_requested transition
526
+ // + audit. Attributed to a distinct requester so incidents are traceable.
527
+ const result = applyWorkspaceAction(workspace, {
528
+ action: "cleanup",
529
+ agentId: "workspace-dead-owner-reaper",
530
+ auditMetadata: { source: "server", maintenanceJobId: "workspace-orphan-reaper", requestedBy: "workspace-dead-owner-reaper", deadOwner: workspace.ownerAgentId },
531
+ });
532
+ if (!result.ok) continue; // guard tripped (owner came back) or no owning orchestrator — re-evaluate next sweep
533
+ if (result.command) {
534
+ emitRelayEvent({ type: `command.${result.command.status}`, source: result.command.source, subject: result.command.id, data: { command: result.command } });
535
+ }
536
+ deadOwnerReaped.push(workspace.id);
537
+ deadOwnerTracker.delete(key);
538
+ continue;
539
+ }
540
+
541
+ // Un-landed or un-probeable: flag as stranded (dead owner), never remove.
542
+ const flagKey = `dead-owner-stranded:${resolve(workspace.worktreePath)}`;
543
+ liveDeadOwnerKeys.add(flagKey);
544
+ const last = flaggedAt.get(flagKey) ?? 0;
545
+ if (now - last < UNLANDED_FLAG_COOLDOWN_MS) continue;
546
+ flaggedAt.set(flagKey, now);
547
+ deadOwnerFlagged.push(workspace.id);
548
+ const detail = safeToReap === undefined
549
+ ? "host could not be probed for land-state"
550
+ : (preview && (preview.dirtyCount ?? 0) > 0)
551
+ ? "uncommitted changes in the worktree"
552
+ : `${preview?.unmergedAhead ?? preview?.ahead ?? "?"} un-landed commit(s)`;
553
+ createActivityEvent({
554
+ clientId: `workspace-dead-owner-stranded-${workspace.id}-${now}`,
555
+ kind: "state",
556
+ title: "Dead-owner worktree needs attention",
557
+ body: `${workspace.branch ?? workspace.id} in ${workspace.repoRoot} — owning agent ${workspace.ownerAgentId ?? "?"} is gone and the worktree holds work that hasn't landed (${detail}). Recover the commits, then clean it up explicitly.`,
558
+ meta: workspace.branch ?? workspace.id,
559
+ icon: "ti-alert-triangle",
560
+ view: "orchestrators",
561
+ metadata: { source: "server", maintenanceJobId: "workspace-orphan-reaper", workspaceId: workspace.id, worktreePath: workspace.worktreePath, branch: workspace.branch, ownerAgentId: workspace.ownerAgentId, deadOwner: true },
562
+ });
563
+ }
564
+ // Drop grace-tracker entries for worktrees that are gone (reaped/landed) so a
565
+ // future re-orphaning re-arms the window from scratch.
566
+ for (const key of deadOwnerTracker.keys()) if (!liveDeadOwnerKeys.has(key)) deadOwnerTracker.delete(key);
567
+
413
568
  // Forget cooldown entries for orphans that are gone (reaped/recovered) so a
414
569
  // future re-orphaning of the same path re-announces immediately.
415
- const liveKeys = new Set([...orphans.map((o) => o.worktreePath), ...missingWorktrees.map((m) => `missing:${m.worktreePath}`)]);
570
+ const liveKeys = new Set([
571
+ ...orphans.map((o) => o.worktreePath),
572
+ ...missingWorktrees.map((m) => `missing:${m.worktreePath}`),
573
+ ...liveDeadOwnerKeys,
574
+ ]);
416
575
  for (const key of flaggedAt.keys()) if (!liveKeys.has(key) && !reaped.includes(key)) flaggedAt.delete(key);
417
576
 
418
577
  return {
@@ -422,6 +581,8 @@ export async function reapOrphanedWorktrees(): Promise<Record<string, unknown>>
422
581
  autoAbandoned,
423
582
  flaggedMissingWorktrees,
424
583
  missingWorktrees: missingWorktrees.map((m) => m.workspaceId),
584
+ deadOwnerReaped,
585
+ deadOwnerFlagged,
425
586
  reapEnabled,
426
587
  };
427
588
  }