agent-relay-server 0.11.6 → 0.11.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/docs/openapi.json +211 -1
- package/package.json +7 -5
- package/public/index.html +6012 -1098
- package/runner/src/adapter.ts +4 -0
- package/src/bus.ts +42 -0
- package/src/config-store.ts +58 -0
- package/src/config.ts +4 -0
- package/src/db.ts +224 -21
- package/src/maintenance.ts +394 -24
- package/src/routes.ts +223 -63
- package/src/runtime-tokens.ts +44 -1
- package/src/security.ts +17 -0
- package/src/steward.ts +117 -0
- package/src/workspace-merge.ts +108 -0
package/src/maintenance.ts
CHANGED
|
@@ -9,22 +9,31 @@ import {
|
|
|
9
9
|
createActivityEvent,
|
|
10
10
|
evaluatePoolBindings,
|
|
11
11
|
expireQueuedMessages,
|
|
12
|
+
getAgent,
|
|
12
13
|
getDb,
|
|
14
|
+
getRepoSteward,
|
|
15
|
+
getWorkspace,
|
|
13
16
|
listOrchestrators,
|
|
14
17
|
listWorkspaces,
|
|
18
|
+
patchWorkspaceMetadata,
|
|
15
19
|
pruneOfflineAgents,
|
|
16
20
|
pruneOldMessages,
|
|
17
21
|
deleteWorkspace,
|
|
18
22
|
pruneOrphanedSharedWorkspaces,
|
|
19
23
|
reapStaleAgents,
|
|
20
24
|
reapStaleOrchestrators,
|
|
25
|
+
reelectRepoSteward,
|
|
21
26
|
releaseExpiredClaims,
|
|
27
|
+
releaseExpiredMergeLeases,
|
|
22
28
|
releaseOrphanedTasks,
|
|
23
29
|
sendMessage,
|
|
24
30
|
sweepArtifacts,
|
|
25
31
|
updateWorkspaceStatus,
|
|
26
32
|
} from "./db";
|
|
27
33
|
import type { WorkspaceMergePreview, WorkspaceRecord, WorkspaceStatus } from "./types";
|
|
34
|
+
import { requestWorkspaceMerge } from "./workspace-merge";
|
|
35
|
+
import { getStewardConfig } from "./config-store";
|
|
36
|
+
import { ensureRepoSteward } from "./steward";
|
|
28
37
|
import { emitRelayEvent } from "./events";
|
|
29
38
|
import { getLifecycleManager } from "./lifecycle-manager";
|
|
30
39
|
import { applyCommandToRecipe } from "./recipe-runner";
|
|
@@ -49,10 +58,58 @@ const CONFLICT_SCAN_INTERVAL_MS = Number(process.env.AGENT_RELAY_CONFLICT_SCAN_I
|
|
|
49
58
|
const WORKSPACE_RETENTION_MS = Number(process.env.AGENT_RELAY_WORKSPACE_RETENTION_MS) || DAY_MS;
|
|
50
59
|
const WORKSPACE_REVIEW_TTL_MS = Number(process.env.AGENT_RELAY_WORKSPACE_REVIEW_TTL_MS) || 3 * DAY_MS;
|
|
51
60
|
const WORKSPACE_GC_INTERVAL_MS = Number(process.env.AGENT_RELAY_WORKSPACE_GC_INTERVAL_MS) || 60 * 60 * 1000;
|
|
61
|
+
// Deterministic auto-land (Layer 0): merge clean fast-forwards with no human in
|
|
62
|
+
// the loop. Default on for the seamless workflow; set AGENT_RELAY_WORKSPACE_AUTO_MERGE=0
|
|
63
|
+
// to require a manual or steward merge per repo. Read at call-time so operators can
|
|
64
|
+
// toggle it without a restart.
|
|
65
|
+
const WORKSPACE_AUTO_MERGE_INTERVAL_MS = Number(process.env.AGENT_RELAY_WORKSPACE_AUTO_MERGE_INTERVAL_MS) || CONFLICT_SCAN_INTERVAL_MS;
|
|
66
|
+
// Don't re-wake the managed steward for the same workspace more than once per
|
|
67
|
+
// this window — a persistent conflict/behind row would otherwise re-ping every sweep.
|
|
68
|
+
const STEWARD_WAKE_COOLDOWN_MS = Number(process.env.AGENT_RELAY_STEWARD_WAKE_COOLDOWN_MS) || 10 * 60 * 1000;
|
|
69
|
+
// How long a stranded review_requested/conflict worktree (no online steward) may
|
|
70
|
+
// sit before escalating to the configured fallback target, and the durable
|
|
71
|
+
// escalation target itself (`policy:<name>`, `label:<name>`, `cap:<name>`, an
|
|
72
|
+
// agent id, or `broadcast`). Read at call-time so config changes take effect
|
|
73
|
+
// without a restart (issue #157).
|
|
74
|
+
const stewardEscalationMs = () => Number(process.env.AGENT_RELAY_WORKSPACE_STEWARD_ESCALATION_MS) || 60 * 60 * 1000;
|
|
75
|
+
const stewardFallbackTarget = () => (process.env.AGENT_RELAY_WORKSPACE_STEWARD_FALLBACK || "").trim();
|
|
76
|
+
// Statuses that need an owner — a stranded one of these is what escalation rescues.
|
|
77
|
+
const STRANDABLE_STATUSES = new Set<WorkspaceStatus>(["review_requested", "conflict"]);
|
|
52
78
|
// Live statuses worth scanning. Terminal (cleaned/merged/abandoned) and
|
|
53
79
|
// in-flight (cleanup_requested) states are skipped.
|
|
54
80
|
const CONFLICT_SCAN_STATUSES = new Set<WorkspaceStatus>(["active", "ready", "review_requested", "merge_planned", "conflict"]);
|
|
55
81
|
const TERMINAL_WORKSPACE_STATUSES = new Set<WorkspaceStatus>(["cleaned", "merged", "abandoned"]);
|
|
82
|
+
// In-flight merge statuses that should reconcile to `merged` once the host
|
|
83
|
+
// reports the branch's work has landed in base (squash/cherry-pick, or a merged
|
|
84
|
+
// PR). Excludes active/ready: an agent still working may have landed an early
|
|
85
|
+
// commit while more work is in flight — don't yank its workspace out from under it.
|
|
86
|
+
const LANDED_RECONCILE_STATUSES = new Set<WorkspaceStatus>(["merge_planned", "review_requested", "conflict"]);
|
|
87
|
+
|
|
88
|
+
// Orphaned-session reaper. A spawned agent's process can outlive its relay
|
|
89
|
+
// presence: the relay agent goes offline/pruned but the orchestrator still
|
|
90
|
+
// reports the session's process running, so it lingers forever (visible under the
|
|
91
|
+
// orchestrator, gone from the Agents panel). Runtime-token self-heal recovers the
|
|
92
|
+
// recoverable ones; this is the backstop that stops the genuinely stuck ones.
|
|
93
|
+
// Conservative by design — a session must be observed continuously orphaned by
|
|
94
|
+
// THIS relay for the grace window before it is reaped, and the tracker is in-memory
|
|
95
|
+
// so a relay restart restarts the clock (giving self-heal first crack every time).
|
|
96
|
+
const ORPHAN_REAPER_INTERVAL_MS = Number(process.env.AGENT_RELAY_ORPHAN_REAPER_INTERVAL_MS) || 5 * 60 * 1000;
|
|
97
|
+
// Resolve a millisecond tunable from the environment. Intended to be called at
// use-time (not module load) so changes take effect without a restart and tests can
// tune the value. An explicit 0 is honored (immediate) — `|| default` would reject it.
//
// name:     environment variable to read.
// fallback: value returned when the variable is unset, blank (empty or
//           whitespace-only), non-numeric, non-finite, or negative.
// Returns the parsed non-negative number, or `fallback`.
const envMsOrDefault = (name: string, fallback: number): number => {
  const raw = process.env[name];
  // Number("") and Number("   ") both evaluate to 0, so a blank variable (e.g. `FOO=`
  // in an env file) would otherwise be mistaken for an explicit "0 ms" rather than
  // "not configured".
  if (raw === undefined || raw.trim() === "") return fallback;
  const v = Number(raw);
  return Number.isFinite(v) && v >= 0 ? v : fallback;
};
|
|
103
|
+
const orphanGraceMs = () => envMsOrDefault("AGENT_RELAY_ORPHAN_GRACE_MS", 30 * 60 * 1000);
|
|
104
|
+
const orphanReapCooldownMs = () => envMsOrDefault("AGENT_RELAY_ORPHAN_REAP_COOLDOWN_MS", 5 * 60 * 1000);
|
|
105
|
+
// Set AGENT_RELAY_ORPHAN_REAP=0 to detect + log orphans but never stop them.
|
|
106
|
+
const orphanReapEnabled = () => process.env.AGENT_RELAY_ORPHAN_REAP !== "0";
|
|
107
|
+
// orchestratorId + session identity -> when we first saw it orphaned (and last reaped).
|
|
108
|
+
const orphanTracker = new Map<string, { firstOrphanedAt: number; lastReapAt?: number }>();
|
|
109
|
+
|
|
110
|
+
// Test hook: empties the in-memory orphan tracker so no previously recorded
// first-orphaned / last-reaped timestamps carry over.
export function resetOrphanTrackerForTests(): void {
|
|
111
|
+
orphanTracker.clear();
|
|
112
|
+
}
|
|
56
113
|
|
|
57
114
|
interface MaintenanceJobDefinition {
|
|
58
115
|
id: string;
|
|
@@ -205,6 +262,14 @@ const definitions: MaintenanceJobDefinition[] = [
|
|
|
205
262
|
return { prunedAgentIds };
|
|
206
263
|
},
|
|
207
264
|
},
|
|
265
|
+
{
|
|
266
|
+
id: "orphaned-session-reaper",
|
|
267
|
+
title: "Orphaned session reaper",
|
|
268
|
+
description: "Stop spawned sessions whose relay agent is offline/gone but whose process the orchestrator still reports running, after a grace period for self-heal.",
|
|
269
|
+
intervalMs: ORPHAN_REAPER_INTERVAL_MS,
|
|
270
|
+
runOnStart: false,
|
|
271
|
+
handler: reapOrphanedSessions,
|
|
272
|
+
},
|
|
208
273
|
{
|
|
209
274
|
id: "orchestrator-reaper",
|
|
210
275
|
title: "Orchestrator reaper",
|
|
@@ -305,6 +370,15 @@ const definitions: MaintenanceJobDefinition[] = [
|
|
|
305
370
|
timeoutMs: 60 * 1000,
|
|
306
371
|
handler: scanWorkspaceConflicts,
|
|
307
372
|
},
|
|
373
|
+
{
|
|
374
|
+
id: "workspace-auto-merge",
|
|
375
|
+
title: "Workspace auto-merge",
|
|
376
|
+
description: "Auto-merge clean fast-forward review_requested worktrees into base under the per-repo lease; conflicts and diverged bases are left for the steward.",
|
|
377
|
+
intervalMs: WORKSPACE_AUTO_MERGE_INTERVAL_MS,
|
|
378
|
+
runOnStart: false,
|
|
379
|
+
timeoutMs: 60 * 1000,
|
|
380
|
+
handler: autoMergeCleanFastForwards,
|
|
381
|
+
},
|
|
308
382
|
{
|
|
309
383
|
id: "workspace-gc",
|
|
310
384
|
title: "Workspace GC",
|
|
@@ -323,7 +397,7 @@ function workspacePathWithinBase(path: string | undefined, baseDir: string | und
|
|
|
323
397
|
}
|
|
324
398
|
|
|
325
399
|
async function fetchHostMergePreview(apiUrl: string, workspace: WorkspaceRecord): Promise<WorkspaceMergePreview | { available: false } | null> {
|
|
326
|
-
const query = new URLSearchParams({ path: workspace.worktreePath });
|
|
400
|
+
const query = new URLSearchParams({ path: workspace.worktreePath, checkPr: "1" });
|
|
327
401
|
if (workspace.baseRef) query.set("baseRef", workspace.baseRef);
|
|
328
402
|
if (workspace.baseSha) query.set("baseSha", workspace.baseSha);
|
|
329
403
|
const headers: Record<string, string> = {};
|
|
@@ -342,6 +416,118 @@ async function fetchHostMergePreview(apiUrl: string, workspace: WorkspaceRecord)
|
|
|
342
416
|
// cleanly. Auto-flag `conflict` when a clean merge is no longer possible, and
|
|
343
417
|
// auto-clear conflicts we set ourselves once they resolve (restoring the prior
|
|
344
418
|
// status). Human-set conflicts are never cleared.
|
|
419
|
+
// Backstop for spawned sessions that outlived their relay presence. Walks the
// managedAgents reported by every online orchestrator (so policy-managed and
// dashboard/ad-hoc spawns are handled alike) and counts a session as orphaned when
// its relay agent record is missing or has status "offline"; any other status
// (e.g. "stale") is treated as alive.
//
// An orphan is only stopped once it has been tracked for the grace window, reaping
// is enabled, and the per-session reap cooldown has elapsed. Stopping means emitting
// a non-graceful `agent.shutdown` command targeted at the orchestrator's agent, plus
// an activity event.
//
// Returns job counters: { orphaned, reaped, tracked, reapEnabled }.
function reapOrphanedSessions(): Record<string, unknown> {
  const startedAt = Date.now();
  const graceMs = orphanGraceMs();
  const cooldownMs = orphanReapCooldownMs();
  const reapEnabled = orphanReapEnabled();
  const orphanKeys = new Set<string>();
  const reaped: string[] = [];
  let orphaned = 0;

  // Offline orchestrators are skipped: their report can't be trusted and the stop
  // command couldn't be delivered anyway.
  const onlineOrchestrators = listOrchestrators().filter((candidate) => candidate.status === "online");

  for (const orchestrator of onlineOrchestrators) {
    for (const managed of orchestrator.managedAgents) {
      const sessionId = managed.spawnRequestId || managed.tmuxSession || managed.sessionName || managed.agentId;
      if (!sessionId) continue;

      const trackerKey = `${orchestrator.id}:${sessionId}`;
      const relayAgent = managed.agentId ? getAgent(managed.agentId) : null;
      const hasLiveRelayAgent = relayAgent ? relayAgent.status !== "offline" : false;
      if (hasLiveRelayAgent) {
        // Recovered (or never orphaned): drop any running grace clock.
        orphanTracker.delete(trackerKey);
        continue;
      }

      orphanKeys.add(trackerKey);
      orphaned += 1;

      let entry = orphanTracker.get(trackerKey);
      if (entry === undefined) {
        entry = { firstOrphanedAt: startedAt };
        orphanTracker.set(trackerKey, entry);
      }

      const orphanAgeMs = startedAt - entry.firstOrphanedAt;
      // Still inside the self-heal window, or running in detect-only mode.
      if (orphanAgeMs < graceMs || !reapEnabled) continue;
      // A shutdown was already requested recently — don't spam the orchestrator.
      if (entry.lastReapAt && startedAt - entry.lastReapAt < cooldownMs) continue;

      entry.lastReapAt = startedAt;
      const displayName = managed.label ?? managed.agentId ?? sessionId;

      const command = createCommand({
        type: "agent.shutdown",
        source: "system",
        target: orchestrator.agentId,
        correlationId: managed.spawnRequestId,
        params: {
          action: "shutdown",
          agentId: managed.agentId,
          spawnRequestId: managed.spawnRequestId,
          sessionName: managed.sessionName,
          tmuxSession: managed.tmuxSession,
          policyName: managed.policyName,
          graceful: false,
          timeoutMs: 10_000,
          reason: "orphaned-session-reaper",
          requestedBy: "orphaned-session-reaper",
          requestedAt: startedAt,
          orchestratorId: orchestrator.id,
        },
      });
      emitRelayEvent({ type: "command.requested", source: "system", subject: command.id, data: { command } });

      createActivityEvent({
        clientId: `orphaned-session-reaper-${trackerKey}-${startedAt}`,
        kind: "state",
        title: "Orphaned session reaped",
        body: `${displayName}: process still running on ${orchestrator.id}, but its relay agent has been offline > ${Math.round(graceMs / 60000)}m and did not self-heal — stopping it`,
        meta: displayName,
        icon: "ti-ghost",
        view: "orchestrators",
        agentId: managed.agentId || undefined,
        metadata: {
          source: "server",
          maintenanceJobId: "orphaned-session-reaper",
          orchestratorId: orchestrator.id,
          agentId: managed.agentId,
          spawnRequestId: managed.spawnRequestId,
          tmuxSession: managed.tmuxSession,
          commandId: command.id,
          orphanAgeMs,
        },
      });
      reaped.push(trackerKey);
    }
  }

  // Anything tracked but not seen orphaned on this pass has recovered or is no
  // longer reported; forget it so a later orphaning starts a fresh grace window.
  for (const trackedKey of orphanTracker.keys()) {
    if (orphanKeys.has(trackedKey)) continue;
    orphanTracker.delete(trackedKey);
  }

  return { orphaned, reaped, tracked: orphanTracker.size, reapEnabled };
}
|
|
501
|
+
|
|
502
|
+
// Hand a workspace to the managed per-repo steward (issue #167).
//
// Steps: honor the per-workspace cooldown recorded in `metadata.stewardWokenAt`,
// resolve the steward policy for the repo via ensureRepoSteward, queue a system
// message addressed to `policy:<name>`, notify the lifecycle manager that the policy
// has a message, then stamp the wake time and policy name on the workspace metadata.
//
// ws:     workspace that needs steward attention.
// reason: short explanation of why it could not auto-land; embedded in the message.
// Returns the steward policy name on a fresh wake. Returns null when the workspace is
// still cooling down, when no policy could be resolved, or when sending/waking threw
// (best-effort: failures are swallowed so the calling sweep keeps going).
function wakeRepoSteward(ws: WorkspaceRecord, reason: string): string | null {
  const metadata = ws.metadata as Record<string, unknown>;
  const previousWakeAt = typeof metadata.stewardWokenAt === "number" ? metadata.stewardWokenAt : 0;
  if (previousWakeAt && Date.now() - previousWakeAt < STEWARD_WAKE_COOLDOWN_MS) {
    // A persistent conflict/behind row must not re-ping the steward every sweep.
    return null;
  }

  const policyName = ensureRepoSteward(ws.repoRoot);
  if (!policyName) return null;

  try {
    const wakeMessage = sendMessage({
      from: "system",
      to: `policy:${policyName}`,
      kind: "system",
      subject: `Steward: ${ws.status} workspace needs attention`,
      body: `Workspace \`${ws.branch ?? ws.id}\` in ${ws.repoRoot} is ${ws.status} and could not auto-land (${reason}). cd into ${ws.worktreePath}, rebase onto ${ws.baseRef ?? "base"}, resolve, run checks, then land it via POST /api/workspaces/${ws.id}/actions {"action":"merge","strategy":"rebase-ff"} — or escalate if you can't.`,
      payload: {
        kind: "workspace.steward-task",
        workspaceId: ws.id,
        repoRoot: ws.repoRoot,
        worktreePath: ws.worktreePath,
        branch: ws.branch,
        baseRef: ws.baseRef,
        status: ws.status,
        reason,
      },
    });
    emitNewMessage(wakeMessage);
    getLifecycleManager().onMessageForPolicy(policyName);
    patchWorkspaceMetadata(ws.id, { stewardWokenAt: Date.now(), stewardPolicy: policyName });
  } catch {
    return null;
  }
  return policyName;
}
|
|
530
|
+
|
|
345
531
|
async function scanWorkspaceConflicts(): Promise<Record<string, unknown>> {
|
|
346
532
|
const orchestrators = listOrchestrators().filter((orch) => orch.status === "online" && orch.apiUrl);
|
|
347
533
|
if (!orchestrators.length) return { scanned: 0, skipped: "no online orchestrators" };
|
|
@@ -351,6 +537,7 @@ async function scanWorkspaceConflicts(): Promise<Record<string, unknown>> {
|
|
|
351
537
|
);
|
|
352
538
|
const flagged: string[] = [];
|
|
353
539
|
const cleared: string[] = [];
|
|
540
|
+
const merged: string[] = [];
|
|
354
541
|
const notifiedStewards: string[] = [];
|
|
355
542
|
|
|
356
543
|
for (const ws of candidates) {
|
|
@@ -362,6 +549,37 @@ async function scanWorkspaceConflicts(): Promise<Record<string, unknown>> {
|
|
|
362
549
|
if (p.error || p.missing || p.conflict === undefined) continue;
|
|
363
550
|
|
|
364
551
|
const meta = ws.metadata as Record<string, unknown>;
|
|
552
|
+
|
|
553
|
+
// Landing wins over everything else. Once the work is in base — whether the
|
|
554
|
+
// PR was squash/cherry-pick merged on GitHub or fast-forwarded locally — the
|
|
555
|
+
// workspace is done, even if `git merge-tree` still predicts a textual
|
|
556
|
+
// conflict against the now-moved base (a PR-strategy row sits at
|
|
557
|
+
// merge_planned forever otherwise, and the conflict scan can even pin a
|
|
558
|
+
// landed branch to `conflict`). Reconcile to the terminal `merged` status so
|
|
559
|
+
// the dashboard stops showing it as unmerged and GC prunes it on schedule.
|
|
560
|
+
const landed = p.landed === true || p.prMerged === true;
|
|
561
|
+
if (landed && LANDED_RECONCILE_STATUSES.has(ws.status)) {
|
|
562
|
+
updateWorkspaceStatus(ws.id, "merged", {
|
|
563
|
+
autoMerged: true,
|
|
564
|
+
mergedFromStatus: ws.status,
|
|
565
|
+
landedDetectedAt: Date.now(),
|
|
566
|
+
landedVia: p.prMerged === true ? "pr" : "git",
|
|
567
|
+
autoConflict: false,
|
|
568
|
+
});
|
|
569
|
+
merged.push(ws.id);
|
|
570
|
+
createActivityEvent({
|
|
571
|
+
clientId: "server-workspace-" + ws.id + "-merged-" + Date.now(),
|
|
572
|
+
kind: "state",
|
|
573
|
+
title: "Workspace work landed in base",
|
|
574
|
+
body: `${ws.branch ?? ws.id} is ${p.prMerged === true ? "merged on the remote (PR)" : "already merged into base"} ${p.baseRef ? `(${p.baseRef})` : ""} — marking merged`,
|
|
575
|
+
meta: ws.branch ?? ws.id,
|
|
576
|
+
icon: "ti-git-merge",
|
|
577
|
+
view: "orchestrators",
|
|
578
|
+
metadata: { source: "server", maintenanceJobId: "workspace-conflict-scan", workspaceId: ws.id, fromStatus: ws.status },
|
|
579
|
+
});
|
|
580
|
+
continue;
|
|
581
|
+
}
|
|
582
|
+
|
|
365
583
|
if (p.conflict === true && ws.status !== "conflict") {
|
|
366
584
|
updateWorkspaceStatus(ws.id, "conflict", {
|
|
367
585
|
autoConflict: true,
|
|
@@ -382,10 +600,15 @@ async function scanWorkspaceConflicts(): Promise<Record<string, unknown>> {
|
|
|
382
600
|
view: "orchestrators",
|
|
383
601
|
metadata: { source: "server", maintenanceJobId: "workspace-conflict-scan", workspaceId: ws.id, ahead: p.ahead, behind: p.behind },
|
|
384
602
|
});
|
|
385
|
-
//
|
|
386
|
-
//
|
|
387
|
-
//
|
|
388
|
-
|
|
603
|
+
// Hand the conflict to a steward so it gets resolved instead of rotting
|
|
604
|
+
// until merge time. Once-per-onset (we only enter this branch on the
|
|
605
|
+
// active→conflict transition). When managed stewards are enabled, wake the
|
|
606
|
+
// auto-provisioned per-repo steward agent (#167); otherwise fall back to the
|
|
607
|
+
// legacy direct ping of the elected steward agent.
|
|
608
|
+
if (getStewardConfig().enabled) {
|
|
609
|
+
const woke = wakeRepoSteward(getWorkspace(ws.id) ?? ws, "conflict");
|
|
610
|
+
if (woke) notifiedStewards.push(woke);
|
|
611
|
+
} else if (ws.stewardAgentId) {
|
|
389
612
|
try {
|
|
390
613
|
const msg = sendMessage({
|
|
391
614
|
from: "system",
|
|
@@ -410,7 +633,86 @@ async function scanWorkspaceConflicts(): Promise<Record<string, unknown>> {
|
|
|
410
633
|
}
|
|
411
634
|
}
|
|
412
635
|
|
|
413
|
-
return { scanned: candidates.length, flagged, cleared, notifiedStewards };
|
|
636
|
+
return { scanned: candidates.length, flagged, cleared, merged, notifiedStewards };
|
|
637
|
+
}
|
|
638
|
+
|
|
639
|
+
// Deterministic auto-land (Layer 0, issue #167). Walks the "ready to land" queue
// (`review_requested` isolated worktrees) and, for any whose work is a strict clean
// fast-forward (host reports cleanFastForward, no conflict, base hasn't moved, real
// commits ahead), requests a `rebase-ff` merge through requestWorkspaceMerge and
// emits the resulting command. No agent is involved for this easy case.
//
// Everything else is deliberately NOT auto-landed and is recorded in
// `leftForSteward`. When managed stewards are enabled the per-repo steward is woken
// (cooldown-guarded by wakeRepoSteward) with a reason that reflects what actually
// blocked the fast-forward, so it is not told to resolve a conflict that does not
// exist.
//
// Set AGENT_RELAY_WORKSPACE_AUTO_MERGE=0 to disable; the flag is read on every run.
//
// Returns job counters: { scanned, merged, heldByLease, leftForSteward, wokeStewards },
// or { skipped } when disabled / no online orchestrator exposes an apiUrl.
async function autoMergeCleanFastForwards(): Promise<Record<string, unknown>> {
  if (process.env.AGENT_RELAY_WORKSPACE_AUTO_MERGE === "0") return { skipped: "disabled" };
  const orchestrators = listOrchestrators().filter((orch) => orch.status === "online" && orch.apiUrl);
  if (!orchestrators.length) return { scanned: 0, skipped: "no online orchestrators" };

  const candidates = listWorkspaces().filter(
    (ws) => ws.mode === "isolated" && Boolean(ws.worktreePath) && ws.status === "review_requested",
  );
  const stewardEnabled = getStewardConfig().enabled;
  const merged: string[] = [];
  const heldByLease: string[] = [];
  const leftForSteward: string[] = [];
  const wokeStewards: string[] = [];

  for (const ws of candidates) {
    // Only the orchestrator whose base directory contains the workspace's source
    // cwd is asked for a preview.
    const orch = orchestrators.find((candidate) => workspacePathWithinBase(ws.sourceCwd, candidate.baseDir));
    if (!orch?.apiUrl) continue;
    const preview = await fetchHostMergePreview(orch.apiUrl, ws);
    if (!preview || (preview as { available?: false }).available === false) continue;
    const p = preview as WorkspaceMergePreview;
    if (p.error || p.missing) continue;

    const ahead = p.unmergedAhead ?? p.ahead ?? 0;
    const behind = p.behind ?? 0;
    const cleanFF = p.cleanFastForward === true && p.conflict !== true && behind === 0 && ahead > 0;
    if (!cleanFF) {
      // Needs reasoning/rebase, which is the steward's job. Wake the managed steward
      // when enabled; otherwise leave it for conflict-scan's legacy ping / human review.
      leftForSteward.push(ws.id);
      if (stewardEnabled) {
        // Report the real blocker. Previously every non-"behind" case was labelled
        // "conflict", including workspaces with nothing ahead of base.
        let reason: string;
        if (behind > 0) reason = "base moved on (behind>0)";
        else if (p.conflict === true) reason = "conflict";
        else if (ahead <= 0) reason = "no unmerged commits ahead of base";
        else reason = "not a clean fast-forward";
        const woke = wakeRepoSteward(ws, reason);
        if (woke) wokeStewards.push(woke);
      }
      continue;
    }

    const result = requestWorkspaceMerge(ws, { strategy: "rebase-ff", requestedBy: "auto-merge" });
    if (!result.ok) {
      // 409 = another merge holds this repo's lease this tick; retry next sweep.
      heldByLease.push(ws.id);
      continue;
    }
    emitCommand(result.command);
    merged.push(ws.id);
    createActivityEvent({
      clientId: `workspace-auto-merge-${ws.id}-${Date.now()}`,
      kind: "state",
      title: "Workspace auto-merging (clean fast-forward)",
      body: `${ws.branch ?? ws.id} → ${p.baseRef ?? "base"} (${ahead} ahead, clean)`,
      meta: ws.branch ?? ws.id,
      icon: "ti-git-merge",
      view: "orchestrators",
      metadata: { source: "server", maintenanceJobId: "workspace-auto-merge", workspaceId: ws.id, commandId: result.command.id, ahead },
    });
  }

  return { scanned: candidates.length, merged, heldByLease, leftForSteward, wokeStewards };
}
|
|
705
|
+
|
|
706
|
+
// Best-effort system DM. Builds a `system` message from "system" to `target`, sends
// it and emits the new-message notification. Any exception is swallowed — a
// stale/missing/misconfigured target must never break the GC sweep.
//
// target:  recipient; an empty string means "nobody" and short-circuits.
// subject/body/payload: forwarded unchanged to sendMessage.
// Returns `target` when the message was sent and emitted, otherwise null.
function notifyTarget(target: string, subject: string, body: string, payload: Record<string, unknown>): string | null {
  if (target) {
    try {
      const message = sendMessage({ from: "system", to: target, kind: "system", subject, body, payload });
      emitNewMessage(message);
      return target;
    } catch {
      // Deliberately ignored; falls through to the null result below.
    }
  }
  return null;
}
|
|
415
717
|
|
|
416
718
|
async function workspaceGC(): Promise<Record<string, unknown>> {
|
|
@@ -418,6 +720,10 @@ async function workspaceGC(): Promise<Record<string, unknown>> {
|
|
|
418
720
|
const cutoff = now - WORKSPACE_RETENTION_MS;
|
|
419
721
|
const reviewCutoff = now - WORKSPACE_REVIEW_TTL_MS;
|
|
420
722
|
|
|
723
|
+
// 0. Free any merge leases whose holder never reported back (orchestrator died
|
|
724
|
+
// mid-merge). The lease TTL is the safety net; this just reclaims them eagerly.
|
|
725
|
+
const releasedLeaseRepos = releaseExpiredMergeLeases(now);
|
|
726
|
+
|
|
421
727
|
// 1. Prune terminal rows past retention
|
|
422
728
|
const all = listWorkspaces();
|
|
423
729
|
const terminalIds: string[] = [];
|
|
@@ -428,29 +734,84 @@ async function workspaceGC(): Promise<Record<string, unknown>> {
|
|
|
428
734
|
}
|
|
429
735
|
}
|
|
430
736
|
|
|
431
|
-
// 2.
|
|
737
|
+
// 2. Rescue stranded review_requested/conflict worktrees (issue #157). A
|
|
738
|
+
// worktree is "stranded" when its steward is gone (all repo agents offline).
|
|
739
|
+
// Re-elect first — an agent may have rejoined — and hand off to the new
|
|
740
|
+
// steward; if none can be elected past the TTL, escalate to the fallback
|
|
741
|
+
// target so it never rots in silence. Bookkeeping uses patchWorkspaceMetadata
|
|
742
|
+
// (no updated_at bump) so the auto-abandon clock below keeps ticking.
|
|
743
|
+
const escalatedIds: string[] = [];
|
|
744
|
+
const reassignedIds: string[] = [];
|
|
745
|
+
const escalationTargets: string[] = [];
|
|
746
|
+
const escalationMs = stewardEscalationMs();
|
|
747
|
+
const fallbackTarget = stewardFallbackTarget();
|
|
748
|
+
for (const ws of all) {
|
|
749
|
+
if (!STRANDABLE_STATUSES.has(ws.status) || ws.mode !== "isolated" || !ws.worktreePath) continue;
|
|
750
|
+
reelectRepoSteward(ws.repoRoot);
|
|
751
|
+
const fresh = getWorkspace(ws.id);
|
|
752
|
+
if (!fresh || !STRANDABLE_STATUSES.has(fresh.status)) continue;
|
|
753
|
+
const meta = fresh.metadata as Record<string, unknown>;
|
|
754
|
+
const steward = fresh.stewardAgentId;
|
|
755
|
+
const stewardOnline = Boolean(steward && getAgent(steward) && getAgent(steward)!.status !== "offline");
|
|
756
|
+
const strandedAt = typeof meta.strandedAt === "number" ? meta.strandedAt : undefined;
|
|
757
|
+
|
|
758
|
+
if (stewardOnline) {
|
|
759
|
+
// An online steward owns it. If it was previously stranded and this
|
|
760
|
+
// steward hasn't been told, hand it off explicitly, then clear markers.
|
|
761
|
+
if (strandedAt !== undefined && meta.strandedNotifiedSteward !== steward) {
|
|
762
|
+
const sent = notifyTarget(
|
|
763
|
+
steward!,
|
|
764
|
+
"Workspace stewardship reassigned",
|
|
765
|
+
`You are now steward for ${fresh.repoRoot}. Workspace \`${fresh.branch ?? fresh.id}\` is ${fresh.status} and was stranded without an online steward — please coordinate ${fresh.status === "conflict" ? "conflict resolution" : "review/merge"}.`,
|
|
766
|
+
{ kind: "workspace.steward-reassigned", workspaceId: fresh.id, repoRoot: fresh.repoRoot, branch: fresh.branch, status: fresh.status },
|
|
767
|
+
);
|
|
768
|
+
if (sent) reassignedIds.push(fresh.id);
|
|
769
|
+
}
|
|
770
|
+
patchWorkspaceMetadata(fresh.id, { strandedAt: undefined, escalatedAt: undefined, strandedNotifiedSteward: steward });
|
|
771
|
+
continue;
|
|
772
|
+
}
|
|
773
|
+
|
|
774
|
+
// Stranded: no online steward could be elected.
|
|
775
|
+
if (strandedAt === undefined) { patchWorkspaceMetadata(fresh.id, { strandedAt: now }); continue; }
|
|
776
|
+
if (now - strandedAt < escalationMs || meta.escalatedAt) continue;
|
|
777
|
+
const sent = notifyTarget(
|
|
778
|
+
fallbackTarget,
|
|
779
|
+
"Stranded workspace needs an owner",
|
|
780
|
+
`Workspace \`${fresh.branch ?? fresh.id}\` in ${fresh.repoRoot} is ${fresh.status} with no online steward (all repo agents offline) for ${Math.round((now - strandedAt) / (60 * 60 * 1000))}h. Please coordinate ${fresh.status === "conflict" ? "conflict resolution" : "review/merge"} or clean up the worktree.`,
|
|
781
|
+
{ kind: "workspace.stranded-escalation", workspaceId: fresh.id, repoRoot: fresh.repoRoot, branch: fresh.branch, status: fresh.status, strandedAt },
|
|
782
|
+
);
|
|
783
|
+
if (sent) escalationTargets.push(sent);
|
|
784
|
+
patchWorkspaceMetadata(fresh.id, { escalatedAt: now });
|
|
785
|
+
escalatedIds.push(fresh.id);
|
|
786
|
+
createActivityEvent({
|
|
787
|
+
clientId: `workspace-gc-escalate-${fresh.id}-${now}`,
|
|
788
|
+
kind: "state",
|
|
789
|
+
title: "Workspace escalated",
|
|
790
|
+
body: `${fresh.branch ?? fresh.id} in ${fresh.repoRoot} — stranded ${fresh.status} escalated${fallbackTarget ? ` to ${fallbackTarget}` : " (no fallback configured)"}`,
|
|
791
|
+
meta: fresh.branch ?? fresh.id,
|
|
792
|
+
icon: "ti-alert-octagon",
|
|
793
|
+
view: "orchestrators",
|
|
794
|
+
metadata: { source: "server", maintenanceJobId: "workspace-gc", workspaceId: fresh.id, fallback: fallbackTarget || null },
|
|
795
|
+
});
|
|
796
|
+
}
|
|
797
|
+
|
|
798
|
+
// 3. Auto-abandon stale review_requested worktrees
|
|
432
799
|
const abandonedIds: string[] = [];
|
|
433
800
|
const notifiedStewards: string[] = [];
|
|
434
801
|
for (const ws of all) {
|
|
435
802
|
if (ws.status === "review_requested" && ws.updatedAt < reviewCutoff) {
|
|
436
803
|
updateWorkspaceStatus(ws.id, "abandoned", { autoAbandoned: true, abandonedReason: "review_requested TTL exceeded", abandonedAt: now });
|
|
437
804
|
abandonedIds.push(ws.id);
|
|
438
|
-
if
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
emitNewMessage(msg);
|
|
449
|
-
notifiedStewards.push(ws.stewardAgentId);
|
|
450
|
-
} catch {
|
|
451
|
-
// Steward gone — activity event is enough.
|
|
452
|
-
}
|
|
453
|
-
}
|
|
805
|
+
// Notify the steward if one exists, else the configured fallback so a
|
|
806
|
+
// stranded abandon isn't silent (issue #157).
|
|
807
|
+
const target = ws.stewardAgentId ?? fallbackTarget;
|
|
808
|
+
const sent = notifyTarget(
|
|
809
|
+
target,
|
|
810
|
+
"Workspace auto-abandoned",
|
|
811
|
+
`Workspace \`${ws.branch ?? ws.id}\` in ${ws.repoRoot} was auto-abandoned after ${Math.round(WORKSPACE_REVIEW_TTL_MS / DAY_MS)}d without steward action. Run workspace cleanup to reclaim the worktree.`,
|
|
812
|
+
{ kind: "workspace.auto-abandoned", workspaceId: ws.id, repoRoot: ws.repoRoot, branch: ws.branch },
|
|
813
|
+
);
|
|
814
|
+
if (sent) notifiedStewards.push(sent);
|
|
454
815
|
createActivityEvent({
|
|
455
816
|
clientId: `workspace-gc-abandon-${ws.id}-${now}`,
|
|
456
817
|
kind: "state",
|
|
@@ -483,7 +844,16 @@ async function workspaceGC(): Promise<Record<string, unknown>> {
|
|
|
483
844
|
pruneCommands.push(command.id);
|
|
484
845
|
}
|
|
485
846
|
|
|
486
|
-
return {
|
|
847
|
+
return {
|
|
848
|
+
prunedTerminal: terminalIds,
|
|
849
|
+
autoAbandoned: abandonedIds,
|
|
850
|
+
notifiedStewards,
|
|
851
|
+
pruneCommands,
|
|
852
|
+
releasedLeaseRepos,
|
|
853
|
+
escalated: escalatedIds,
|
|
854
|
+
reassigned: reassignedIds,
|
|
855
|
+
escalationTargets,
|
|
856
|
+
};
|
|
487
857
|
}
|
|
488
858
|
|
|
489
859
|
let timer: Timer | null = null;
|