@desplega.ai/agent-swarm 1.100.2 → 1.100.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "Agent Swarm API",
5
- "version": "1.100.2",
5
+ "version": "1.100.4",
6
6
  "description": "Multi-agent orchestration API for Claude Code, Codex, and Gemini CLI. Enables task distribution, agent communication, and service discovery.\n\nMCP tools are documented separately in [MCP.md](./MCP.md)."
7
7
  },
8
8
  "servers": [
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@desplega.ai/agent-swarm",
3
- "version": "1.100.2",
3
+ "version": "1.100.4",
4
4
  "description": "Multi-agent orchestration for Claude Code, Codex, Gemini CLI, and other AI coding assistants",
5
5
  "license": "MIT",
6
6
  "author": "desplega.sh <contact@desplega.sh>",
package/src/be/db.ts CHANGED
@@ -753,8 +753,13 @@ export function getAllAgents(opts?: { slim?: boolean }): Agent[] {
753
753
  }
754
754
 
755
755
  export function getLeadAgent(): Agent | null {
756
- const agents = getAllAgents();
757
- return agents.find((a) => a.isLead) ?? null;
756
+ const leads = getAllAgents().filter((a) => a.isLead);
757
+ // Prefer a usable (non-offline) lead so callers route to one that can actually
758
+ // poll — e.g. an old offline lead must not shadow a live replacement. Falls
759
+ // back to any lead (incl. offline) so existing "is there a lead at all?"
760
+ // semantics are preserved; callers that require a live lead must check
761
+ // `status` themselves (see escalateUnreclaimedResumes).
762
+ return leads.find((a) => a.status !== "offline") ?? leads[0] ?? null;
758
763
  }
759
764
 
760
765
  export function updateAgentStatus(id: string, status: AgentStatus): Agent | null {
@@ -1444,6 +1449,31 @@ export function hasNonTerminalResumeChild(parentId: string): boolean {
1444
1449
  return row !== undefined && row !== null;
1445
1450
  }
1446
1451
 
1452
/**
 * Reports whether `parentId` currently has an open `reroute-decision` child.
 *
 * "Open" means the child's `status` is none of `completed`, `failed`,
 * `cancelled` or `superseded`. The query matches on
 * `taskType = 'reroute-decision'` specifically rather than on any child, so
 * ordinary delegation or follow-up children of the same parent neither
 * suppress a needed decision nor get mistaken for one. Callers use this to
 * keep escalation idempotent: a later sweep must not create a second decision
 * for the same original task (DES-523).
 *
 * @param parentId - `parentTaskId` of the original task to check.
 * @returns `true` when at least one such child row exists.
 */
export function hasNonTerminalRerouteDecisionChild(parentId: string): boolean {
  const openDecisionQuery = getDb().prepare(
    `SELECT 1 FROM agent_tasks
     WHERE parentTaskId = ?
       AND taskType = 'reroute-decision'
       AND status NOT IN ('completed', 'failed', 'cancelled', 'superseded')
     LIMIT 1`,
  );
  const match = openDecisionQuery.get(parentId);
  // `== null` style check covers both "no row" representations (undefined / null).
  return match != null;
}
1476
+
1447
1477
  export function updateTaskClaudeSessionId(
1448
1478
  taskId: string,
1449
1479
  claudeSessionId: string,
@@ -2949,6 +2979,14 @@ export interface CreateTaskOptions {
2949
2979
  * a schema'd task should be defensive about JSON parsing.
2950
2980
  */
2951
2981
  outputSchema?: Record<string, unknown>;
2982
+ /**
2983
+ * When a `parentTaskId` is set, the child inherits the parent's `outputSchema`
2984
+ * by default. Set this to `false` to opt out — used by control-plane children
2985
+ * (e.g. the Lead `reroute-decision` task) that must inherit Slack/VCS context
2986
+ * from the parent but must NOT be forced to satisfy the original work's output
2987
+ * contract on completion (which would block the control task — DES-523).
2988
+ */
2989
+ inheritParentOutputSchema?: boolean;
2952
2990
  followUpConfig?: FollowUpConfig;
2953
2991
  requestedByUserId?: string;
2954
2992
  contextKey?: string;
@@ -3101,8 +3139,15 @@ export function createTaskExtended(task: string, options?: CreateTaskOptions): A
3101
3139
 
3102
3140
  // Contract (schema validation) — `store-progress` validates completion
3103
3141
  // output against `outputSchema`, runner injects structured-output
3104
- // instructions only when it's present.
3105
- if (parent.outputSchema && !options.outputSchema) {
3142
+ // instructions only when it's present. Opt-out via
3143
+ // `inheritParentOutputSchema: false` for control-plane children (e.g. the
3144
+ // Lead reroute-decision) that must not be held to the original work's
3145
+ // output contract.
3146
+ if (
3147
+ parent.outputSchema &&
3148
+ !options.outputSchema &&
3149
+ options.inheritParentOutputSchema !== false
3150
+ ) {
3106
3151
  options.outputSchema = parent.outputSchema;
3107
3152
  }
3108
3153
 
@@ -6506,6 +6551,88 @@ export function getStalledInProgressTasks(thresholdMinutes: number = 30): AgentT
6506
6551
  .map(rowToAgentTask);
6507
6552
  }
6508
6553
 
6554
/**
 * Lists crash-recovery resumes pinned to their original agent (tagged
 * `crash-recovery-pin`, DES-523 Phase 1) that are still `pending` more than
 * `graceMin` minutes after they were created. The heartbeat reaper escalates
 * these to a Lead reroute-decision.
 *
 * The three filters each matter:
 * - `tags LIKE '%"crash-recovery-pin"%'` limits the set to resumes actually
 *   pinned on the crash path. Without it, a pooled resume flipped to `pending`
 *   by auto-assignment earlier in the same sweep (with its old `createdAt`)
 *   could be reaped before the assigned worker polls, and pins created for
 *   other supersede reasons would be escalated under a crash-recovery label.
 *   The literal must stay in sync with `CRASH_RECOVERY_PIN_TAG` in
 *   src/tasks/worker-follow-up.ts.
 * - `status = 'pending'` is the "still unreclaimed" discriminator: once the
 *   agent starts the task it leaves `pending` and drops out of this set. A row
 *   later reset to `pending` re-enters it on a subsequent sweep, which is the
 *   intended re-escalation of re-stalled work. `lastActivityAt` is
 *   deliberately not consulted.
 * - `createdAt < cutoff` measures the grace window from the resume's creation,
 *   i.e. from crash detection.
 *
 * Only persisted columns are used, so a pending pin is still found on the
 * first sweep after a server reboot.
 *
 * @param graceMin - Grace window in minutes; must be a finite number, since
 *   the cutoff is built with `new Date(...).toISOString()`.
 * @returns Matching tasks, oldest `createdAt` first.
 */
export function getStalePinnedResumes(graceMin: number): AgentTask[] {
  const graceMs = graceMin * 60 * 1000;
  const cutoff = new Date(Date.now() - graceMs).toISOString();

  const stalePinQuery = getDb().prepare<AgentTaskRow, [string]>(
    `SELECT * FROM agent_tasks
     WHERE taskType = 'resume' AND status = 'pending'
       AND tags LIKE '%"crash-recovery-pin"%'
       AND createdAt < ?
     ORDER BY createdAt ASC`,
  );

  const rows = stalePinQuery.all(cutoff);
  return rows.map(rowToAgentTask);
}
6593
+
6594
/**
 * Atomically terminalizes a pinned resume, but only while it is still
 * `pending`, using a single `UPDATE … RETURNING`.
 *
 * The `AND status = 'pending'` precondition is the guard against the race
 * between reading a resume as `pending` and writing to it: if the agent
 * reclaimed the task in the gap (so it is no longer `pending`), no row is
 * updated and `null` is returned. The heartbeat reaper escalates to the Lead
 * only when a row comes back. `failTask` is deliberately not used here because
 * it has no status precondition and would terminalize a resume a worker just
 * started.
 *
 * @param taskId - Id of the resume task to terminalize.
 * @param status - Terminal status to write (`"cancelled"` or `"failed"`).
 * @param failureReason - Reason to store; passed through `scrubSecrets` first.
 * @returns The updated task when the transition fired, otherwise `null`.
 */
export function failPendingResumeIfUnclaimed(
  taskId: string,
  status: "cancelled" | "failed",
  failureReason: string,
): AgentTask | null {
  const now = new Date().toISOString();
  const scrubbedReason = scrubSecrets(failureReason);
  const row = getDb()
    .prepare<AgentTaskRow, [string, string, string, string, string]>(
      `UPDATE agent_tasks SET status = ?, failureReason = ?, finishedAt = ?, lastUpdatedAt = ?
       WHERE id = ? AND status = 'pending' RETURNING *`,
    )
    .get(status, scrubbedReason, now, now, taskId);

  if (!row) return null;

  try {
    createLogEntry({
      eventType: "task_status_change",
      taskId,
      agentId: row.agentId ?? undefined,
      oldValue: "pending",
      newValue: status,
      metadata: { reason: scrubbedReason, reaper: "pin_unreclaimed" },
    });
  } catch (err) {
    // Audit logging stays best-effort — a logging failure must not mask the
    // status transition — but surface it instead of swallowing it silently.
    console.warn(
      `[DB] Failed to record task_status_change log for task ${taskId} (pin_unreclaimed):`,
      err,
    );
  }

  return rowToAgentTask(row);
}
6635
+
6509
6636
  /**
6510
6637
  * Get idle, non-lead, non-offline agents that have capacity for more tasks.
6511
6638
  * Used by the heartbeat for auto-assignment of pool tasks.
@@ -42,16 +42,17 @@ export function recordRetrievals(
42
42
  const db = getDb();
43
43
  const insert = db.prepare(
44
44
  `INSERT INTO memory_retrieval
45
- (id, taskId, agentId, sessionId, memoryId, similarity, retrievedAt, contextKey, intent, eventType)
46
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
45
+ (id, taskId, agentId, sessionId, memoryId, similarity, retrievedAt, contextKey, intent, eventType, retrievalId, rank)
46
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
47
47
  );
48
48
  const now = new Date().toISOString();
49
+ const retrievalId = crypto.randomUUID();
49
50
  const contextKey = extras?.contextKey ?? null;
50
51
  const intent = extras?.intent ?? null;
51
52
  const eventType = extras?.eventType ?? "search";
52
53
 
53
54
  db.transaction(() => {
54
- for (const r of results) {
55
+ for (const [rank, r] of results.entries()) {
55
56
  insert.run(
56
57
  crypto.randomUUID(),
57
58
  taskId,
@@ -63,6 +64,8 @@ export function recordRetrievals(
63
64
  contextKey,
64
65
  intent,
65
66
  eventType,
67
+ retrievalId,
68
+ rank,
66
69
  );
67
70
  }
68
71
  })();
@@ -0,0 +1,10 @@
1
-- memory_retrieval: explicit per-search grouping.
--
-- recordRetrievals() inserts one row per returned memory. `retrievalId` is
-- shared by every row written by the same search/get call, and `rank` stores
-- each row's position in that call's result list, so precision@k / MRR can be
-- computed per call.

ALTER TABLE memory_retrieval
  ADD COLUMN retrievalId TEXT;

ALTER TABLE memory_retrieval
  ADD COLUMN rank INTEGER;

-- Lookup of all rows belonging to one retrieval call.
CREATE INDEX idx_memret_retrieval_id
  ON memory_retrieval (retrievalId);
@@ -3,6 +3,7 @@ import { findUserByExternalId } from "../be/users";
3
3
  import { resolveTemplate } from "../prompts/resolver";
4
4
  import { githubContextKey } from "../tasks/context-key";
5
5
  import { createTaskWithSiblingAwareness } from "../tasks/sibling-awareness";
6
+ import { getInstallationToken } from "./app";
6
7
  import {
7
8
  detectMention,
8
9
  extractMentionContext,
@@ -936,6 +937,69 @@ export async function handleComment(
936
937
  return { created: true, taskId: task.id };
937
938
  }
938
939
 
940
/**
 * One inline (line-level) comment of a pull request review, as read from the
 * JSON returned by the review-comments endpoint queried in
 * `fetchReviewComments`. Only the fields this module declares are listed.
 */
interface ReviewInlineComment {
  /** Numeric identifier of the comment. */
  id: number;
  /** Link to the comment. */
  html_url: string;
  /** File path the comment is attached to. */
  path: string;
  /** Line the comment refers to, or `null` when no line is available. */
  line: number | null;
  /** Diff excerpt the comment was made on. */
  diff_hunk: string;
  /** Text of the comment. */
  body: string;
}
948
+
949
/**
 * Extracts the `rel="next"` target from an HTTP `Link` header.
 *
 * Accepts the form `<url>; rel="next"` and additionally tolerates the other
 * shapes the Link header grammar allows: `rel` appearing after other
 * parameters, an unquoted value (`rel=next`), several space-separated relation
 * types in one value (`rel="prev next"`), and any letter case.
 *
 * @param linkHeader - Raw `Link` header value, or `null` when absent.
 * @returns The URL of the next page, or `null` when there is none.
 */
function parseNextPageLink(linkHeader: string | null): string | null {
  if (!linkHeader) return null;

  // Each match is one `<target>` plus its parameter run (everything up to the
  // next `<`, which starts the following link-value).
  for (const linkValue of linkHeader.matchAll(/<([^>]+)>([^<]*)/g)) {
    const target = linkValue[1];
    const params = linkValue[2] ?? "";
    const rel = params.match(/;\s*rel\s*=\s*(?:"([^"]*)"|([^\s;,"]+))/i);
    if (!target || !rel) continue;

    const relationTypes = (rel[1] ?? rel[2] ?? "").toLowerCase().split(/\s+/);
    if (relationTypes.includes("next")) return target;
  }
  return null;
}
954
+
955
/**
 * Downloads every inline comment attached to one pull request review,
 * following `Link: rel="next"` pagination (100 comments per page).
 *
 * Never throws: when no installation token is available the result is `[]`,
 * and when a request fails (non-OK status or a thrown error) the failure is
 * logged and the comments gathered so far are returned.
 *
 * @param repo - Repository full name (`owner/repo`).
 * @param prNumber - Pull request number.
 * @param reviewId - Id of the review whose comments are fetched.
 * @param installationId - GitHub App installation used to obtain a token.
 * @returns The comments collected, possibly partial or empty.
 */
async function fetchReviewComments(
  repo: string,
  prNumber: number,
  reviewId: number,
  installationId: number,
): Promise<ReviewInlineComment[]> {
  const token = await getInstallationToken(installationId);
  if (!token) return [];

  const requestInit = {
    headers: {
      Accept: "application/vnd.github+json",
      Authorization: `Bearer ${token}`,
      "X-GitHub-Api-Version": "2022-11-28",
    },
  };
  const firstPageUrl = `https://api.github.com/repos/${repo}/pulls/${prNumber}/reviews/${reviewId}/comments?per_page=100`;
  const collected: ReviewInlineComment[] = [];

  try {
    for (let pageUrl: string | null = firstPageUrl; pageUrl; ) {
      const response = await fetch(pageUrl, requestInit);
      if (!response.ok) {
        console.error(`[GitHub] Failed to fetch review inline comments: ${response.status}`);
        break;
      }

      const payload = (await response.json()) as ReviewInlineComment[];
      // Ignore a page whose body is not a JSON array, but keep paginating.
      if (Array.isArray(payload)) {
        collected.push(...payload);
      }
      pageUrl = parseNextPageLink(response.headers.get("link"));
    }
  } catch (error) {
    console.error("[GitHub] Error fetching review inline comments:", error);
  }

  return collected;
}
992
+
993
/**
 * Renders a review's inline comments as a markdown section for the task
 * description.
 *
 * Each comment becomes a list item: its location (`path:line`, or just `path`
 * when no line is set), an optional fenced `diff` block with the first 300
 * characters of the hunk, and the comment body as a blockquote. Every line of
 * the body is quoted, so a multi-line comment stays inside its quote instead
 * of spilling out after the first line.
 *
 * @param comments - Inline comments to render.
 * @returns The section text (starting with a blank line), or `""` when there
 *   are no comments.
 */
function buildInlineCommentsSection(comments: ReviewInlineComment[]): string {
  if (comments.length === 0) return "";
  const items = comments.map((c) => {
    const loc = c.line ? `${c.path}:${c.line}` : c.path;
    const hunk = c.diff_hunk ? `\n\`\`\`diff\n${c.diff_hunk.slice(0, 300)}\n\`\`\`` : "";
    // Prefix every body line; a single-line body renders exactly as before.
    const quotedBody = c.body
      .split(/\r?\n/)
      .map((bodyLine) => ` > ${bodyLine}`)
      .join("\n");
    return `- **${loc}**${hunk}\n${quotedBody}`;
  });
  return `\n\n## Inline review comments (${comments.length})\n\n${items.join("\n\n")}`;
}
1002
+
939
1003
  /**
940
1004
  * Handle pull_request_review events (submitted, edited, dismissed)
941
1005
  *
@@ -963,15 +1027,21 @@ export async function handlePullRequestReview(
963
1027
  return { created: false };
964
1028
  }
965
1029
 
966
- // Skip "commented" reviews that are empty - these are often just line comments
967
- // without an overall review body
968
- if (review.state === "commented" && !review.body) {
1030
+ // Deduplicate before making any API calls
1031
+ const eventKey = `pr-review:${repository.full_name}:${pr.number}:${review.id}`;
1032
+ if (isDuplicate(eventKey)) {
969
1033
  return { created: false };
970
1034
  }
971
1035
 
972
- // Deduplicate
973
- const eventKey = `pr-review:${repository.full_name}:${pr.number}:${review.id}`;
974
- if (isDuplicate(eventKey)) {
1036
+ // Fetch inline comments now so we can decide whether to skip and include them in the task.
1037
+ // Returns [] when no installation credentials are available (graceful degradation).
1038
+ const inlineComments = installation?.id
1039
+ ? await fetchReviewComments(repository.full_name, pr.number, review.id, installation.id)
1040
+ : [];
1041
+
1042
+ // Skip "commented" reviews only when there is neither an overall body nor any inline
1043
+ // comments — a body-less review with inline comments carries real reviewer feedback.
1044
+ if (review.state === "commented" && !review.body && inlineComments.length === 0) {
975
1045
  return { created: false };
976
1046
  }
977
1047
 
@@ -992,15 +1062,21 @@ export async function handlePullRequestReview(
992
1062
 
993
1063
  // Build task description
994
1064
  const reviewBodySection = review.body ? `\n\nReview Comment:\n${review.body}` : "";
1065
+ const inlineCommentsSection = buildInlineCommentsSection(inlineComments);
995
1066
  const relatedTaskSection = existingTask
996
1067
  ? `Related task: ${existingTask.id}\n🔀 Consider routing to the same agent working on the related task.\n`
997
1068
  : "";
998
- const reviewSuggestions =
1069
+
1070
+ const hasInlineComments = inlineComments.length > 0;
1071
+ const baseReviewSuggestion =
999
1072
  review.state === "approved"
1000
1073
  ? "💡 Suggested: Merge the PR or wait for additional reviews"
1001
1074
  : review.state === "changes_requested"
1002
1075
  ? "💡 Suggested: Address the requested changes and update the PR"
1003
1076
  : "💡 Suggested: Review the feedback and respond if needed";
1077
+ const reviewSuggestions = hasInlineComments
1078
+ ? `${baseReviewSuggestion}\n💬 Address EVERY inline comment. After pushing fixes, reply to and resolve each inline review thread on GitHub so the reviewer sees visible confirmation.`
1079
+ : baseReviewSuggestion;
1004
1080
 
1005
1081
  const result = resolveTemplate(
1006
1082
  "github.pull_request.review_submitted",
@@ -1013,6 +1089,7 @@ export async function handlePullRequestReview(
1013
1089
  repo_full_name: repository.full_name,
1014
1090
  review_url: review.html_url,
1015
1091
  review_body_section: reviewBodySection,
1092
+ inline_comments_section: inlineCommentsSection,
1016
1093
  related_task_section: relatedTaskSection,
1017
1094
  review_suggestions: reviewSuggestions,
1018
1095
  },
@@ -350,7 +350,7 @@ registerTemplate({
350
350
  defaultBody: `PR: {{pr_title}}
351
351
  Reviewer: {{sender_login}}
352
352
  Repo: {{repo_full_name}}
353
- URL: {{review_url}}{{review_body_section}}
353
+ URL: {{review_url}}{{review_body_section}}{{inline_comments_section}}
354
354
 
355
355
  ---
356
356
  {{related_task_section}}{{@template[common.delegation_instruction]}}
@@ -363,7 +363,11 @@ URL: {{review_url}}{{review_body_section}}
363
363
  { name: "sender_login", description: "Reviewer login" },
364
364
  { name: "repo_full_name", description: "Repository full name (owner/repo)" },
365
365
  { name: "review_url", description: "Review HTML URL" },
366
- { name: "review_body_section", description: "Review comment section or empty string" },
366
+ { name: "review_body_section", description: "Review overall body section or empty string" },
367
+ {
368
+ name: "inline_comments_section",
369
+ description: "Formatted inline review comments section or empty string",
370
+ },
367
371
  { name: "related_task_section", description: "Related task info or empty string" },
368
372
  { name: "review_suggestions", description: "Context-appropriate review suggestion" },
369
373
  ],
@@ -4,6 +4,7 @@ import {
4
4
  cleanupStaleSessions,
5
5
  createTaskExtended,
6
6
  deleteActiveSession,
7
+ failPendingResumeIfUnclaimed,
7
8
  failTask,
8
9
  getActiveSessionForTask,
9
10
  getActiveTaskCount,
@@ -14,7 +15,9 @@ import {
14
15
  getRecentCompletedCount,
15
16
  getRecentFailedCount,
16
17
  getRecentFailedTasks,
18
+ getStalePinnedResumes,
17
19
  getStalledInProgressTasks,
20
+ getTaskById,
18
21
  getTaskStats,
19
22
  getTasksByStatus,
20
23
  getUnassignedPoolTasks,
@@ -25,8 +28,14 @@ import {
25
28
  supersedeTask,
26
29
  updateAgentStatus,
27
30
  } from "../be/db";
31
+ import { repointTrackerSyncBySwarmId } from "../be/db-queries/tracker";
28
32
  import { resolveTemplate } from "../prompts/resolver";
29
- import { createResumeFollowUp, getNextResumeGeneration } from "../tasks/worker-follow-up";
33
+ import {
34
+ createRerouteDecisionTask,
35
+ createResumeFollowUp,
36
+ getNextResumeGeneration,
37
+ getResumeGeneration,
38
+ } from "../tasks/worker-follow-up";
30
39
  import type { AgentTask } from "../types";
31
40
  import { getExecutorRegistry } from "../workflows";
32
41
  import { recoverIncompleteRuns } from "../workflows/recovery";
@@ -36,8 +45,20 @@ import "./templates";
36
45
  /**
37
46
  * System tasks that must NOT be auto-resumed — mirrors `runRebootSweep`'s exclusion list
38
47
  * to prevent infinite retry loops on the heartbeat/triage system tasks themselves.
48
+ *
49
+ * `reroute-decision` is included (DES-523): it is a control-plane Lead task, not
50
+ * user work. If a Lead crashed while holding one, auto-resuming it would create a
51
+ * crash-recovery pin for the decision; reaping that pin would then treat the
52
+ * decision as the `original`, producing nested reroute-decisions ABOUT the control
53
+ * prompt instead of recovering the real work. So a crashed decision is failed, not
54
+ * resumed (the original work was already superseded; its recovery chain is separate).
39
55
  */
40
- const SKIP_AUTO_RESUME_TYPES = new Set(["heartbeat-checklist", "boot-triage", "heartbeat"]);
56
+ const SKIP_AUTO_RESUME_TYPES = new Set([
57
+ "heartbeat-checklist",
58
+ "boot-triage",
59
+ "heartbeat",
60
+ "reroute-decision",
61
+ ]);
41
62
 
42
63
  // ============================================================================
43
64
  // Configuration (env var overrides)
@@ -66,6 +87,29 @@ export const MAX_RESUME_GENERATIONS = Number(process.env.HEARTBEAT_MAX_RESUME_GE
66
87
 
67
88
  export const RESUME_BUDGET_EXHAUSTED_REASON = "resume_budget_exhausted";
68
89
 
90
/**
 * Parses the raw `HEARTBEAT_RESUME_PIN_GRACE_MIN` value.
 *
 * - unset, empty or whitespace-only → default of 10 (an empty string would
 *   otherwise become `Number("") === 0` and silently switch the reaper off);
 * - a finite number, including an explicit `0` → that number;
 * - anything non-finite (e.g. a typo'd `abc` → NaN) → default of 10, because a
 *   NaN grace would pass a `<= 0` disable check and then throw inside
 *   `new Date(NaN).toISOString()` on every sweep.
 */
function parseResumePinGraceMin(raw: string | undefined): number {
  const defaultGraceMin = 10;
  if (raw === undefined || raw.trim() === "") return defaultGraceMin;
  const parsed = Number(raw);
  return Number.isFinite(parsed) ? parsed : defaultGraceMin;
}

/**
 * Grace window (minutes) a crash-recovery resume pinned to its original agent
 * (DES-523 Phase 1) waits to be reclaimed before the reaper concludes the agent
 * is gone and escalates to a Lead re-delegation decision. Generous enough for a
 * slow container restart / image pull, short enough that a genuinely-gone
 * agent's work reaches the Lead promptly. Measured from the resume's
 * `createdAt` (= crash-detection time), so worst-case crash→escalation latency
 * is roughly `STALL_THRESHOLD_NO_SESSION_MIN` plus this value.
 *
 * Set to `0` to disable the reaper. The value is parsed explicitly (not with
 * `|| 10`) so an explicit `0` is honored rather than coerced back to the
 * default of 10.
 */
export const HEARTBEAT_RESUME_PIN_GRACE_MIN = parseResumePinGraceMin(
  process.env.HEARTBEAT_RESUME_PIN_GRACE_MIN,
);
112
+
69
113
  /** Heartbeat checklist interval: how often to check HEARTBEAT.md (default: 30 min) */
70
114
  const HEARTBEAT_CHECKLIST_INTERVAL_MS =
71
115
  Number(process.env.HEARTBEAT_CHECKLIST_INTERVAL_MS) || 30 * 60 * 1000;
@@ -86,6 +130,17 @@ export interface HeartbeatFindings {
86
130
  agentId: string;
87
131
  reason: string;
88
132
  }>;
133
+ /**
134
+ * Crash-recovery resumes pinned back to their original (stable-ID) agent
135
+ * instead of being released to the role-blind unassigned pool (DES-523). A
136
+ * subset of `autoResumedTasks`: the resume `taskId` + the agent it pinned to.
137
+ */
138
+ pinnedResumes: Array<{ taskId: string; agentId: string }>;
139
+ /**
140
+ * Pinned crash-recovery resumes that were never reclaimed within the grace
141
+ * window and were escalated to a Lead re-delegation decision (DES-523 Phase 3).
142
+ */
143
+ escalatedReroutes: Array<{ originalTaskId: string; decisionTaskId: string }>;
89
144
  workerHealthFixes: Array<{ agentId: string; oldStatus: string; newStatus: string }>;
90
145
  autoAssigned: Array<{ taskId: string; agentId: string }>;
91
146
  staleCleanup: {
@@ -157,6 +212,8 @@ export async function codeLevelTriage(): Promise<HeartbeatFindings> {
157
212
  stalledTasks: [],
158
213
  autoFailedTasks: [],
159
214
  autoResumedTasks: [],
215
+ pinnedResumes: [],
216
+ escalatedReroutes: [],
160
217
  workerHealthFixes: [],
161
218
  autoAssigned: [],
162
219
  staleCleanup: {
@@ -353,9 +410,20 @@ function remediateCrashedWorkerTask(
353
410
  agentId: task.agentId,
354
411
  reason: opts.supersedeReason,
355
412
  });
356
- console.log(
357
- `[Heartbeat] Auto-superseded task ${task.id.slice(0, 8)} created resume ${resume.task.id.slice(0, 8)} (${opts.shortLabel})`,
358
- );
413
+ // Phase 1 (DES-523): when the resume pinned back to the original
414
+ // (stable-ID) agent, record it so the sweep summary surfaces the pin
415
+ // rather than a silent pool fallback. `createResumeFollowUp` sets the
416
+ // resume's `agentId` to the original only on the crash_recovery pin path.
417
+ if (resume.task.agentId === task.agentId) {
418
+ findings.pinnedResumes.push({ taskId: resume.task.id, agentId: task.agentId });
419
+ console.log(
420
+ `[Heartbeat] Auto-superseded task ${task.id.slice(0, 8)} — pinned resume ${resume.task.id.slice(0, 8)} to original agent ${task.agentId.slice(0, 8)} (${opts.shortLabel})`,
421
+ );
422
+ } else {
423
+ console.log(
424
+ `[Heartbeat] Auto-superseded task ${task.id.slice(0, 8)} — created resume ${resume.task.id.slice(0, 8)} in unassigned pool (${opts.shortLabel})`,
425
+ );
426
+ }
359
427
  } else {
360
428
  const reason =
361
429
  resume.kind === "skipped"
@@ -558,6 +626,113 @@ function autoAssignPoolTasks(findings: HeartbeatFindings): void {
558
626
  })();
559
627
  }
560
628
 
629
/**
 * Reaper (DES-523 Phase 3): escalates crash-recovery resumes that were pinned
 * to their original agent but not reclaimed within
 * `HEARTBEAT_RESUME_PIN_GRACE_MIN` minutes to a Lead reroute-decision.
 *
 * "Gone" cannot be told from "restarting" when a crash is detected, so the pin
 * is created optimistically and this reaper decides "gone" once the pin has
 * demonstrably not been reclaimed.
 *
 * Per stale pin, in order:
 * 1. No parent → skipped (defensive).
 * 2. Resume generation at or above `MAX_RESUME_GENERATIONS` → the pin is
 *    failed with `RESUME_BUDGET_EXHAUSTED_REASON` and no decision is created,
 *    so a flapping task cannot loop through re-delegation forever. This step
 *    does not depend on a Lead being available.
 * 3. No usable (non-offline) Lead → the pin is left `pending`.
 * 4. Parent task missing → skipped.
 * 5. Otherwise, in ONE transaction: conditionally cancel the pin, repoint the
 *    tracker link from the pin back to the original, and create the Lead
 *    decision. If the pin was reclaimed in the gap, nothing is written. If the
 *    decision cannot be created, the transaction throws and rolls back, so the
 *    pin stays `pending` for the next sweep rather than being cancelled with
 *    no Lead signal.
 *
 * @param findings - Sweep accumulator; successful escalations are appended to
 *   `findings.escalatedReroutes`.
 */
function escalateUnreclaimedResumes(findings: HeartbeatFindings): void {
  const graceMin = HEARTBEAT_RESUME_PIN_GRACE_MIN;
  // Grace 0 (or below) = reaper disabled (rollback switch).
  if (graceMin <= 0) return;

  const unreclaimedPins = getStalePinnedResumes(graceMin);
  if (unreclaimedPins.length === 0) return;

  // Re-delegation needs a Lead that can poll. With none registered, or only an
  // offline one, escalation candidates stay pending instead of being handed to
  // an agent that cannot pick the decision up.
  const lead = getLeadAgent();
  const leadUnavailable = lead == null || lead.status === "offline";

  for (const pin of unreclaimedPins) {
    const parentId = pin.parentTaskId;
    if (!parentId) continue; // Defensive — resumes are expected to have a parent.

    // Budget guard, independent of Lead availability. The conditional fail is
    // atomic, so a pin the agent just reclaimed is never terminalized.
    if (getResumeGeneration(pin) >= MAX_RESUME_GENERATIONS) {
      const budgetFailed = failPendingResumeIfUnclaimed(
        pin.id,
        "failed",
        RESUME_BUDGET_EXHAUSTED_REASON,
      );
      if (budgetFailed) {
        console.warn(
          `[Heartbeat] Unreclaimed pinned resume ${pin.id.slice(0, 8)} hit the resume-generation cap — terminalized, no Lead decision`,
        );
      }
      continue;
    }

    if (leadUnavailable) continue; // Nothing to escalate to; leave the pin pending.

    const original = getTaskById(parentId);
    if (!original) continue; // Parent gone — nothing to escalate against.

    // Body of the escalation transaction. Returns null (no writes) when the
    // pin was reclaimed in the gap; throws to roll back when the decision
    // cannot be created.
    const runEscalation = (): { decisionTaskId: string } | null => {
      const cancelledPin = failPendingResumeIfUnclaimed(
        pin.id,
        "cancelled",
        "pin_unreclaimed_escalated",
      );
      if (!cancelledPin) return null;

      // The tracker link moved original → pin at pin time; the pin is now
      // dead, so move it back to the original.
      repointTrackerSyncBySwarmId(pin.id, original.id);

      const decision = createRerouteDecisionTask({
        original,
        staleResume: pin,
        reason: "crash_recovery",
        maxGenerations: MAX_RESUME_GENERATIONS,
      });
      if (decision.kind !== "created") {
        throw new Error(`reroute-decision not created: ${decision.reason}`);
      }
      return { decisionTaskId: decision.task.id };
    };

    let outcome: { decisionTaskId: string } | null = null;
    try {
      outcome = getDb().transaction(runEscalation)();
    } catch (err) {
      const detail = err instanceof Error ? err.message : String(err);
      console.warn(
        `[Heartbeat] Reroute escalation rolled back for resume ${pin.id.slice(0, 8)} — ${detail}; pin left pending for the next sweep`,
      );
      continue;
    }
    if (!outcome) continue; // Agent reclaimed the pin in the gap.

    findings.escalatedReroutes.push({
      originalTaskId: original.id,
      decisionTaskId: outcome.decisionTaskId,
    });
    console.log(
      `[Heartbeat] Escalated unreclaimed pinned resume ${pin.id.slice(0, 8)} → Lead reroute-decision ${outcome.decisionTaskId.slice(0, 8)} (original ${original.id.slice(0, 8)})`,
    );
  }
}
735
+
561
736
  /**
562
737
  * Call existing stale resource cleanup functions.
563
738
  */
@@ -572,6 +747,9 @@ async function cleanupStaleResources(findings: HeartbeatFindings): Promise<void>
572
747
  findings.staleCleanup.inboxProcessing = releaseStaleProcessingInbox(
573
748
  STALE_CLEANUP_THRESHOLD_MINUTES,
574
749
  );
750
+ // DES-523 Phase 3: escalate pinned crash-recovery resumes that were never
751
+ // reclaimed within the grace window to a Lead re-delegation decision.
752
+ escalateUnreclaimedResumes(findings);
575
753
  try {
576
754
  findings.staleCleanup.workflowRuns = await recoverIncompleteRuns(getExecutorRegistry());
577
755
  } catch {
@@ -854,6 +1032,8 @@ export async function runHeartbeatSweep(): Promise<void> {
854
1032
  stalledTasks: [],
855
1033
  autoFailedTasks: [],
856
1034
  autoResumedTasks: [],
1035
+ pinnedResumes: [],
1036
+ escalatedReroutes: [],
857
1037
  workerHealthFixes: [],
858
1038
  autoAssigned: [],
859
1039
  staleCleanup: {
@@ -891,6 +1071,12 @@ function logFindings(findings: HeartbeatFindings): void {
891
1071
  if (findings.autoResumedTasks.length > 0) {
892
1072
  parts.push(`auto_resumed=${findings.autoResumedTasks.length}`);
893
1073
  }
1074
+ if (findings.pinnedResumes.length > 0) {
1075
+ parts.push(`pinned_resumes=${findings.pinnedResumes.length}`);
1076
+ }
1077
+ if (findings.escalatedReroutes.length > 0) {
1078
+ parts.push(`escalated_reroutes=${findings.escalatedReroutes.length}`);
1079
+ }
894
1080
  if (findings.stalledTasks.length > 0) {
895
1081
  parts.push(`stalled=${findings.stalledTasks.length}`);
896
1082
  }