@desplega.ai/agent-swarm 1.100.2 → 1.100.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/openapi.json +1 -1
- package/package.json +1 -1
- package/src/be/db.ts +131 -4
- package/src/be/memory/raters/retrieval.ts +6 -3
- package/src/be/migrations/097_memory_retrieval_grouping.sql +10 -0
- package/src/github/handlers.ts +84 -7
- package/src/github/templates.ts +6 -2
- package/src/heartbeat/heartbeat.ts +191 -5
- package/src/providers/claude-adapter.ts +41 -4
- package/src/slack/assistant.ts +28 -0
- package/src/slack/channel-join.ts +38 -3
- package/src/slack/handlers.ts +4 -1
- package/src/tasks/worker-follow-up.ts +181 -20
- package/src/tests/claude-adapter-binary.test.ts +74 -0
- package/src/tests/github-handlers-inline-comments.test.ts +308 -0
- package/src/tests/heartbeat-reroute-decision.test.ts +570 -0
- package/src/tests/heartbeat-supersede-resume.test.ts +137 -0
- package/src/tests/heartbeat.test.ts +4 -2
- package/src/tests/memory-rater-implicit-citation.test.ts +31 -0
- package/src/tests/prompt-template-remaining.test.ts +2 -1
- package/src/tests/slack-assistant-comention-production.test.ts +319 -0
- package/src/tests/slack-assistant-comention.test.ts +139 -0
- package/src/tests/slack-channel-join.test.ts +150 -16
- package/src/tools/send-task.ts +51 -1
- package/src/tools/templates.ts +61 -0
package/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "Agent Swarm API",
|
|
5
|
-
"version": "1.100.2",
|
|
5
|
+
"version": "1.100.3",
|
|
6
6
|
"description": "Multi-agent orchestration API for Claude Code, Codex, and Gemini CLI. Enables task distribution, agent communication, and service discovery.\n\nMCP tools are documented separately in [MCP.md](./MCP.md)."
|
|
7
7
|
},
|
|
8
8
|
"servers": [
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@desplega.ai/agent-swarm",
|
|
3
|
-
"version": "1.100.2",
|
|
3
|
+
"version": "1.100.3",
|
|
4
4
|
"description": "Multi-agent orchestration for Claude Code, Codex, Gemini CLI, and other AI coding assistants",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"author": "desplega.sh <contact@desplega.sh>",
|
package/src/be/db.ts
CHANGED
|
@@ -753,8 +753,13 @@ export function getAllAgents(opts?: { slim?: boolean }): Agent[] {
|
|
|
753
753
|
}
|
|
754
754
|
|
|
755
755
|
export function getLeadAgent(): Agent | null {
|
|
756
|
-
const
|
|
757
|
-
|
|
756
|
+
const leads = getAllAgents().filter((a) => a.isLead);
|
|
757
|
+
// Prefer a usable (non-offline) lead so callers route to one that can actually
|
|
758
|
+
// poll — e.g. an old offline lead must not shadow a live replacement. Falls
|
|
759
|
+
// back to any lead (incl. offline) so existing "is there a lead at all?"
|
|
760
|
+
// semantics are preserved; callers that require a live lead must check
|
|
761
|
+
// `status` themselves (see escalateUnreclaimedResumes).
|
|
762
|
+
return leads.find((a) => a.status !== "offline") ?? leads[0] ?? null;
|
|
758
763
|
}
|
|
759
764
|
|
|
760
765
|
export function updateAgentStatus(id: string, status: AgentStatus): Agent | null {
|
|
@@ -1444,6 +1449,31 @@ export function hasNonTerminalResumeChild(parentId: string): boolean {
|
|
|
1444
1449
|
return row !== undefined && row !== null;
|
|
1445
1450
|
}
|
|
1446
1451
|
|
|
1452
|
+
/**
|
|
1453
|
+
* True when a non-terminal `reroute-decision` child exists for `parentId`.
|
|
1454
|
+
*
|
|
1455
|
+
* Mirrors {@link hasNonTerminalResumeChild} but filters on
|
|
1456
|
+
* `taskType = 'reroute-decision'` — the Lead-owned re-delegation decision
|
|
1457
|
+
* created when a pinned crash-recovery resume is never reclaimed (DES-523).
|
|
1458
|
+
* Makes escalation idempotent: a later heartbeat sweep must not create a second
|
|
1459
|
+
* decision for the same original task. We filter on the taskType marker
|
|
1460
|
+
* specifically (not any child) so ordinary delegation / completion follow-up
|
|
1461
|
+
* children of the original cannot suppress a needed decision, and nothing else
|
|
1462
|
+
* is mistaken for one.
|
|
1463
|
+
*/
|
|
1464
|
+
export function hasNonTerminalRerouteDecisionChild(parentId: string): boolean {
|
|
1465
|
+
const row = getDb()
|
|
1466
|
+
.prepare(
|
|
1467
|
+
`SELECT 1 FROM agent_tasks
|
|
1468
|
+
WHERE parentTaskId = ?
|
|
1469
|
+
AND taskType = 'reroute-decision'
|
|
1470
|
+
AND status NOT IN ('completed', 'failed', 'cancelled', 'superseded')
|
|
1471
|
+
LIMIT 1`,
|
|
1472
|
+
)
|
|
1473
|
+
.get(parentId);
|
|
1474
|
+
return row !== undefined && row !== null;
|
|
1475
|
+
}
|
|
1476
|
+
|
|
1447
1477
|
export function updateTaskClaudeSessionId(
|
|
1448
1478
|
taskId: string,
|
|
1449
1479
|
claudeSessionId: string,
|
|
@@ -2949,6 +2979,14 @@ export interface CreateTaskOptions {
|
|
|
2949
2979
|
* a schema'd task should be defensive about JSON parsing.
|
|
2950
2980
|
*/
|
|
2951
2981
|
outputSchema?: Record<string, unknown>;
|
|
2982
|
+
/**
|
|
2983
|
+
* When a `parentTaskId` is set, the child inherits the parent's `outputSchema`
|
|
2984
|
+
* by default. Set this to `false` to opt out — used by control-plane children
|
|
2985
|
+
* (e.g. the Lead `reroute-decision` task) that must inherit Slack/VCS context
|
|
2986
|
+
* from the parent but must NOT be forced to satisfy the original work's output
|
|
2987
|
+
* contract on completion (which would block the control task — DES-523).
|
|
2988
|
+
*/
|
|
2989
|
+
inheritParentOutputSchema?: boolean;
|
|
2952
2990
|
followUpConfig?: FollowUpConfig;
|
|
2953
2991
|
requestedByUserId?: string;
|
|
2954
2992
|
contextKey?: string;
|
|
@@ -3101,8 +3139,15 @@ export function createTaskExtended(task: string, options?: CreateTaskOptions): A
|
|
|
3101
3139
|
|
|
3102
3140
|
// Contract (schema validation) — `store-progress` validates completion
|
|
3103
3141
|
// output against `outputSchema`, runner injects structured-output
|
|
3104
|
-
// instructions only when it's present.
|
|
3105
|
-
|
|
3142
|
+
// instructions only when it's present. Opt-out via
|
|
3143
|
+
// `inheritParentOutputSchema: false` for control-plane children (e.g. the
|
|
3144
|
+
// Lead reroute-decision) that must not be held to the original work's
|
|
3145
|
+
// output contract.
|
|
3146
|
+
if (
|
|
3147
|
+
parent.outputSchema &&
|
|
3148
|
+
!options.outputSchema &&
|
|
3149
|
+
options.inheritParentOutputSchema !== false
|
|
3150
|
+
) {
|
|
3106
3151
|
options.outputSchema = parent.outputSchema;
|
|
3107
3152
|
}
|
|
3108
3153
|
|
|
@@ -6506,6 +6551,88 @@ export function getStalledInProgressTasks(thresholdMinutes: number = 30): AgentT
|
|
|
6506
6551
|
.map(rowToAgentTask);
|
|
6507
6552
|
}
|
|
6508
6553
|
|
|
6554
|
+
/**
|
|
6555
|
+
* Genuine same-agent crash-recovery PINS (tagged `crash-recovery-pin`, DES-523
|
|
6556
|
+
* Phase 1) that are still `pending` `graceMin` minutes after creation — the
|
|
6557
|
+
* heartbeat reaper escalates these to a Lead reroute-decision.
|
|
6558
|
+
*
|
|
6559
|
+
* Three scoping clauses, each load-bearing:
|
|
6560
|
+
* - `tags LIKE '%"crash-recovery-pin"%'` — restricts to resumes actually pinned
|
|
6561
|
+
* to their original agent on the crash path. Without it, a *pooled* resume
|
|
6562
|
+
* that `autoAssignPoolTasks` flips to `pending` earlier in the SAME sweep
|
|
6563
|
+
* (keeping its old `createdAt`) would be reaped and cancelled before the
|
|
6564
|
+
* assigned worker polls; it also keeps `context_limits` / `manual_supersede`
|
|
6565
|
+
* pins from being escalated under a `crash_recovery` label. (Literal must
|
|
6566
|
+
* match `CRASH_RECOVERY_PIN_TAG` in src/tasks/worker-follow-up.ts.)
|
|
6567
|
+
* - `status = 'pending'` — the "currently unreclaimed" discriminator: when the
|
|
6568
|
+
* agent reclaims via the normal poll path, `startTask` flips the row to
|
|
6569
|
+
* `in_progress` and it drops out of this set. (A reclaimed resume whose
|
|
6570
|
+
* session later orphans can be flipped back to `pending` by
|
|
6571
|
+
* `resetOrphanedInProgressTasksForAgent`, re-entering this set on a later
|
|
6572
|
+
* sweep — re-escalating genuinely re-stalled work, which is fine.) We do NOT
|
|
6573
|
+
* gate on `lastActivityAt` — it is stale for a returned-but-idle agent.
|
|
6574
|
+
* - `createdAt < cutoff` — `createdAt` is the resume's creation = crash-DETECTION
|
|
6575
|
+
* time, so the grace window is measured from detection.
|
|
6576
|
+
*
|
|
6577
|
+
* Keys only on reboot-durable columns, so a pending pin survives a server reboot
|
|
6578
|
+
* and is caught on the first post-reboot sweep.
|
|
6579
|
+
*/
|
|
6580
|
+
export function getStalePinnedResumes(graceMin: number): AgentTask[] {
|
|
6581
|
+
const cutoff = new Date(Date.now() - graceMin * 60 * 1000).toISOString();
|
|
6582
|
+
return getDb()
|
|
6583
|
+
.prepare<AgentTaskRow, [string]>(
|
|
6584
|
+
`SELECT * FROM agent_tasks
|
|
6585
|
+
WHERE taskType = 'resume' AND status = 'pending'
|
|
6586
|
+
AND tags LIKE '%"crash-recovery-pin"%'
|
|
6587
|
+
AND createdAt < ?
|
|
6588
|
+
ORDER BY createdAt ASC`,
|
|
6589
|
+
)
|
|
6590
|
+
.all(cutoff)
|
|
6591
|
+
.map(rowToAgentTask);
|
|
6592
|
+
}
|
|
6593
|
+
|
|
6594
|
+
/**
|
|
6595
|
+
* Atomically terminalize a pinned resume ONLY if it is still `pending`, in one
|
|
6596
|
+
* `UPDATE … RETURNING`. Returns the row when the transition fired, or `null`
|
|
6597
|
+
* when it did not (the agent reclaimed it in the gap → `startTask` already
|
|
6598
|
+
* flipped it to `in_progress`). The heartbeat reaper escalates to the Lead ONLY
|
|
6599
|
+
* when this returns a row, closing the TOCTOU window between reading the resume
|
|
6600
|
+
* as `pending` and writing.
|
|
6601
|
+
*
|
|
6602
|
+
* Deliberately NOT `failTask`: `failTask`'s backing SQL is keyed on `id` with no
|
|
6603
|
+
* status precondition, so it would terminalize an `in_progress` resume the
|
|
6604
|
+
* worker just started. The `AND status = 'pending'` here is the guard.
|
|
6605
|
+
*/
|
|
6606
|
+
export function failPendingResumeIfUnclaimed(
|
|
6607
|
+
taskId: string,
|
|
6608
|
+
status: "cancelled" | "failed",
|
|
6609
|
+
failureReason: string,
|
|
6610
|
+
): AgentTask | null {
|
|
6611
|
+
const now = new Date().toISOString();
|
|
6612
|
+
const scrubbedReason = scrubSecrets(failureReason);
|
|
6613
|
+
const row = getDb()
|
|
6614
|
+
.prepare<AgentTaskRow, [string, string, string, string, string]>(
|
|
6615
|
+
`UPDATE agent_tasks SET status = ?, failureReason = ?, finishedAt = ?, lastUpdatedAt = ?
|
|
6616
|
+
WHERE id = ? AND status = 'pending' RETURNING *`,
|
|
6617
|
+
)
|
|
6618
|
+
.get(status, scrubbedReason, now, now, taskId);
|
|
6619
|
+
|
|
6620
|
+
if (row) {
|
|
6621
|
+
try {
|
|
6622
|
+
createLogEntry({
|
|
6623
|
+
eventType: "task_status_change",
|
|
6624
|
+
taskId,
|
|
6625
|
+
agentId: row.agentId ?? undefined,
|
|
6626
|
+
oldValue: "pending",
|
|
6627
|
+
newValue: status,
|
|
6628
|
+
metadata: { reason: scrubbedReason, reaper: "pin_unreclaimed" },
|
|
6629
|
+
});
|
|
6630
|
+
} catch {}
|
|
6631
|
+
}
|
|
6632
|
+
|
|
6633
|
+
return row ? rowToAgentTask(row) : null;
|
|
6634
|
+
}
|
|
6635
|
+
|
|
6509
6636
|
/**
|
|
6510
6637
|
* Get idle, non-lead, non-offline agents that have capacity for more tasks.
|
|
6511
6638
|
* Used by the heartbeat for auto-assignment of pool tasks.
|
|
package/src/be/memory/raters/retrieval.ts
CHANGED
|
@@ -42,16 +42,17 @@ export function recordRetrievals(
|
|
|
42
42
|
const db = getDb();
|
|
43
43
|
const insert = db.prepare(
|
|
44
44
|
`INSERT INTO memory_retrieval
|
|
45
|
-
(id, taskId, agentId, sessionId, memoryId, similarity, retrievedAt, contextKey, intent, eventType)
|
|
46
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
|
|
45
|
+
(id, taskId, agentId, sessionId, memoryId, similarity, retrievedAt, contextKey, intent, eventType, retrievalId, rank)
|
|
46
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`,
|
|
47
47
|
);
|
|
48
48
|
const now = new Date().toISOString();
|
|
49
|
+
const retrievalId = crypto.randomUUID();
|
|
49
50
|
const contextKey = extras?.contextKey ?? null;
|
|
50
51
|
const intent = extras?.intent ?? null;
|
|
51
52
|
const eventType = extras?.eventType ?? "search";
|
|
52
53
|
|
|
53
54
|
db.transaction(() => {
|
|
54
|
-
for (const r of results) {
|
|
55
|
+
for (const [rank, r] of results.entries()) {
|
|
55
56
|
insert.run(
|
|
56
57
|
crypto.randomUUID(),
|
|
57
58
|
taskId,
|
|
@@ -63,6 +64,8 @@ export function recordRetrievals(
|
|
|
63
64
|
contextKey,
|
|
64
65
|
intent,
|
|
65
66
|
eventType,
|
|
67
|
+
retrievalId,
|
|
68
|
+
rank,
|
|
66
69
|
);
|
|
67
70
|
}
|
|
68
71
|
})();
|
|
package/src/be/migrations/097_memory_retrieval_grouping.sql
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
-- Add explicit per-search grouping to memory_retrieval.
|
|
2
|
+
--
|
|
3
|
+
-- `recordRetrievals()` writes one row per returned memory. A single
|
|
4
|
+
-- retrievalId groups all rows from the same search/get call, and rank
|
|
5
|
+
-- preserves the result order within that call for precision@k/MRR analysis.
|
|
6
|
+
|
|
7
|
+
ALTER TABLE memory_retrieval ADD COLUMN retrievalId TEXT;
|
|
8
|
+
ALTER TABLE memory_retrieval ADD COLUMN rank INTEGER;
|
|
9
|
+
|
|
10
|
+
CREATE INDEX idx_memret_retrieval_id ON memory_retrieval(retrievalId);
|
package/src/github/handlers.ts
CHANGED
|
@@ -3,6 +3,7 @@ import { findUserByExternalId } from "../be/users";
|
|
|
3
3
|
import { resolveTemplate } from "../prompts/resolver";
|
|
4
4
|
import { githubContextKey } from "../tasks/context-key";
|
|
5
5
|
import { createTaskWithSiblingAwareness } from "../tasks/sibling-awareness";
|
|
6
|
+
import { getInstallationToken } from "./app";
|
|
6
7
|
import {
|
|
7
8
|
detectMention,
|
|
8
9
|
extractMentionContext,
|
|
@@ -936,6 +937,69 @@ export async function handleComment(
|
|
|
936
937
|
return { created: true, taskId: task.id };
|
|
937
938
|
}
|
|
938
939
|
|
|
940
|
+
interface ReviewInlineComment {
|
|
941
|
+
id: number;
|
|
942
|
+
path: string;
|
|
943
|
+
line: number | null;
|
|
944
|
+
body: string;
|
|
945
|
+
html_url: string;
|
|
946
|
+
diff_hunk: string;
|
|
947
|
+
}
|
|
948
|
+
|
|
949
|
+
function parseNextPageLink(linkHeader: string | null): string | null {
|
|
950
|
+
if (!linkHeader) return null;
|
|
951
|
+
const match = linkHeader.match(/<([^>]+)>;\s*rel="next"/);
|
|
952
|
+
return match ? (match[1] ?? null) : null;
|
|
953
|
+
}
|
|
954
|
+
|
|
955
|
+
async function fetchReviewComments(
|
|
956
|
+
repo: string,
|
|
957
|
+
prNumber: number,
|
|
958
|
+
reviewId: number,
|
|
959
|
+
installationId: number,
|
|
960
|
+
): Promise<ReviewInlineComment[]> {
|
|
961
|
+
const token = await getInstallationToken(installationId);
|
|
962
|
+
if (!token) {
|
|
963
|
+
return [];
|
|
964
|
+
}
|
|
965
|
+
const headers = {
|
|
966
|
+
Accept: "application/vnd.github+json",
|
|
967
|
+
Authorization: `Bearer ${token}`,
|
|
968
|
+
"X-GitHub-Api-Version": "2022-11-28",
|
|
969
|
+
};
|
|
970
|
+
const allComments: ReviewInlineComment[] = [];
|
|
971
|
+
let url: string | null =
|
|
972
|
+
`https://api.github.com/repos/${repo}/pulls/${prNumber}/reviews/${reviewId}/comments?per_page=100`;
|
|
973
|
+
try {
|
|
974
|
+
while (url) {
|
|
975
|
+
const response = await fetch(url, { headers });
|
|
976
|
+
if (!response.ok) {
|
|
977
|
+
console.error(`[GitHub] Failed to fetch review inline comments: ${response.status}`);
|
|
978
|
+
return allComments;
|
|
979
|
+
}
|
|
980
|
+
const page = (await response.json()) as ReviewInlineComment[];
|
|
981
|
+
if (Array.isArray(page)) {
|
|
982
|
+
allComments.push(...page);
|
|
983
|
+
}
|
|
984
|
+
url = parseNextPageLink(response.headers.get("link"));
|
|
985
|
+
}
|
|
986
|
+
return allComments;
|
|
987
|
+
} catch (error) {
|
|
988
|
+
console.error("[GitHub] Error fetching review inline comments:", error);
|
|
989
|
+
return allComments;
|
|
990
|
+
}
|
|
991
|
+
}
|
|
992
|
+
|
|
993
|
+
function buildInlineCommentsSection(comments: ReviewInlineComment[]): string {
|
|
994
|
+
if (comments.length === 0) return "";
|
|
995
|
+
const items = comments.map((c) => {
|
|
996
|
+
const loc = c.line ? `${c.path}:${c.line}` : c.path;
|
|
997
|
+
const hunk = c.diff_hunk ? `\n\`\`\`diff\n${c.diff_hunk.slice(0, 300)}\n\`\`\`` : "";
|
|
998
|
+
return `- **${loc}**${hunk}\n > ${c.body}`;
|
|
999
|
+
});
|
|
1000
|
+
return `\n\n## Inline review comments (${comments.length})\n\n${items.join("\n\n")}`;
|
|
1001
|
+
}
|
|
1002
|
+
|
|
939
1003
|
/**
|
|
940
1004
|
* Handle pull_request_review events (submitted, edited, dismissed)
|
|
941
1005
|
*
|
|
@@ -963,15 +1027,21 @@ export async function handlePullRequestReview(
|
|
|
963
1027
|
return { created: false };
|
|
964
1028
|
}
|
|
965
1029
|
|
|
966
|
-
//
|
|
967
|
-
|
|
968
|
-
if (
|
|
1030
|
+
// Deduplicate before making any API calls
|
|
1031
|
+
const eventKey = `pr-review:${repository.full_name}:${pr.number}:${review.id}`;
|
|
1032
|
+
if (isDuplicate(eventKey)) {
|
|
969
1033
|
return { created: false };
|
|
970
1034
|
}
|
|
971
1035
|
|
|
972
|
-
//
|
|
973
|
-
|
|
974
|
-
|
|
1036
|
+
// Fetch inline comments now so we can decide whether to skip and include them in the task.
|
|
1037
|
+
// Returns [] when no installation credentials are available (graceful degradation).
|
|
1038
|
+
const inlineComments = installation?.id
|
|
1039
|
+
? await fetchReviewComments(repository.full_name, pr.number, review.id, installation.id)
|
|
1040
|
+
: [];
|
|
1041
|
+
|
|
1042
|
+
// Skip "commented" reviews only when there is neither an overall body nor any inline
|
|
1043
|
+
// comments — a body-less review with inline comments carries real reviewer feedback.
|
|
1044
|
+
if (review.state === "commented" && !review.body && inlineComments.length === 0) {
|
|
975
1045
|
return { created: false };
|
|
976
1046
|
}
|
|
977
1047
|
|
|
@@ -992,15 +1062,21 @@ export async function handlePullRequestReview(
|
|
|
992
1062
|
|
|
993
1063
|
// Build task description
|
|
994
1064
|
const reviewBodySection = review.body ? `\n\nReview Comment:\n${review.body}` : "";
|
|
1065
|
+
const inlineCommentsSection = buildInlineCommentsSection(inlineComments);
|
|
995
1066
|
const relatedTaskSection = existingTask
|
|
996
1067
|
? `Related task: ${existingTask.id}\n🔀 Consider routing to the same agent working on the related task.\n`
|
|
997
1068
|
: "";
|
|
998
|
-
|
|
1069
|
+
|
|
1070
|
+
const hasInlineComments = inlineComments.length > 0;
|
|
1071
|
+
const baseReviewSuggestion =
|
|
999
1072
|
review.state === "approved"
|
|
1000
1073
|
? "💡 Suggested: Merge the PR or wait for additional reviews"
|
|
1001
1074
|
: review.state === "changes_requested"
|
|
1002
1075
|
? "💡 Suggested: Address the requested changes and update the PR"
|
|
1003
1076
|
: "💡 Suggested: Review the feedback and respond if needed";
|
|
1077
|
+
const reviewSuggestions = hasInlineComments
|
|
1078
|
+
? `${baseReviewSuggestion}\n💬 Address EVERY inline comment. After pushing fixes, reply to and resolve each inline review thread on GitHub so the reviewer sees visible confirmation.`
|
|
1079
|
+
: baseReviewSuggestion;
|
|
1004
1080
|
|
|
1005
1081
|
const result = resolveTemplate(
|
|
1006
1082
|
"github.pull_request.review_submitted",
|
|
@@ -1013,6 +1089,7 @@ export async function handlePullRequestReview(
|
|
|
1013
1089
|
repo_full_name: repository.full_name,
|
|
1014
1090
|
review_url: review.html_url,
|
|
1015
1091
|
review_body_section: reviewBodySection,
|
|
1092
|
+
inline_comments_section: inlineCommentsSection,
|
|
1016
1093
|
related_task_section: relatedTaskSection,
|
|
1017
1094
|
review_suggestions: reviewSuggestions,
|
|
1018
1095
|
},
|
package/src/github/templates.ts
CHANGED
|
@@ -350,7 +350,7 @@ registerTemplate({
|
|
|
350
350
|
defaultBody: `PR: {{pr_title}}
|
|
351
351
|
Reviewer: {{sender_login}}
|
|
352
352
|
Repo: {{repo_full_name}}
|
|
353
|
-
URL: {{review_url}}{{review_body_section}}
|
|
353
|
+
URL: {{review_url}}{{review_body_section}}{{inline_comments_section}}
|
|
354
354
|
|
|
355
355
|
---
|
|
356
356
|
{{related_task_section}}{{@template[common.delegation_instruction]}}
|
|
@@ -363,7 +363,11 @@ URL: {{review_url}}{{review_body_section}}
|
|
|
363
363
|
{ name: "sender_login", description: "Reviewer login" },
|
|
364
364
|
{ name: "repo_full_name", description: "Repository full name (owner/repo)" },
|
|
365
365
|
{ name: "review_url", description: "Review HTML URL" },
|
|
366
|
-
{ name: "review_body_section", description: "Review
|
|
366
|
+
{ name: "review_body_section", description: "Review overall body section or empty string" },
|
|
367
|
+
{
|
|
368
|
+
name: "inline_comments_section",
|
|
369
|
+
description: "Formatted inline review comments section or empty string",
|
|
370
|
+
},
|
|
367
371
|
{ name: "related_task_section", description: "Related task info or empty string" },
|
|
368
372
|
{ name: "review_suggestions", description: "Context-appropriate review suggestion" },
|
|
369
373
|
],
|
|
package/src/heartbeat/heartbeat.ts
CHANGED
|
@@ -4,6 +4,7 @@ import {
|
|
|
4
4
|
cleanupStaleSessions,
|
|
5
5
|
createTaskExtended,
|
|
6
6
|
deleteActiveSession,
|
|
7
|
+
failPendingResumeIfUnclaimed,
|
|
7
8
|
failTask,
|
|
8
9
|
getActiveSessionForTask,
|
|
9
10
|
getActiveTaskCount,
|
|
@@ -14,7 +15,9 @@ import {
|
|
|
14
15
|
getRecentCompletedCount,
|
|
15
16
|
getRecentFailedCount,
|
|
16
17
|
getRecentFailedTasks,
|
|
18
|
+
getStalePinnedResumes,
|
|
17
19
|
getStalledInProgressTasks,
|
|
20
|
+
getTaskById,
|
|
18
21
|
getTaskStats,
|
|
19
22
|
getTasksByStatus,
|
|
20
23
|
getUnassignedPoolTasks,
|
|
@@ -25,8 +28,14 @@ import {
|
|
|
25
28
|
supersedeTask,
|
|
26
29
|
updateAgentStatus,
|
|
27
30
|
} from "../be/db";
|
|
31
|
+
import { repointTrackerSyncBySwarmId } from "../be/db-queries/tracker";
|
|
28
32
|
import { resolveTemplate } from "../prompts/resolver";
|
|
29
|
-
import {
|
|
33
|
+
import {
|
|
34
|
+
createRerouteDecisionTask,
|
|
35
|
+
createResumeFollowUp,
|
|
36
|
+
getNextResumeGeneration,
|
|
37
|
+
getResumeGeneration,
|
|
38
|
+
} from "../tasks/worker-follow-up";
|
|
30
39
|
import type { AgentTask } from "../types";
|
|
31
40
|
import { getExecutorRegistry } from "../workflows";
|
|
32
41
|
import { recoverIncompleteRuns } from "../workflows/recovery";
|
|
@@ -36,8 +45,20 @@ import "./templates";
|
|
|
36
45
|
/**
|
|
37
46
|
* System tasks that must NOT be auto-resumed — mirrors `runRebootSweep`'s exclusion list
|
|
38
47
|
* to prevent infinite retry loops on the heartbeat/triage system tasks themselves.
|
|
48
|
+
*
|
|
49
|
+
* `reroute-decision` is included (DES-523): it is a control-plane Lead task, not
|
|
50
|
+
* user work. If a Lead crashed while holding one, auto-resuming it would create a
|
|
51
|
+
* crash-recovery pin for the decision; reaping that pin would then treat the
|
|
52
|
+
* decision as the `original`, producing nested reroute-decisions ABOUT the control
|
|
53
|
+
* prompt instead of recovering the real work. So a crashed decision is failed, not
|
|
54
|
+
* resumed (the original work was already superseded; its recovery chain is separate).
|
|
39
55
|
*/
|
|
40
|
-
const SKIP_AUTO_RESUME_TYPES = new Set([
|
|
56
|
+
const SKIP_AUTO_RESUME_TYPES = new Set([
|
|
57
|
+
"heartbeat-checklist",
|
|
58
|
+
"boot-triage",
|
|
59
|
+
"heartbeat",
|
|
60
|
+
"reroute-decision",
|
|
61
|
+
]);
|
|
41
62
|
|
|
42
63
|
// ============================================================================
|
|
43
64
|
// Configuration (env var overrides)
|
|
@@ -66,6 +87,29 @@ export const MAX_RESUME_GENERATIONS = Number(process.env.HEARTBEAT_MAX_RESUME_GE
|
|
|
66
87
|
|
|
67
88
|
export const RESUME_BUDGET_EXHAUSTED_REASON = "resume_budget_exhausted";
|
|
68
89
|
|
|
90
|
+
/**
|
|
91
|
+
* Grace window (minutes) a crash-recovery resume pinned to its original agent
|
|
92
|
+
* (DES-523 Phase 1) waits to be reclaimed before the reaper concludes the agent
|
|
93
|
+
* is gone and escalates to a Lead re-delegation decision. Generous enough for a
|
|
94
|
+
* slow container restart / image pull, short enough that a genuinely-gone
|
|
95
|
+
* agent's work reaches the Lead promptly. Measured from the resume's `createdAt`
|
|
96
|
+
* (= crash-detection time), so worst-case crash→escalation latency is
|
|
97
|
+
* ~`STALL_THRESHOLD_NO_SESSION_MIN` + this. Set to `0` to disable the reaper.
|
|
98
|
+
*
|
|
99
|
+
* Uses `??` (not `|| 10`) so an explicit `0` is honored as "reaper off" rather
|
|
100
|
+
* than coerced back to the default.
|
|
101
|
+
*/
|
|
102
|
+
export const HEARTBEAT_RESUME_PIN_GRACE_MIN = (() => {
|
|
103
|
+
const raw = process.env.HEARTBEAT_RESUME_PIN_GRACE_MIN;
|
|
104
|
+
if (raw === undefined) return 10;
|
|
105
|
+
const parsed = Number(raw);
|
|
106
|
+
// Honor an explicit `0` (reaper off), but fall back to the default on a
|
|
107
|
+
// non-finite value (e.g. a typo'd `abc` → NaN). Without this guard, NaN passes
|
|
108
|
+
// the `<= 0` disable check, reaches getStalePinnedResumes(NaN), and throws in
|
|
109
|
+
// `new Date(NaN).toISOString()` — breaking cleanup on every sweep.
|
|
110
|
+
return Number.isFinite(parsed) ? parsed : 10;
|
|
111
|
+
})();
|
|
112
|
+
|
|
69
113
|
/** Heartbeat checklist interval: how often to check HEARTBEAT.md (default: 30 min) */
|
|
70
114
|
const HEARTBEAT_CHECKLIST_INTERVAL_MS =
|
|
71
115
|
Number(process.env.HEARTBEAT_CHECKLIST_INTERVAL_MS) || 30 * 60 * 1000;
|
|
@@ -86,6 +130,17 @@ export interface HeartbeatFindings {
|
|
|
86
130
|
agentId: string;
|
|
87
131
|
reason: string;
|
|
88
132
|
}>;
|
|
133
|
+
/**
|
|
134
|
+
* Crash-recovery resumes pinned back to their original (stable-ID) agent
|
|
135
|
+
* instead of being released to the role-blind unassigned pool (DES-523). A
|
|
136
|
+
* subset of `autoResumedTasks`: the resume `taskId` + the agent it pinned to.
|
|
137
|
+
*/
|
|
138
|
+
pinnedResumes: Array<{ taskId: string; agentId: string }>;
|
|
139
|
+
/**
|
|
140
|
+
* Pinned crash-recovery resumes that were never reclaimed within the grace
|
|
141
|
+
* window and were escalated to a Lead re-delegation decision (DES-523 Phase 3).
|
|
142
|
+
*/
|
|
143
|
+
escalatedReroutes: Array<{ originalTaskId: string; decisionTaskId: string }>;
|
|
89
144
|
workerHealthFixes: Array<{ agentId: string; oldStatus: string; newStatus: string }>;
|
|
90
145
|
autoAssigned: Array<{ taskId: string; agentId: string }>;
|
|
91
146
|
staleCleanup: {
|
|
@@ -157,6 +212,8 @@ export async function codeLevelTriage(): Promise<HeartbeatFindings> {
|
|
|
157
212
|
stalledTasks: [],
|
|
158
213
|
autoFailedTasks: [],
|
|
159
214
|
autoResumedTasks: [],
|
|
215
|
+
pinnedResumes: [],
|
|
216
|
+
escalatedReroutes: [],
|
|
160
217
|
workerHealthFixes: [],
|
|
161
218
|
autoAssigned: [],
|
|
162
219
|
staleCleanup: {
|
|
@@ -353,9 +410,20 @@ function remediateCrashedWorkerTask(
|
|
|
353
410
|
agentId: task.agentId,
|
|
354
411
|
reason: opts.supersedeReason,
|
|
355
412
|
});
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
413
|
+
// Phase 1 (DES-523): when the resume pinned back to the original
|
|
414
|
+
// (stable-ID) agent, record it so the sweep summary surfaces the pin
|
|
415
|
+
// rather than a silent pool fallback. `createResumeFollowUp` sets the
|
|
416
|
+
// resume's `agentId` to the original only on the crash_recovery pin path.
|
|
417
|
+
if (resume.task.agentId === task.agentId) {
|
|
418
|
+
findings.pinnedResumes.push({ taskId: resume.task.id, agentId: task.agentId });
|
|
419
|
+
console.log(
|
|
420
|
+
`[Heartbeat] Auto-superseded task ${task.id.slice(0, 8)} — pinned resume ${resume.task.id.slice(0, 8)} to original agent ${task.agentId.slice(0, 8)} (${opts.shortLabel})`,
|
|
421
|
+
);
|
|
422
|
+
} else {
|
|
423
|
+
console.log(
|
|
424
|
+
`[Heartbeat] Auto-superseded task ${task.id.slice(0, 8)} — created resume ${resume.task.id.slice(0, 8)} in unassigned pool (${opts.shortLabel})`,
|
|
425
|
+
);
|
|
426
|
+
}
|
|
359
427
|
} else {
|
|
360
428
|
const reason =
|
|
361
429
|
resume.kind === "skipped"
|
|
@@ -558,6 +626,113 @@ function autoAssignPoolTasks(findings: HeartbeatFindings): void {
|
|
|
558
626
|
})();
|
|
559
627
|
}
|
|
560
628
|
|
|
629
|
+
/**
|
|
630
|
+
* Reaper (DES-523 Phase 3): escalate crash-recovery resumes that were pinned to
|
|
631
|
+
* their original agent (Phase 1) but never reclaimed within
|
|
632
|
+
* `HEARTBEAT_RESUME_PIN_GRACE_MIN`. This is the ONLY path to the Lead decision —
|
|
633
|
+
* "gone" can't be told from "restarting" at crash-detection time, so Phase 1
|
|
634
|
+
* pins optimistically and this reaper decides "gone" once a pin demonstrably
|
|
635
|
+
* fails to be reclaimed. After this runs, the heartbeat crash path never touches
|
|
636
|
+
* the unassigned pool.
|
|
637
|
+
*
|
|
638
|
+
* Wired into `cleanupStaleResources`, so it runs on every sweep — including the
|
|
639
|
+
* cleanup-only preflight-bail path and the first post-reboot sweep — and a
|
|
640
|
+
* pending pin is reaped even when the system otherwise looks idle.
|
|
641
|
+
*/
|
|
642
|
+
function escalateUnreclaimedResumes(findings: HeartbeatFindings): void {
|
|
643
|
+
// Grace 0 = reaper disabled (rollback switch).
|
|
644
|
+
if (HEARTBEAT_RESUME_PIN_GRACE_MIN <= 0) return;
|
|
645
|
+
|
|
646
|
+
const stale = getStalePinnedResumes(HEARTBEAT_RESUME_PIN_GRACE_MIN);
|
|
647
|
+
if (stale.length === 0) return;
|
|
648
|
+
|
|
649
|
+
// A non-offline Lead is required to re-delegate. Without one (none registered,
|
|
650
|
+
// or the only lead is `offline` after POST /close), leave escalation candidates
|
|
651
|
+
// `pending` rather than cancel the pin and hand the decision to an agent that
|
|
652
|
+
// can't poll it (which would strand the work). The budget-exhaustion path below
|
|
653
|
+
// is independent of the Lead and still runs. `getLeadAgent` already prefers a
|
|
654
|
+
// non-offline lead, so this also guards the createRerouteDecisionTask assignment.
|
|
655
|
+
const lead = getLeadAgent();
|
|
656
|
+
const hasLead = lead != null && lead.status !== "offline";
|
|
657
|
+
|
|
658
|
+
for (const resume of stale) {
|
|
659
|
+
if (!resume.parentTaskId) continue; // Defensive — resumes always have a parent.
|
|
660
|
+
|
|
661
|
+
// Budget guard: a resume already at the generation cap must NOT spawn another
|
|
662
|
+
// Lead re-delegation (send-task does not enforce the generation tag, so a
|
|
663
|
+
// flapping task could loop forever). Terminalize and stop. Atomic, so we
|
|
664
|
+
// never kill a resume the agent just reclaimed in the gap.
|
|
665
|
+
if (getResumeGeneration(resume) >= MAX_RESUME_GENERATIONS) {
|
|
666
|
+
const failed = failPendingResumeIfUnclaimed(
|
|
667
|
+
resume.id,
|
|
668
|
+
"failed",
|
|
669
|
+
RESUME_BUDGET_EXHAUSTED_REASON,
|
|
670
|
+
);
|
|
671
|
+
if (failed) {
|
|
672
|
+
console.warn(
|
|
673
|
+
`[Heartbeat] Unreclaimed pinned resume ${resume.id.slice(0, 8)} hit the resume-generation cap — terminalized, no Lead decision`,
|
|
674
|
+
);
|
|
675
|
+
}
|
|
676
|
+
continue;
|
|
677
|
+
}
|
|
678
|
+
|
|
679
|
+
if (!hasLead) continue; // No lead → leave the pin pending; nothing to escalate to.
|
|
680
|
+
|
|
681
|
+
const original = getTaskById(resume.parentTaskId);
|
|
682
|
+
if (!original) continue; // Parent gone — nothing to escalate against.
|
|
683
|
+
|
|
684
|
+
// Escalate atomically: terminalize the pin + repoint the tracker link
|
|
685
|
+
// (original → R1 at pin time; R1 is now dead, so move it back so the Lead's
|
|
686
|
+
// re-delegated resume inherits it via send-task) + create the Lead decision,
|
|
687
|
+
// all in ONE transaction. A mid-sequence process death therefore can't leave
|
|
688
|
+
// the pin cancelled with no Lead signal (which would orphan the work — it is
|
|
689
|
+
// invisible to both the stall detector and this reaper afterward).
|
|
690
|
+
// - The conditional terminalize still returns null if the agent reclaimed
|
|
691
|
+
// the pin in the gap → abort with no writes and skip (TOCTOU guard).
|
|
692
|
+
// - If the decision can't be created (unexpected — hasLead is checked and a
|
|
693
|
+
// still-`pending` pin implies no prior decision), throw to roll back the
|
|
694
|
+
// cancel so the pin is retried next sweep instead of being stranded.
|
|
695
|
+
let escalation: { decisionTaskId: string } | null = null;
|
|
696
|
+
try {
|
|
697
|
+
escalation = getDb().transaction(() => {
|
|
698
|
+
const terminalized = failPendingResumeIfUnclaimed(
|
|
699
|
+
resume.id,
|
|
700
|
+
"cancelled",
|
|
701
|
+
"pin_unreclaimed_escalated",
|
|
702
|
+
);
|
|
703
|
+
if (!terminalized) return null; // reclaimed in the gap — no writes made
|
|
704
|
+
repointTrackerSyncBySwarmId(resume.id, original.id);
|
|
705
|
+
const decision = createRerouteDecisionTask({
|
|
706
|
+
original,
|
|
707
|
+
staleResume: resume,
|
|
708
|
+
reason: "crash_recovery",
|
|
709
|
+
maxGenerations: MAX_RESUME_GENERATIONS,
|
|
710
|
+
});
|
|
711
|
+
if (decision.kind !== "created") {
|
|
712
|
+
throw new Error(`reroute-decision not created: ${decision.reason}`);
|
|
713
|
+
}
|
|
714
|
+
return { decisionTaskId: decision.task.id };
|
|
715
|
+
})();
|
|
716
|
+
} catch (err) {
|
|
717
|
+
console.warn(
|
|
718
|
+
`[Heartbeat] Reroute escalation rolled back for resume ${resume.id.slice(0, 8)} — ${
|
|
719
|
+
err instanceof Error ? err.message : String(err)
|
|
720
|
+
}; pin left pending for the next sweep`,
|
|
721
|
+
);
|
|
722
|
+
continue;
|
|
723
|
+
}
|
|
724
|
+
if (!escalation) continue; // agent reclaimed the pin in the gap
|
|
725
|
+
|
|
726
|
+
findings.escalatedReroutes.push({
|
|
727
|
+
originalTaskId: original.id,
|
|
728
|
+
decisionTaskId: escalation.decisionTaskId,
|
|
729
|
+
});
|
|
730
|
+
console.log(
|
|
731
|
+
`[Heartbeat] Escalated unreclaimed pinned resume ${resume.id.slice(0, 8)} → Lead reroute-decision ${escalation.decisionTaskId.slice(0, 8)} (original ${original.id.slice(0, 8)})`,
|
|
732
|
+
);
|
|
733
|
+
}
|
|
734
|
+
}
|
|
735
|
+
|
|
561
736
|
/**
|
|
562
737
|
* Call existing stale resource cleanup functions.
|
|
563
738
|
*/
|
|
@@ -572,6 +747,9 @@ async function cleanupStaleResources(findings: HeartbeatFindings): Promise<void>
|
|
|
572
747
|
findings.staleCleanup.inboxProcessing = releaseStaleProcessingInbox(
|
|
573
748
|
STALE_CLEANUP_THRESHOLD_MINUTES,
|
|
574
749
|
);
|
|
750
|
+
// DES-523 Phase 3: escalate pinned crash-recovery resumes that were never
|
|
751
|
+
// reclaimed within the grace window to a Lead re-delegation decision.
|
|
752
|
+
escalateUnreclaimedResumes(findings);
|
|
575
753
|
try {
|
|
576
754
|
findings.staleCleanup.workflowRuns = await recoverIncompleteRuns(getExecutorRegistry());
|
|
577
755
|
} catch {
|
|
@@ -854,6 +1032,8 @@ export async function runHeartbeatSweep(): Promise<void> {
|
|
|
854
1032
|
stalledTasks: [],
|
|
855
1033
|
autoFailedTasks: [],
|
|
856
1034
|
autoResumedTasks: [],
|
|
1035
|
+
pinnedResumes: [],
|
|
1036
|
+
escalatedReroutes: [],
|
|
857
1037
|
workerHealthFixes: [],
|
|
858
1038
|
autoAssigned: [],
|
|
859
1039
|
staleCleanup: {
|
|
@@ -891,6 +1071,12 @@ function logFindings(findings: HeartbeatFindings): void {
|
|
|
891
1071
|
if (findings.autoResumedTasks.length > 0) {
|
|
892
1072
|
parts.push(`auto_resumed=${findings.autoResumedTasks.length}`);
|
|
893
1073
|
}
|
|
1074
|
+
if (findings.pinnedResumes.length > 0) {
|
|
1075
|
+
parts.push(`pinned_resumes=${findings.pinnedResumes.length}`);
|
|
1076
|
+
}
|
|
1077
|
+
if (findings.escalatedReroutes.length > 0) {
|
|
1078
|
+
parts.push(`escalated_reroutes=${findings.escalatedReroutes.length}`);
|
|
1079
|
+
}
|
|
894
1080
|
if (findings.stalledTasks.length > 0) {
|
|
895
1081
|
parts.push(`stalled=${findings.stalledTasks.length}`);
|
|
896
1082
|
}
|