@desplega.ai/agent-swarm 1.89.0 → 1.90.0

This diff compares the contents of two publicly available package versions, each released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -1,5 +1,6 @@
1
1
  import {
2
2
  assignUnassignedTaskPending,
3
+ backfillSupersedeTaskResumeTaskId,
3
4
  cleanupStaleSessions,
4
5
  createTaskExtended,
5
6
  deleteActiveSession,
@@ -25,7 +26,7 @@ import {
25
26
  updateAgentStatus,
26
27
  } from "../be/db";
27
28
  import { resolveTemplate } from "../prompts/resolver";
28
- import { createResumeFollowUp } from "../tasks/worker-follow-up";
29
+ import { createResumeFollowUp, getNextResumeGeneration } from "../tasks/worker-follow-up";
29
30
  import type { AgentTask } from "../types";
30
31
  import { getExecutorRegistry } from "../workflows";
31
32
  import { recoverIncompleteRuns } from "../workflows/recovery";
@@ -60,6 +61,11 @@ const STALE_CLEANUP_THRESHOLD_MINUTES = Number(process.env.HEARTBEAT_STALE_CLEAN
60
61
  /** Max pool tasks to auto-assign per sweep */
61
62
  const MAX_AUTO_ASSIGN_PER_SWEEP = Number(process.env.HEARTBEAT_MAX_AUTO_ASSIGN) || 5;
62
63
 
64
+ /** Max crash-recovery resume generations before failing for lead triage */
65
+ export const MAX_RESUME_GENERATIONS = Number(process.env.HEARTBEAT_MAX_RESUME_GENERATIONS) || 3;
66
+
67
+ export const RESUME_BUDGET_EXHAUSTED_REASON = "resume_budget_exhausted";
68
+
63
69
  /** Heartbeat checklist interval: how often to check HEARTBEAT.md (default: 30 min) */
64
70
  const HEARTBEAT_CHECKLIST_INTERVAL_MS =
65
71
  Number(process.env.HEARTBEAT_CHECKLIST_INTERVAL_MS) || 30 * 60 * 1000;
@@ -98,10 +104,17 @@ export interface HeartbeatFindings {
98
104
  let heartbeatInterval: ReturnType<typeof setInterval> | null = null;
99
105
  let checklistInterval: ReturnType<typeof setInterval> | null = null;
100
106
  let isSweeping = false;
107
+ let beforeHeartbeatSupersedeForTests: ((task: AgentTask) => void) | null = null;
101
108
 
102
109
  /** Tasks auto-failed during the reboot sweep, consumed by boot triage */
103
110
  let rebootAffectedTasks: Array<{ original: AgentTask; retryTaskId: string | null }> = [];
104
111
 
112
+ export function setBeforeHeartbeatSupersedeForTests(
113
+ hook: ((task: AgentTask) => void) | null,
114
+ ): void {
115
+ beforeHeartbeatSupersedeForTests = hook;
116
+ }
117
+
105
118
  // ============================================================================
106
119
  // Tier 1: Preflight Gate
107
120
  // ============================================================================
@@ -300,16 +313,40 @@ function remediateCrashedWorkerTask(
300
313
  return;
301
314
  }
302
315
 
303
- // Supersede + resume path.
316
+ const nextResumeGeneration = getNextResumeGeneration(task);
317
+ if (nextResumeGeneration > MAX_RESUME_GENERATIONS) {
318
+ const failed = failTask(task.id, RESUME_BUDGET_EXHAUSTED_REASON);
319
+ if (failed) {
320
+ findings.autoFailedTasks.push({
321
+ taskId: task.id,
322
+ agentId: task.agentId,
323
+ reason: RESUME_BUDGET_EXHAUSTED_REASON,
324
+ });
325
+ if (opts.cleanupActiveSession) deleteActiveSession(task.id);
326
+ console.warn(
327
+ `[Heartbeat] Auto-failed task ${task.id.slice(0, 8)} — ${RESUME_BUDGET_EXHAUSTED_REASON} (${opts.shortLabel})`,
328
+ );
329
+ const remaining = getActiveTaskCount(task.agentId);
330
+ if (remaining === 0) updateAgentStatus(task.agentId, "idle");
331
+ }
332
+ return;
333
+ }
334
+
335
+ beforeHeartbeatSupersedeForTests?.(task);
336
+
304
337
  const superseded = supersedeTask(task.id, {
305
338
  reason: opts.supersedeReason,
306
339
  resumeTaskId: null,
307
340
  });
308
- if (!superseded) return;
341
+ if (!superseded) {
342
+ return;
343
+ }
309
344
 
310
345
  const resume = createResumeFollowUp({ parentId: task.id, reason: "crash_recovery" });
311
346
 
312
347
  if (resume.kind === "created") {
348
+ backfillSupersedeTaskResumeTaskId(task.id, resume.task.id);
349
+
313
350
  findings.autoResumedTasks.push({
314
351
  taskId: task.id,
315
352
  resumeTaskId: resume.task.id,
@@ -320,10 +357,20 @@ function remediateCrashedWorkerTask(
320
357
  `[Heartbeat] Auto-superseded task ${task.id.slice(0, 8)} — created resume ${resume.task.id.slice(0, 8)} (${opts.shortLabel})`,
321
358
  );
322
359
  } else {
323
- // `workflow-skip` is unreachable here (handled above). `skipped` covers
324
- // parent-not-found / lead-not-found edge cases — just log for operators.
325
- console.log(
326
- `[Heartbeat] Task ${task.id.slice(0, 8)} superseded but no resume created (${
360
+ const reason =
361
+ resume.kind === "skipped"
362
+ ? `resume_creation_skipped_${resume.reason}`
363
+ : "resume_creation_skipped_workflow";
364
+ const failed = failTask(task.id, reason);
365
+ if (failed) {
366
+ findings.autoFailedTasks.push({
367
+ taskId: task.id,
368
+ agentId: task.agentId,
369
+ reason,
370
+ });
371
+ }
372
+ console.warn(
373
+ `[Heartbeat] Task ${task.id.slice(0, 8)} failed because no resume was created (${
327
374
  resume.kind === "skipped" ? resume.reason : "workflow-skip"
328
375
  })`,
329
376
  );
package/src/http/tasks.ts CHANGED
@@ -2,6 +2,7 @@ import type { IncomingMessage, ServerResponse } from "node:http";
2
2
  import { ensure } from "@desplega.ai/business-use";
3
3
  import { z } from "zod";
4
4
  import {
5
+ backfillSupersedeTaskResumeTaskId,
5
6
  cancelTask,
6
7
  completeTask,
7
8
  failTask,
@@ -905,6 +906,7 @@ export async function handleTasks(
905
906
  }
906
907
 
907
908
  const resumeTaskId = followUp.task.id;
909
+ backfillSupersedeTaskResumeTaskId(parsed.params.id, resumeTaskId);
908
910
 
909
911
  ensure({
910
912
  id: "task.superseded",
@@ -22,6 +22,20 @@ export const WORKER_LIVENESS_WINDOW_SECONDS = Number(
22
22
  process.env.WORKER_LIVENESS_WINDOW_SECONDS || "30",
23
23
  );
24
24
 
25
+ export const RESUME_GENERATION_TAG_PREFIX = "resume-generation:";
26
+
27
+ export function getResumeGeneration(task: Pick<AgentTask, "tags">): number {
28
+ const tag = task.tags.find((value) => value.startsWith(RESUME_GENERATION_TAG_PREFIX));
29
+ if (!tag) return 0;
30
+
31
+ const parsed = Number(tag.slice(RESUME_GENERATION_TAG_PREFIX.length));
32
+ return Number.isInteger(parsed) && parsed > 0 ? parsed : 0;
33
+ }
34
+
35
+ export function getNextResumeGeneration(parent: Pick<AgentTask, "tags">): number {
36
+ return getResumeGeneration(parent) + 1;
37
+ }
38
+
25
39
  function attachmentPointer(a: TaskAttachment): string {
26
40
  switch (a.kind) {
27
41
  case "url":
@@ -205,7 +219,11 @@ export function createResumeFollowUp(args: {
205
219
  ].join("\n");
206
220
 
207
221
  const priority = Math.min(100, (parent.priority ?? 50) + 10);
208
- const tags = ["auto-resume", `reason:${args.reason}`];
222
+ const tags = [
223
+ "auto-resume",
224
+ `reason:${args.reason}`,
225
+ `${RESUME_GENERATION_TAG_PREFIX}${getNextResumeGeneration(parent)}`,
226
+ ];
209
227
 
210
228
  // Identity-shaped fields (dir, VCS provider/repo/number/url/etc.,
211
229
  // outputSchema, slack channel/thread/user, agentmail, mention, contextKey,
@@ -10,16 +10,29 @@ import { afterAll, beforeAll, beforeEach, describe, expect, test } from "bun:tes
10
10
  import { unlink } from "node:fs/promises";
11
11
  import {
12
12
  closeDb,
13
+ completeTask,
13
14
  createAgent,
14
15
  createTaskExtended,
15
16
  getChildTasks,
16
17
  getDb,
18
+ getLogsByTaskId,
17
19
  getTaskById,
18
20
  initDb,
19
21
  insertActiveSession,
20
22
  startTask,
21
23
  } from "../be/db";
22
- import { codeLevelTriage } from "../heartbeat/heartbeat";
24
+ import {
25
+ createTrackerSync,
26
+ getTrackerSync,
27
+ getTrackerSyncByExternalId,
28
+ } from "../be/db-queries/tracker";
29
+ import {
30
+ codeLevelTriage,
31
+ MAX_RESUME_GENERATIONS,
32
+ RESUME_BUDGET_EXHAUSTED_REASON,
33
+ setBeforeHeartbeatSupersedeForTests,
34
+ } from "../heartbeat/heartbeat";
35
+ import { RESUME_GENERATION_TAG_PREFIX } from "../tasks/worker-follow-up";
23
36
 
24
37
  const TEST_DB_PATH = "./test-heartbeat-supersede-resume.sqlite";
25
38
 
@@ -46,6 +59,8 @@ describe("Heartbeat — supersede + resume (DES-523)", () => {
46
59
  });
47
60
 
48
61
  beforeEach(() => {
62
+ setBeforeHeartbeatSupersedeForTests(null);
63
+ getDb().run("DELETE FROM tracker_sync");
49
64
  getDb().run("DELETE FROM agent_tasks");
50
65
  getDb().run("DELETE FROM agents");
51
66
  getDb().run("DELETE FROM active_sessions");
@@ -81,7 +96,82 @@ describe("Heartbeat — supersede + resume (DES-523)", () => {
81
96
  expect(resume.taskType).toBe("resume");
82
97
  expect(resume.tags).toContain("auto-resume");
83
98
  expect(resume.tags).toContain("reason:crash_recovery");
99
+ expect(resume.tags).toContain(`${RESUME_GENERATION_TAG_PREFIX}1`);
84
100
  expect(resume.id).toBe(findings.autoResumedTasks[0]!.resumeTaskId);
101
+
102
+ const supersedeLog = getLogsByTaskId(parent.id).find(
103
+ (log) => log.eventType === "task_superseded",
104
+ );
105
+ expect(supersedeLog).toBeTruthy();
106
+ const metadata = JSON.parse(supersedeLog!.metadata ?? "{}") as { resumeTaskId?: string };
107
+ expect(metadata.resumeTaskId).toBe(resume.id);
108
+ });
109
+
110
+ test("Case A: crash-recovery resume chain stops at the generation cap", async () => {
111
+ const agent = createAgent({ name: "dead-resume-worker", isLead: false, status: "busy" });
112
+ const parent = createTaskExtended("Resume at generation cap", {
113
+ agentId: agent.id,
114
+ taskType: "resume",
115
+ tags: [
116
+ "auto-resume",
117
+ "reason:crash_recovery",
118
+ `${RESUME_GENERATION_TAG_PREFIX}${MAX_RESUME_GENERATIONS}`,
119
+ ],
120
+ });
121
+ startTask(parent.id);
122
+
123
+ const oldTime = new Date(Date.now() - 10 * 60 * 1000).toISOString();
124
+ getDb().run("UPDATE agent_tasks SET lastUpdatedAt = ? WHERE id = ?", [oldTime, parent.id]);
125
+
126
+ const findings = await codeLevelTriage();
127
+
128
+ expect(findings.autoResumedTasks.length).toBe(0);
129
+ expect(findings.autoFailedTasks.length).toBe(1);
130
+ expect(findings.autoFailedTasks[0]!.taskId).toBe(parent.id);
131
+ expect(findings.autoFailedTasks[0]!.reason).toBe(RESUME_BUDGET_EXHAUSTED_REASON);
132
+
133
+ const updatedParent = getTaskById(parent.id);
134
+ expect(updatedParent?.status).toBe("failed");
135
+ expect(updatedParent?.failureReason).toBe(RESUME_BUDGET_EXHAUSTED_REASON);
136
+ expect(getChildTasks(parent.id).length).toBe(0);
137
+ });
138
+
139
+ test("Case A: supersede race does not create a resume child or repoint tracker_sync", async () => {
140
+ const agent = createAgent({ name: "dead-worker-race", isLead: false, status: "busy" });
141
+ const parent = createTaskExtended("Tracked parent that finishes during heartbeat", {
142
+ agentId: agent.id,
143
+ });
144
+ startTask(parent.id);
145
+
146
+ createTrackerSync({
147
+ provider: "linear",
148
+ entityType: "task",
149
+ swarmId: parent.id,
150
+ externalId: "linear-race-issue",
151
+ externalIdentifier: "ENG-637",
152
+ externalUrl: "https://linear.app/test/issue/ENG-637",
153
+ });
154
+
155
+ const oldTime = new Date(Date.now() - 10 * 60 * 1000).toISOString();
156
+ getDb().run("UPDATE agent_tasks SET lastUpdatedAt = ? WHERE id = ?", [oldTime, parent.id]);
157
+
158
+ setBeforeHeartbeatSupersedeForTests((task) => {
159
+ expect(task.id).toBe(parent.id);
160
+ completeTask(parent.id, "finished by racing worker");
161
+ });
162
+
163
+ const findings = await codeLevelTriage();
164
+
165
+ expect(findings.autoResumedTasks.length).toBe(0);
166
+ expect(findings.autoFailedTasks.length).toBe(0);
167
+
168
+ const updatedParent = getTaskById(parent.id);
169
+ expect(updatedParent?.status).toBe("completed");
170
+ expect(getChildTasks(parent.id).length).toBe(0);
171
+
172
+ expect(getTrackerSync("linear", "task", parent.id)).not.toBeNull();
173
+ const byExternal = getTrackerSyncByExternalId("linear", "task", "linear-race-issue");
174
+ expect(byExternal?.swarmId).toBe(parent.id);
85
175
  });
86
176
 
87
177
  // --------------------------------------------------------------------------