@desplega.ai/agent-swarm 1.53.0 → 1.54.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/openapi.json +22 -1
- package/package.json +1 -1
- package/plugin/commands/work-on-task.md +11 -5
- package/plugin/pi-skills/work-on-task/SKILL.md +11 -5
- package/src/be/db.ts +44 -6
- package/src/be/migrations/024_add_was_paused.sql +1 -0
- package/src/commands/runner.ts +46 -1
- package/src/heartbeat/heartbeat.ts +107 -11
- package/src/http/agents.ts +3 -0
- package/src/http/heartbeat.ts +43 -0
- package/src/http/index.ts +4 -2
- package/src/http/poll.ts +3 -0
- package/src/http/tasks.ts +27 -4
- package/src/linear/sync.ts +38 -11
- package/src/linear/templates.ts +17 -0
- package/src/providers/pi-mono-adapter.ts +25 -0
- package/src/scheduler/scheduler.ts +1 -0
- package/src/tests/context-snapshot.test.ts +127 -0
- package/src/tests/events-db.test.ts +0 -1
- package/src/tests/events-http.test.ts +10 -4
- package/src/tests/heartbeat.test.ts +148 -6
- package/src/tests/linear-webhook.test.ts +105 -4
- package/src/tests/workflow-hitl-routing.test.ts +545 -0
- package/src/tools/store-progress.ts +8 -2
- package/src/types.ts +3 -0
- package/src/workflows/engine.ts +59 -18
- package/src/workflows/recovery.ts +4 -4
- package/src/workflows/resume.ts +21 -15
package/README.md
CHANGED
|
@@ -58,6 +58,8 @@ Agent Swarm lets you run a team of AI coding agents that coordinate autonomously
|
|
|
58
58
|
- **Onboarding wizard** — Interactive CLI wizard (`agent-swarm onboard`) to set up a new swarm from scratch with presets, credential collection, and docker-compose generation
|
|
59
59
|
- **Skill system** — Reusable procedural knowledge: create, install, publish, and sync skills from GitHub with scope resolution (agent → swarm → global)
|
|
60
60
|
- **Human-in-the-Loop** — Workflow nodes that pause for human approval or input, with a dashboard UI for reviewing and responding to requests
|
|
61
|
+
- **MCP server management** — Register, install, and manage MCP servers for agents with scope cascade (agent → swarm → global) and auto-injection into worker containers
|
|
62
|
+
- **Context usage tracking** — Monitor context window utilization and compaction events per task with visual indicators in the dashboard
|
|
61
63
|
|
|
62
64
|
## Quick Start
|
|
63
65
|
|
package/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "Agent Swarm API",
|
|
5
|
-
"version": "1.
|
|
5
|
+
"version": "1.53.1",
|
|
6
6
|
"description": "Multi-agent orchestration API for Claude Code, Codex, and Gemini CLI. Enables task distribution, agent communication, and service discovery.\n\nMCP tools are documented separately in [MCP.md](./MCP.md)."
|
|
7
7
|
},
|
|
8
8
|
"servers": [
|
|
@@ -2424,6 +2424,27 @@
|
|
|
2424
2424
|
}
|
|
2425
2425
|
}
|
|
2426
2426
|
},
|
|
2427
|
+
"/api/heartbeat/sweep": {
|
|
2428
|
+
"post": {
|
|
2429
|
+
"summary": "Trigger an immediate heartbeat sweep",
|
|
2430
|
+
"tags": [
|
|
2431
|
+
"Heartbeat"
|
|
2432
|
+
],
|
|
2433
|
+
"security": [
|
|
2434
|
+
{
|
|
2435
|
+
"bearerAuth": []
|
|
2436
|
+
}
|
|
2437
|
+
],
|
|
2438
|
+
"responses": {
|
|
2439
|
+
"200": {
|
|
2440
|
+
"description": "Sweep completed successfully"
|
|
2441
|
+
},
|
|
2442
|
+
"401": {
|
|
2443
|
+
"description": "Unauthorized"
|
|
2444
|
+
}
|
|
2445
|
+
}
|
|
2446
|
+
}
|
|
2447
|
+
},
|
|
2427
2448
|
"/api/memory/index": {
|
|
2428
2449
|
"post": {
|
|
2429
2450
|
"summary": "Ingest content into memory system (async embedding)",
|
package/package.json
CHANGED
|
package/plugin/commands/work-on-task.md
CHANGED
|
@@ -21,11 +21,17 @@ Once you have the task details, you should:
|
|
|
21
21
|
- Use `memory-get` on any highly relevant results to get full details
|
|
22
22
|
- This step is NOT optional. Past learnings compound your effectiveness.
|
|
23
23
|
<!-- /claude-only -->
|
|
24
|
-
2.
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
24
|
+
2. **Check Installed Skills (REQUIRED):** Before researching or implementing, review your "Installed Skills" section in the system prompt:
|
|
25
|
+
- If any skill's description or trigger matches this task, invoke it via the `Skill` tool BEFORE doing manual research
|
|
26
|
+
- Skills contain pre-built, tested procedures that save context window and cost
|
|
27
|
+
- Example: task involves Linear → use `linear-interaction` skill, task involves email → use `agentmail-sending` skill
|
|
28
|
+
- Only proceed to manual research/web search if NO installed skill covers the task
|
|
29
|
+
- This step is NOT optional. Skipping it wastes context and money.
|
|
30
|
+
3. Figure out if you need to use any of the available commands to help you with your work (see below for available commands)
|
|
31
|
+
4. Use the `/todos` command to add a new todo item indicating you are starting to work on the task (e.g. "Work on task XXX: <short description>"). This will help on restarts, as it will be easier to remember what you were doing.
|
|
32
|
+
5. Call `store-progress` tool to mark the task as "in-progress" with a progress set to something like "Starting work on the task XXX, blah blah". Additionally use `/swarm-chat` command to notify the swarm, human and lead when applicable. Do not be too verbose, nor spammy.
|
|
33
|
+
6. Start working on the task, providing updates as needed by calling `store-progress` tool, use the `progress` field to indicate what you are doing.
|
|
34
|
+
7. Once you either done or in a dead-end, see the "Completion" section below.
|
|
29
35
|
|
|
30
36
|
### Available commands
|
|
31
37
|
|
|
package/plugin/pi-skills/work-on-task/SKILL.md
CHANGED
|
@@ -13,11 +13,17 @@ Once you get a task assigned, you need to immediately start working on it. To do
|
|
|
13
13
|
|
|
14
14
|
Once you have the task details, you should:
|
|
15
15
|
|
|
16
|
-
1.
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
16
|
+
1. **Check Installed Skills (REQUIRED):** Before researching or implementing, review your "Installed Skills" section in the system prompt:
|
|
17
|
+
- If any skill's description or trigger matches this task, invoke it via the `Skill` tool BEFORE doing manual research
|
|
18
|
+
- Skills contain pre-built, tested procedures that save context window and cost
|
|
19
|
+
- Example: task involves Linear → use `linear-interaction` skill, task involves email → use `agentmail-sending` skill
|
|
20
|
+
- Only proceed to manual research/web search if NO installed skill covers the task
|
|
21
|
+
- This step is NOT optional. Skipping it wastes context and money.
|
|
22
|
+
2. Figure out if you need to perform any research or planning before starting (see below)
|
|
23
|
+
3. Use the `/skill:todos` to add a new todo item indicating you are starting to work on the task (e.g. "Work on task XXX: <short description>"). This will help on restarts, as it will be easier to remember what you were doing.
|
|
24
|
+
4. Call `store-progress` tool to mark the task as "in-progress" with a progress set to something like "Starting work on the task XXX, blah blah". Additionally use `/skill:swarm-chat` to notify the swarm, human and lead when applicable. Do not be too verbose, nor spammy.
|
|
25
|
+
5. Start working on the task, providing updates as needed by calling `store-progress` tool, use the `progress` field to indicate what you are doing.
|
|
26
|
+
6. Once you either done or in a dead-end, see the "Completion" section below.
|
|
21
27
|
|
|
22
28
|
### Research and Planning
|
|
23
29
|
|
package/src/be/db.ts
CHANGED
|
@@ -728,6 +728,7 @@ type AgentTaskRow = {
|
|
|
728
728
|
peakContextPercent: number | null;
|
|
729
729
|
totalContextTokensUsed: number | null;
|
|
730
730
|
contextWindowSize: number | null;
|
|
731
|
+
was_paused: number;
|
|
731
732
|
};
|
|
732
733
|
|
|
733
734
|
function rowToAgentTask(row: AgentTaskRow): AgentTask {
|
|
@@ -781,6 +782,7 @@ function rowToAgentTask(row: AgentTaskRow): AgentTask {
|
|
|
781
782
|
failureReason: row.failureReason ?? undefined,
|
|
782
783
|
output: row.output ?? undefined,
|
|
783
784
|
progress: row.progress ?? undefined,
|
|
785
|
+
wasPaused: !!row.was_paused,
|
|
784
786
|
};
|
|
785
787
|
}
|
|
786
788
|
|
|
@@ -1509,6 +1511,7 @@ export function pauseTask(id: string): AgentTask | null {
|
|
|
1509
1511
|
.prepare<AgentTaskRow, [string]>(
|
|
1510
1512
|
`UPDATE agent_tasks
|
|
1511
1513
|
SET status = 'paused',
|
|
1514
|
+
was_paused = 1,
|
|
1512
1515
|
lastUpdatedAt = strftime('%Y-%m-%dT%H:%M:%fZ', 'now')
|
|
1513
1516
|
WHERE id = ? AND status = 'in_progress'
|
|
1514
1517
|
RETURNING *`,
|
|
@@ -1543,6 +1546,7 @@ export function resumeTask(taskId: string): AgentTask | null {
|
|
|
1543
1546
|
.prepare<AgentTaskRow, [string]>(
|
|
1544
1547
|
`UPDATE agent_tasks
|
|
1545
1548
|
SET status = 'in_progress',
|
|
1549
|
+
was_paused = 1,
|
|
1546
1550
|
lastUpdatedAt = strftime('%Y-%m-%dT%H:%M:%fZ', 'now')
|
|
1547
1551
|
WHERE id = ? AND status = 'paused'
|
|
1548
1552
|
RETURNING *`,
|
|
@@ -5599,6 +5603,18 @@ export function updateActiveSessionProviderSessionId(
|
|
|
5599
5603
|
return result.changes > 0;
|
|
5600
5604
|
}
|
|
5601
5605
|
|
|
5606
|
+
/**
|
|
5607
|
+
* Get the active session for a specific task.
|
|
5608
|
+
* Used by the heartbeat to cross-reference stalled tasks with worker sessions.
|
|
5609
|
+
*/
|
|
5610
|
+
export function getActiveSessionForTask(taskId: string): ActiveSession | null {
|
|
5611
|
+
return (
|
|
5612
|
+
getDb()
|
|
5613
|
+
.prepare<ActiveSession, [string]>("SELECT * FROM active_sessions WHERE taskId = ? LIMIT 1")
|
|
5614
|
+
.get(taskId) ?? null
|
|
5615
|
+
);
|
|
5616
|
+
}
|
|
5617
|
+
|
|
5602
5618
|
/**
|
|
5603
5619
|
* Reassociate session logs from a runner session to a real task ID.
|
|
5604
5620
|
* Used when a pool task is claimed — logs were stored under a random UUID,
|
|
@@ -6222,6 +6238,24 @@ export function getStepByIdempotencyKey(key: string): WorkflowRunStep | null {
|
|
|
6222
6238
|
return row ? rowToWorkflowRunStep(row) : null;
|
|
6223
6239
|
}
|
|
6224
6240
|
|
|
6241
|
+
export function getStepCountForNode(runId: string, nodeId: string): number {
|
|
6242
|
+
const row = getDb()
|
|
6243
|
+
.prepare<{ cnt: number }, [string, string]>(
|
|
6244
|
+
"SELECT COUNT(*) as cnt FROM workflow_run_steps WHERE runId = ? AND nodeId = ?",
|
|
6245
|
+
)
|
|
6246
|
+
.get(runId, nodeId);
|
|
6247
|
+
return row?.cnt ?? 0;
|
|
6248
|
+
}
|
|
6249
|
+
|
|
6250
|
+
export function getLatestStepForNode(runId: string, nodeId: string): WorkflowRunStep | null {
|
|
6251
|
+
const row = getDb()
|
|
6252
|
+
.prepare<WorkflowRunStepRow, [string, string]>(
|
|
6253
|
+
"SELECT * FROM workflow_run_steps WHERE runId = ? AND nodeId = ? ORDER BY startedAt DESC LIMIT 1",
|
|
6254
|
+
)
|
|
6255
|
+
.get(runId, nodeId);
|
|
6256
|
+
return row ? rowToWorkflowRunStep(row) : null;
|
|
6257
|
+
}
|
|
6258
|
+
|
|
6225
6259
|
// --- Workflow Version History ---
|
|
6226
6260
|
|
|
6227
6261
|
type WorkflowVersionRow = {
|
|
@@ -7855,6 +7889,13 @@ export function createContextSnapshot(input: CreateContextSnapshotInput): Contex
|
|
|
7855
7889
|
.run(input.contextPercent, input.taskId);
|
|
7856
7890
|
}
|
|
7857
7891
|
|
|
7892
|
+
// Keep totalContextTokensUsed up to date with the latest known value
|
|
7893
|
+
if (input.contextUsedTokens != null) {
|
|
7894
|
+
getDb()
|
|
7895
|
+
.prepare("UPDATE agent_tasks SET totalContextTokensUsed = ? WHERE id = ?")
|
|
7896
|
+
.run(input.contextUsedTokens, input.taskId);
|
|
7897
|
+
}
|
|
7898
|
+
|
|
7858
7899
|
if (input.eventType === "compaction") {
|
|
7859
7900
|
getDb()
|
|
7860
7901
|
.prepare(
|
|
@@ -7863,13 +7904,10 @@ export function createContextSnapshot(input: CreateContextSnapshotInput): Contex
|
|
|
7863
7904
|
.run(input.taskId);
|
|
7864
7905
|
}
|
|
7865
7906
|
|
|
7866
|
-
if (input.eventType === "completion") {
|
|
7907
|
+
if (input.eventType === "completion" && input.contextTotalTokens != null) {
|
|
7867
7908
|
getDb()
|
|
7868
|
-
.prepare(
|
|
7869
|
-
|
|
7870
|
-
WHERE id = ?`,
|
|
7871
|
-
)
|
|
7872
|
-
.run(input.contextUsedTokens ?? null, input.contextTotalTokens ?? null, input.taskId);
|
|
7909
|
+
.prepare("UPDATE agent_tasks SET contextWindowSize = ? WHERE id = ?")
|
|
7910
|
+
.run(input.contextTotalTokens, input.taskId);
|
|
7873
7911
|
}
|
|
7874
7912
|
|
|
7875
7913
|
return {
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ALTER TABLE agent_tasks ADD COLUMN was_paused INTEGER NOT NULL DEFAULT 0;
|
package/src/commands/runner.ts
CHANGED
|
@@ -305,6 +305,12 @@ export function humanizeToolName(name: string): string {
|
|
|
305
305
|
export function toolCallToProgress(toolName: string, args: unknown): string | null {
|
|
306
306
|
if (SKIP_PROGRESS_TOOLS.has(toolName)) return null;
|
|
307
307
|
|
|
308
|
+
// Normalize: pi-mono uses lowercase ("read"), Claude uses PascalCase ("Read")
|
|
309
|
+
const normalized =
|
|
310
|
+
toolName.startsWith("mcp__") || toolName.includes("_")
|
|
311
|
+
? toolName
|
|
312
|
+
: toolName.charAt(0).toUpperCase() + toolName.slice(1);
|
|
313
|
+
|
|
308
314
|
const a = args as Record<string, unknown>;
|
|
309
315
|
const shortPath = (p: unknown) => {
|
|
310
316
|
if (typeof p !== "string") return "";
|
|
@@ -313,7 +319,7 @@ export function toolCallToProgress(toolName: string, args: unknown): string | nu
|
|
|
313
319
|
return parts.length > 2 ? parts.slice(-2).join("/") : p;
|
|
314
320
|
};
|
|
315
321
|
|
|
316
|
-
switch (toolName) {
|
|
322
|
+
switch (normalized) {
|
|
317
323
|
case "Read":
|
|
318
324
|
return `📖 Reading ${shortPath(a.file_path)}`;
|
|
319
325
|
case "Edit":
|
|
@@ -1067,6 +1073,25 @@ async function cleanupActiveSessions(config: ApiConfig): Promise<void> {
|
|
|
1067
1073
|
}
|
|
1068
1074
|
}
|
|
1069
1075
|
|
|
1076
|
+
/** Trigger a heartbeat sweep via the API (lead startup self-check) */
|
|
1077
|
+
async function triggerHeartbeatSweep(config: ApiConfig): Promise<boolean> {
|
|
1078
|
+
try {
|
|
1079
|
+
const headers: Record<string, string> = {
|
|
1080
|
+
"Content-Type": "application/json",
|
|
1081
|
+
"X-Agent-ID": config.agentId,
|
|
1082
|
+
};
|
|
1083
|
+
if (config.apiKey) headers.Authorization = `Bearer ${config.apiKey}`;
|
|
1084
|
+
const resp = await fetch(`${config.apiUrl}/api/heartbeat/sweep`, {
|
|
1085
|
+
method: "POST",
|
|
1086
|
+
headers,
|
|
1087
|
+
});
|
|
1088
|
+
return resp.ok;
|
|
1089
|
+
} catch (err) {
|
|
1090
|
+
console.warn(`[runner] Failed to trigger heartbeat sweep: ${(err as Error).message}`);
|
|
1091
|
+
return false;
|
|
1092
|
+
}
|
|
1093
|
+
}
|
|
1094
|
+
|
|
1070
1095
|
/** Trigger types returned by the poll API */
|
|
1071
1096
|
interface Trigger {
|
|
1072
1097
|
type:
|
|
@@ -1991,6 +2016,9 @@ async function checkCompletedProcesses(
|
|
|
1991
2016
|
failureReason,
|
|
1992
2017
|
},
|
|
1993
2018
|
validator: (data) => data.exitCode === 0,
|
|
2019
|
+
// biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
|
|
2020
|
+
filter: ({}, ctx) => ctx.deps.length > 0,
|
|
2021
|
+
conditions: [{ timeout_ms: 3_600_000 }], // 1 hour: process runtime
|
|
1994
2022
|
});
|
|
1995
2023
|
|
|
1996
2024
|
// Commit channel activity cursors after successful processing
|
|
@@ -2703,6 +2731,17 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
|
|
|
2703
2731
|
}
|
|
2704
2732
|
// ========== END: Resume paused tasks ==========
|
|
2705
2733
|
|
|
2734
|
+
// ========== Lead startup self-check ==========
|
|
2735
|
+
if (isLead) {
|
|
2736
|
+
console.log(`[${role}] Running startup heartbeat sweep...`);
|
|
2737
|
+
const swept = await triggerHeartbeatSweep(apiConfig);
|
|
2738
|
+
if (swept) {
|
|
2739
|
+
console.log(`[${role}] Startup heartbeat sweep completed`);
|
|
2740
|
+
} else {
|
|
2741
|
+
console.warn(`[${role}] Startup heartbeat sweep failed (non-fatal)`);
|
|
2742
|
+
}
|
|
2743
|
+
}
|
|
2744
|
+
|
|
2706
2745
|
// Track last finished task check for leads (to avoid re-processing)
|
|
2707
2746
|
while (true) {
|
|
2708
2747
|
// Ping server on each iteration to keep status updated
|
|
@@ -2790,6 +2829,9 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
|
|
|
2790
2829
|
triggerType: trigger.type,
|
|
2791
2830
|
role,
|
|
2792
2831
|
},
|
|
2832
|
+
// biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
|
|
2833
|
+
filter: ({}, ctx) => ctx.deps.length > 0,
|
|
2834
|
+
conditions: [{ timeout_ms: 60_000 }], // 1 min: immediate after poll
|
|
2793
2835
|
});
|
|
2794
2836
|
}
|
|
2795
2837
|
|
|
@@ -3020,6 +3062,9 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
|
|
|
3020
3062
|
role,
|
|
3021
3063
|
model: taskModel,
|
|
3022
3064
|
},
|
|
3065
|
+
// biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
|
|
3066
|
+
filter: ({}, ctx) => ctx.deps.length > 0,
|
|
3067
|
+
conditions: [{ timeout_ms: 60_000 }], // 1 min: process startup
|
|
3023
3068
|
});
|
|
3024
3069
|
|
|
3025
3070
|
// Attach trigger metadata for logging
|
|
package/src/heartbeat/heartbeat.ts
CHANGED
|
@@ -2,6 +2,9 @@ import {
|
|
|
2
2
|
claimTask,
|
|
3
3
|
cleanupStaleSessions,
|
|
4
4
|
createTaskExtended,
|
|
5
|
+
deleteActiveSession,
|
|
6
|
+
failTask,
|
|
7
|
+
getActiveSessionForTask,
|
|
5
8
|
getActiveTaskCount,
|
|
6
9
|
getAllAgents,
|
|
7
10
|
getDb,
|
|
@@ -29,15 +32,25 @@ import "./templates";
|
|
|
29
32
|
/** Default heartbeat interval: 90 seconds */
|
|
30
33
|
const DEFAULT_INTERVAL_MS = Number(process.env.HEARTBEAT_INTERVAL_MS) || 90_000;
|
|
31
34
|
|
|
32
|
-
/** Stall threshold: tasks
|
|
35
|
+
/** Stall threshold: tasks with fresh worker heartbeat but no task update for this many minutes */
|
|
33
36
|
const STALL_THRESHOLD_MINUTES = Number(process.env.HEARTBEAT_STALL_THRESHOLD_MIN) || 30;
|
|
34
37
|
|
|
38
|
+
/** Stall threshold: tasks with no active session (worker clearly dead) */
|
|
39
|
+
const STALL_THRESHOLD_NO_SESSION_MIN = Number(process.env.HEARTBEAT_STALL_NO_SESSION_MIN) || 5;
|
|
40
|
+
|
|
41
|
+
/** Stall threshold: tasks with stale worker heartbeat */
|
|
42
|
+
const STALL_THRESHOLD_STALE_HEARTBEAT_MIN = Number(process.env.HEARTBEAT_STALL_STALE_HB_MIN) || 15;
|
|
43
|
+
|
|
35
44
|
/** Stale resource cleanup threshold (minutes) */
|
|
36
45
|
const STALE_CLEANUP_THRESHOLD_MINUTES = Number(process.env.HEARTBEAT_STALE_CLEANUP_MIN) || 30;
|
|
37
46
|
|
|
38
47
|
/** Max pool tasks to auto-assign per sweep */
|
|
39
48
|
const MAX_AUTO_ASSIGN_PER_SWEEP = Number(process.env.HEARTBEAT_MAX_AUTO_ASSIGN) || 5;
|
|
40
49
|
|
|
50
|
+
/** Escalation cooldown: minimum time between escalations for the same task set (ms) */
|
|
51
|
+
const ESCALATION_COOLDOWN_MS =
|
|
52
|
+
Number(process.env.HEARTBEAT_ESCALATION_COOLDOWN_MS) || 15 * 60 * 1000;
|
|
53
|
+
|
|
41
54
|
const HEARTBEAT_ESCALATION_MARKER = "[heartbeat-escalation]";
|
|
42
55
|
|
|
43
56
|
// ============================================================================
|
|
@@ -46,6 +59,7 @@ const HEARTBEAT_ESCALATION_MARKER = "[heartbeat-escalation]";
|
|
|
46
59
|
|
|
47
60
|
export interface HeartbeatFindings {
|
|
48
61
|
stalledTasks: AgentTask[];
|
|
62
|
+
autoFailedTasks: Array<{ taskId: string; agentId: string; reason: string }>;
|
|
49
63
|
workerHealthFixes: Array<{ agentId: string; oldStatus: string; newStatus: string }>;
|
|
50
64
|
autoAssigned: Array<{ taskId: string; agentId: string }>;
|
|
51
65
|
staleCleanup: {
|
|
@@ -66,6 +80,9 @@ export interface HeartbeatFindings {
|
|
|
66
80
|
let heartbeatInterval: ReturnType<typeof setInterval> | null = null;
|
|
67
81
|
let isSweeping = false;
|
|
68
82
|
|
|
83
|
+
/** Tracks last escalation time per escalation key to prevent spam */
|
|
84
|
+
const lastEscalationTime: Map<string, number> = new Map();
|
|
85
|
+
|
|
69
86
|
// ============================================================================
|
|
70
87
|
// Tier 1: Preflight Gate
|
|
71
88
|
// ============================================================================
|
|
@@ -106,6 +123,7 @@ export function preflightGate(): boolean {
|
|
|
106
123
|
export async function codeLevelTriage(): Promise<HeartbeatFindings> {
|
|
107
124
|
const findings: HeartbeatFindings = {
|
|
108
125
|
stalledTasks: [],
|
|
126
|
+
autoFailedTasks: [],
|
|
109
127
|
workerHealthFixes: [],
|
|
110
128
|
autoAssigned: [],
|
|
111
129
|
staleCleanup: {
|
|
@@ -118,8 +136,8 @@ export async function codeLevelTriage(): Promise<HeartbeatFindings> {
|
|
|
118
136
|
escalationNeeded: false,
|
|
119
137
|
};
|
|
120
138
|
|
|
121
|
-
// 1. Detect stalled tasks
|
|
122
|
-
|
|
139
|
+
// 1. Detect and remediate stalled tasks (tiered: auto-fail dead workers, escalate ambiguous)
|
|
140
|
+
detectAndRemediateStalledTasks(findings);
|
|
123
141
|
|
|
124
142
|
// 2. Check and fix worker health
|
|
125
143
|
checkWorkerHealth(findings);
|
|
@@ -137,11 +155,72 @@ export async function codeLevelTriage(): Promise<HeartbeatFindings> {
|
|
|
137
155
|
}
|
|
138
156
|
|
|
139
157
|
/**
|
|
140
|
-
*
|
|
158
|
+
* Tiered stall detection and auto-remediation.
|
|
159
|
+
*
|
|
160
|
+
* Cross-checks stalled tasks with active_sessions to determine severity:
|
|
161
|
+
* - No active session → worker is dead → auto-fail (5 min threshold)
|
|
162
|
+
* - Stale session heartbeat → worker likely crashed → auto-fail (15 min threshold)
|
|
163
|
+
* - Fresh session heartbeat → worker alive but task stale → escalate to lead (30 min threshold)
|
|
141
164
|
*/
|
|
142
|
-
function
|
|
143
|
-
|
|
144
|
-
|
|
165
|
+
function detectAndRemediateStalledTasks(findings: HeartbeatFindings): void {
|
|
166
|
+
// Use the shortest threshold to catch all potentially stalled tasks
|
|
167
|
+
const candidates = getStalledInProgressTasks(STALL_THRESHOLD_NO_SESSION_MIN);
|
|
168
|
+
|
|
169
|
+
for (const task of candidates) {
|
|
170
|
+
if (!task.agentId) continue; // Unassigned tasks can't be stalled
|
|
171
|
+
|
|
172
|
+
const session = getActiveSessionForTask(task.id);
|
|
173
|
+
const taskAgeMs = Date.now() - new Date(task.lastUpdatedAt).getTime();
|
|
174
|
+
|
|
175
|
+
if (!session) {
|
|
176
|
+
// Case A: No active session — worker is dead
|
|
177
|
+
if (taskAgeMs >= STALL_THRESHOLD_NO_SESSION_MIN * 60 * 1000) {
|
|
178
|
+
const reason =
|
|
179
|
+
"Auto-failed by heartbeat: worker session not found (no active session for task)";
|
|
180
|
+
const failed = failTask(task.id, reason);
|
|
181
|
+
if (failed) {
|
|
182
|
+
findings.autoFailedTasks.push({ taskId: task.id, agentId: task.agentId, reason });
|
|
183
|
+
console.log(`[Heartbeat] Auto-failed task ${task.id.slice(0, 8)} — no active session`);
|
|
184
|
+
|
|
185
|
+
// Fix agent status if no other active tasks
|
|
186
|
+
const remaining = getActiveTaskCount(task.agentId);
|
|
187
|
+
if (remaining === 0) {
|
|
188
|
+
updateAgentStatus(task.agentId, "idle");
|
|
189
|
+
}
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
} else {
|
|
193
|
+
const sessionHeartbeatAgeMs = Date.now() - new Date(session.lastHeartbeatAt).getTime();
|
|
194
|
+
const isStaleHeartbeat =
|
|
195
|
+
sessionHeartbeatAgeMs >= STALL_THRESHOLD_STALE_HEARTBEAT_MIN * 60 * 1000;
|
|
196
|
+
|
|
197
|
+
if (isStaleHeartbeat) {
|
|
198
|
+
// Case B: Session exists but heartbeat is stale — worker likely crashed
|
|
199
|
+
if (taskAgeMs >= STALL_THRESHOLD_STALE_HEARTBEAT_MIN * 60 * 1000) {
|
|
200
|
+
const reason =
|
|
201
|
+
"Auto-failed by heartbeat: worker session heartbeat is stale (likely crashed)";
|
|
202
|
+
const failed = failTask(task.id, reason);
|
|
203
|
+
if (failed) {
|
|
204
|
+
findings.autoFailedTasks.push({ taskId: task.id, agentId: task.agentId, reason });
|
|
205
|
+
deleteActiveSession(task.id);
|
|
206
|
+
console.log(
|
|
207
|
+
`[Heartbeat] Auto-failed task ${task.id.slice(0, 8)} — stale session heartbeat`,
|
|
208
|
+
);
|
|
209
|
+
|
|
210
|
+
const remaining = getActiveTaskCount(task.agentId);
|
|
211
|
+
if (remaining === 0) {
|
|
212
|
+
updateAgentStatus(task.agentId, "idle");
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
} else {
|
|
217
|
+
// Case C: Session exists and heartbeat is fresh — ambiguous
|
|
218
|
+
if (taskAgeMs >= STALL_THRESHOLD_MINUTES * 60 * 1000) {
|
|
219
|
+
findings.stalledTasks.push(task);
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
}
|
|
145
224
|
}
|
|
146
225
|
|
|
147
226
|
/**
|
|
@@ -232,15 +311,13 @@ async function cleanupStaleResources(findings: HeartbeatFindings): Promise<void>
|
|
|
232
311
|
|
|
233
312
|
/**
|
|
234
313
|
* Evaluate whether findings require escalation to a Claude session (lead agent).
|
|
235
|
-
* Only escalate for
|
|
314
|
+
* Only escalate for ambiguous stalls (worker alive but task not updating).
|
|
236
315
|
*/
|
|
237
316
|
function evaluateEscalation(findings: HeartbeatFindings): void {
|
|
238
|
-
// Stalled tasks are ambiguous — the task might be actively worked on
|
|
239
|
-
// but the worker just hasn't called store-progress recently
|
|
240
317
|
if (findings.stalledTasks.length > 0) {
|
|
241
318
|
findings.escalationNeeded = true;
|
|
242
319
|
const taskIds = findings.stalledTasks.map((t) => t.id.slice(0, 8)).join(", ");
|
|
243
|
-
findings.escalationReason = `${findings.stalledTasks.length} task(s) stalled (no update for ${STALL_THRESHOLD_MINUTES}+ min): ${taskIds}`;
|
|
320
|
+
findings.escalationReason = `${findings.stalledTasks.length} task(s) stalled with active worker (no task update for ${STALL_THRESHOLD_MINUTES}+ min): ${taskIds}`;
|
|
244
321
|
}
|
|
245
322
|
}
|
|
246
323
|
|
|
@@ -255,6 +332,13 @@ function escalateToLead(findings: HeartbeatFindings): void {
|
|
|
255
332
|
}
|
|
256
333
|
|
|
257
334
|
const escalationKey = buildEscalationKey(findings);
|
|
335
|
+
|
|
336
|
+
// Cooldown check — prevent repeated escalations for the same task set
|
|
337
|
+
const lastTime = lastEscalationTime.get(escalationKey);
|
|
338
|
+
if (lastTime && Date.now() - lastTime < ESCALATION_COOLDOWN_MS) {
|
|
339
|
+
return;
|
|
340
|
+
}
|
|
341
|
+
|
|
258
342
|
if (hasActiveEscalationTask(lead.id, escalationKey)) {
|
|
259
343
|
return;
|
|
260
344
|
}
|
|
@@ -294,6 +378,7 @@ function escalateToLead(findings: HeartbeatFindings): void {
|
|
|
294
378
|
priority: 70,
|
|
295
379
|
});
|
|
296
380
|
|
|
381
|
+
lastEscalationTime.set(escalationKey, Date.now());
|
|
297
382
|
console.log(`[Heartbeat] Created triage task for lead ${lead.name}`);
|
|
298
383
|
}
|
|
299
384
|
|
|
@@ -337,6 +422,7 @@ export async function runHeartbeatSweep(): Promise<void> {
|
|
|
337
422
|
if (!preflightGate()) {
|
|
338
423
|
const cleanupOnlyFindings: HeartbeatFindings = {
|
|
339
424
|
stalledTasks: [],
|
|
425
|
+
autoFailedTasks: [],
|
|
340
426
|
workerHealthFixes: [],
|
|
341
427
|
autoAssigned: [],
|
|
342
428
|
staleCleanup: {
|
|
@@ -374,6 +460,9 @@ export async function runHeartbeatSweep(): Promise<void> {
|
|
|
374
460
|
function logFindings(findings: HeartbeatFindings): void {
|
|
375
461
|
const parts: string[] = [];
|
|
376
462
|
|
|
463
|
+
if (findings.autoFailedTasks.length > 0) {
|
|
464
|
+
parts.push(`auto_failed=${findings.autoFailedTasks.length}`);
|
|
465
|
+
}
|
|
377
466
|
if (findings.stalledTasks.length > 0) {
|
|
378
467
|
parts.push(`stalled=${findings.stalledTasks.length}`);
|
|
379
468
|
}
|
|
@@ -432,3 +521,10 @@ export function stopHeartbeat(): void {
|
|
|
432
521
|
console.log("[Heartbeat] Stopped");
|
|
433
522
|
}
|
|
434
523
|
}
|
|
524
|
+
|
|
525
|
+
/**
|
|
526
|
+
* Reset escalation cooldown state. Exported for testing only.
|
|
527
|
+
*/
|
|
528
|
+
export function resetEscalationCooldowns(): void {
|
|
529
|
+
lastEscalationTime.clear();
|
|
530
|
+
}
|
package/src/http/agents.ts
CHANGED
|
@@ -205,6 +205,9 @@ export async function handleAgentRegister(
|
|
|
205
205
|
// Validates that registered happened before reconnected
|
|
206
206
|
return ctx.deps.length > 0;
|
|
207
207
|
},
|
|
208
|
+
// biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
|
|
209
|
+
filter: ({}, ctx) => ctx.deps.length > 0,
|
|
210
|
+
conditions: [{ timeout_ms: 86_400_000 }], // 1 day: agents may be offline for extended periods
|
|
208
211
|
});
|
|
209
212
|
}
|
|
210
213
|
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import type { IncomingMessage, ServerResponse } from "node:http";
|
|
2
|
+
import { runHeartbeatSweep } from "../heartbeat/heartbeat";
|
|
3
|
+
import { route } from "./route-def";
|
|
4
|
+
import { json } from "./utils";
|
|
5
|
+
|
|
6
|
+
// ─── Route Definitions ───────────────────────────────────────────────────────
|
|
7
|
+
|
|
8
|
+
const triggerSweep = route({
|
|
9
|
+
method: "post",
|
|
10
|
+
path: "/api/heartbeat/sweep",
|
|
11
|
+
pattern: ["api", "heartbeat", "sweep"],
|
|
12
|
+
summary: "Trigger an immediate heartbeat sweep",
|
|
13
|
+
tags: ["Heartbeat"],
|
|
14
|
+
responses: {
|
|
15
|
+
200: { description: "Sweep completed successfully" },
|
|
16
|
+
401: { description: "Unauthorized" },
|
|
17
|
+
},
|
|
18
|
+
auth: { apiKey: true },
|
|
19
|
+
});
|
|
20
|
+
|
|
21
|
+
// ─── Handler ─────────────────────────────────────────────────────────────────
|
|
22
|
+
|
|
23
|
+
export async function handleHeartbeat(
|
|
24
|
+
req: IncomingMessage,
|
|
25
|
+
res: ServerResponse,
|
|
26
|
+
pathSegments: string[],
|
|
27
|
+
): Promise<boolean> {
|
|
28
|
+
if (triggerSweep.match(req.method, pathSegments)) {
|
|
29
|
+
const parsed = await triggerSweep.parse(req, res, pathSegments, new URLSearchParams());
|
|
30
|
+
if (!parsed) return true;
|
|
31
|
+
|
|
32
|
+
try {
|
|
33
|
+
await runHeartbeatSweep();
|
|
34
|
+
json(res, { success: true, message: "Heartbeat sweep completed" });
|
|
35
|
+
} catch (err) {
|
|
36
|
+
const message = err instanceof Error ? err.message : "Unknown error during heartbeat sweep";
|
|
37
|
+
json(res, { success: false, error: message }, 500);
|
|
38
|
+
}
|
|
39
|
+
return true;
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
return false;
|
|
43
|
+
}
|
package/src/http/index.ts
CHANGED
|
@@ -4,7 +4,7 @@ import {
|
|
|
4
4
|
type Server,
|
|
5
5
|
type ServerResponse,
|
|
6
6
|
} from "node:http";
|
|
7
|
-
import {
|
|
7
|
+
import { ensure, initialize } from "@desplega.ai/business-use";
|
|
8
8
|
import type { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
|
|
9
9
|
import { getEnabledCapabilities, hasCapability } from "@/server";
|
|
10
10
|
import { initAgentMail } from "../agentmail";
|
|
@@ -25,6 +25,7 @@ import { handleDbQuery } from "./db-query";
|
|
|
25
25
|
import { handleEcosystem } from "./ecosystem";
|
|
26
26
|
import { handleEpics } from "./epics";
|
|
27
27
|
import { handleEvents } from "./events";
|
|
28
|
+
import { handleHeartbeat } from "./heartbeat";
|
|
28
29
|
import { handleMcp } from "./mcp";
|
|
29
30
|
import { handleMcpServers } from "./mcp-servers";
|
|
30
31
|
import { handleMemory } from "./memory";
|
|
@@ -120,6 +121,7 @@ const httpServer = createHttpServer(async (req, res) => {
|
|
|
120
121
|
() => handleSkills(req, res, pathSegments, queryParams, myAgentId),
|
|
121
122
|
() => handleMcpServers(req, res, pathSegments, queryParams),
|
|
122
123
|
() => handleMemory(req, res, pathSegments, myAgentId),
|
|
124
|
+
() => handleHeartbeat(req, res, pathSegments),
|
|
123
125
|
() => handleEvents(req, res, pathSegments, queryParams, myAgentId),
|
|
124
126
|
() => handleMcp(req, res, transports),
|
|
125
127
|
];
|
|
@@ -186,7 +188,7 @@ httpServer
|
|
|
186
188
|
.listen(port, async () => {
|
|
187
189
|
console.log(`MCP HTTP server running on http://localhost:${port}/mcp`);
|
|
188
190
|
|
|
189
|
-
|
|
191
|
+
ensure({
|
|
190
192
|
id: "listen",
|
|
191
193
|
flow: "api",
|
|
192
194
|
runId: globalState.__runId!,
|
package/src/http/poll.ts
CHANGED
|
@@ -141,6 +141,9 @@ export async function handlePoll(
|
|
|
141
141
|
previousStatus: pendingTask.status,
|
|
142
142
|
},
|
|
143
143
|
validator: (data) => data.previousStatus === "pending",
|
|
144
|
+
// biome-ignore lint/correctness/noEmptyPattern: data unused, ctx needed
|
|
145
|
+
filter: ({}, ctx) => ctx.deps.length > 0,
|
|
146
|
+
conditions: [{ timeout_ms: 300_000 }], // 5 min: polling interval + queue wait
|
|
144
147
|
});
|
|
145
148
|
|
|
146
149
|
return {
|