@desplega.ai/agent-swarm 1.55.0 → 1.56.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/be/db.ts +9 -1
- package/src/be/migrations/025_workflow_run_cancelled_status.sql +0 -2
- package/src/be/migrations/027_heartbeat_md.sql +1 -0
- package/src/be/migrations/runner.ts +8 -0
- package/src/commands/runner.ts +24 -1
- package/src/heartbeat/heartbeat.ts +201 -92
- package/src/heartbeat/templates.ts +64 -9
- package/src/http/agents.ts +5 -2
- package/src/http/heartbeat.ts +29 -1
- package/src/tests/heartbeat-checklist.test.ts +417 -0
- package/src/tests/heartbeat.test.ts +2 -57
- package/src/tests/prompt-template-remaining.test.ts +1 -1
- package/src/tools/update-profile.ts +21 -3
- package/src/types.ts +3 -0
- package/templates/official/lead/CLAUDE.md +3 -0
- package/templates/official/lead/HEARTBEAT.md +12 -0
- package/templates/official/lead/config.json +2 -1
- package/templates/schema.ts +2 -0
package/package.json
CHANGED
package/src/be/db.ts
CHANGED
|
@@ -229,6 +229,7 @@ const VERSIONABLE_FIELDS: VersionableField[] = [
|
|
|
229
229
|
"toolsMd",
|
|
230
230
|
"claudeMd",
|
|
231
231
|
"setupScript",
|
|
232
|
+
"heartbeatMd",
|
|
232
233
|
];
|
|
233
234
|
|
|
234
235
|
function ensureAgentProfileColumns(database: Database): void {
|
|
@@ -399,9 +400,10 @@ function seedContextVersions(): void {
|
|
|
399
400
|
toolsMd: string | null;
|
|
400
401
|
claudeMd: string | null;
|
|
401
402
|
setupScript: string | null;
|
|
403
|
+
heartbeatMd: string | null;
|
|
402
404
|
},
|
|
403
405
|
[]
|
|
404
|
-
>(`SELECT id, soulMd, identityMd, toolsMd, claudeMd, setupScript FROM agents`)
|
|
406
|
+
>(`SELECT id, soulMd, identityMd, toolsMd, claudeMd, setupScript, heartbeatMd FROM agents`)
|
|
405
407
|
.all();
|
|
406
408
|
|
|
407
409
|
for (const agent of agents) {
|
|
@@ -450,6 +452,7 @@ type AgentRow = {
|
|
|
450
452
|
identityMd: string | null;
|
|
451
453
|
setupScript: string | null;
|
|
452
454
|
toolsMd: string | null;
|
|
455
|
+
heartbeatMd: string | null;
|
|
453
456
|
lastActivityAt: string | null;
|
|
454
457
|
createdAt: string;
|
|
455
458
|
lastUpdatedAt: string;
|
|
@@ -471,6 +474,7 @@ function rowToAgent(row: AgentRow): Agent {
|
|
|
471
474
|
identityMd: row.identityMd ?? undefined,
|
|
472
475
|
setupScript: row.setupScript ?? undefined,
|
|
473
476
|
toolsMd: row.toolsMd ?? undefined,
|
|
477
|
+
heartbeatMd: row.heartbeatMd ?? undefined,
|
|
474
478
|
lastActivityAt: row.lastActivityAt ?? undefined,
|
|
475
479
|
createdAt: row.createdAt,
|
|
476
480
|
lastUpdatedAt: row.lastUpdatedAt,
|
|
@@ -2259,6 +2263,7 @@ export function updateAgentProfile(
|
|
|
2259
2263
|
identityMd?: string;
|
|
2260
2264
|
setupScript?: string;
|
|
2261
2265
|
toolsMd?: string;
|
|
2266
|
+
heartbeatMd?: string;
|
|
2262
2267
|
},
|
|
2263
2268
|
meta?: VersionMeta,
|
|
2264
2269
|
): Agent | null {
|
|
@@ -2312,6 +2317,7 @@ export function updateAgentProfile(
|
|
|
2312
2317
|
string | null,
|
|
2313
2318
|
string | null,
|
|
2314
2319
|
string | null,
|
|
2320
|
+
string | null,
|
|
2315
2321
|
string,
|
|
2316
2322
|
string,
|
|
2317
2323
|
]
|
|
@@ -2325,6 +2331,7 @@ export function updateAgentProfile(
|
|
|
2325
2331
|
identityMd = COALESCE(?, identityMd),
|
|
2326
2332
|
setupScript = COALESCE(?, setupScript),
|
|
2327
2333
|
toolsMd = COALESCE(?, toolsMd),
|
|
2334
|
+
heartbeatMd = COALESCE(?, heartbeatMd),
|
|
2328
2335
|
lastUpdatedAt = ?
|
|
2329
2336
|
WHERE id = ? RETURNING *`,
|
|
2330
2337
|
)
|
|
@@ -2337,6 +2344,7 @@ export function updateAgentProfile(
|
|
|
2337
2344
|
updates.identityMd ?? null,
|
|
2338
2345
|
updates.setupScript ?? null,
|
|
2339
2346
|
updates.toolsMd ?? null,
|
|
2347
|
+
updates.heartbeatMd ?? null,
|
|
2340
2348
|
now,
|
|
2341
2349
|
id,
|
|
2342
2350
|
);
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ALTER TABLE agents ADD COLUMN heartbeatMd TEXT DEFAULT NULL;
|
|
@@ -152,6 +152,12 @@ export function runMigrations(db: Database): void {
|
|
|
152
152
|
}
|
|
153
153
|
|
|
154
154
|
// 5. Run pending migrations
|
|
155
|
+
// Disable FK checks for the entire migration pass. Individual migrations
|
|
156
|
+
// (008, 025, 026) need to DROP + recreate tables, which can violate FKs
|
|
157
|
+
// mid-transaction. PRAGMA foreign_keys cannot be changed inside a transaction,
|
|
158
|
+
// so we disable it here, outside any transaction, and re-enable after.
|
|
159
|
+
db.run("PRAGMA foreign_keys = OFF");
|
|
160
|
+
|
|
155
161
|
for (const migration of migrations) {
|
|
156
162
|
const existing = applied.get(migration.version);
|
|
157
163
|
|
|
@@ -184,4 +190,6 @@ export function runMigrations(db: Database): void {
|
|
|
184
190
|
const elapsed = (performance.now() - start).toFixed(1);
|
|
185
191
|
console.debug(`[migrations] Applied: ${migration.name} (${elapsed}ms)`);
|
|
186
192
|
}
|
|
193
|
+
|
|
194
|
+
db.run("PRAGMA foreign_keys = ON");
|
|
187
195
|
}
|
package/src/commands/runner.ts
CHANGED
|
@@ -1991,6 +1991,7 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
|
|
|
1991
1991
|
let agentSetupScript: string | undefined;
|
|
1992
1992
|
let agentToolsMd: string | undefined;
|
|
1993
1993
|
let agentClaudeMd: string | undefined;
|
|
1994
|
+
let agentHeartbeatMd: string | undefined;
|
|
1994
1995
|
let agentProfileName: string | undefined;
|
|
1995
1996
|
let agentDescription: string | undefined;
|
|
1996
1997
|
let agentSkillsSummary: { name: string; description: string }[] | undefined;
|
|
@@ -2170,6 +2171,7 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
|
|
|
2170
2171
|
claudeMd?: string;
|
|
2171
2172
|
setupScript?: string;
|
|
2172
2173
|
toolsMd?: string;
|
|
2174
|
+
heartbeatMd?: string;
|
|
2173
2175
|
name?: string;
|
|
2174
2176
|
description?: string;
|
|
2175
2177
|
};
|
|
@@ -2178,12 +2180,19 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
|
|
|
2178
2180
|
agentSetupScript = profile.setupScript;
|
|
2179
2181
|
agentToolsMd = profile.toolsMd;
|
|
2180
2182
|
agentClaudeMd = profile.claudeMd;
|
|
2183
|
+
agentHeartbeatMd = profile.heartbeatMd;
|
|
2181
2184
|
agentProfileName = profile.name;
|
|
2182
2185
|
agentDescription = profile.description;
|
|
2183
2186
|
|
|
2184
2187
|
// Generate default templates if missing (runner registers via POST /api/agents
|
|
2185
2188
|
// which doesn't generate templates like join-swarm does)
|
|
2186
|
-
if (
|
|
2189
|
+
if (
|
|
2190
|
+
!agentSoulMd ||
|
|
2191
|
+
!agentIdentityMd ||
|
|
2192
|
+
!agentToolsMd ||
|
|
2193
|
+
!agentClaudeMd ||
|
|
2194
|
+
!agentHeartbeatMd
|
|
2195
|
+
) {
|
|
2187
2196
|
// Use already-fetched template (from pre-registration step)
|
|
2188
2197
|
if (cachedTemplate) {
|
|
2189
2198
|
const ctx = {
|
|
@@ -2202,6 +2211,8 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
|
|
|
2202
2211
|
agentClaudeMd = interpolate(cachedTemplate.files.claudeMd, ctx).result;
|
|
2203
2212
|
if (!agentSetupScript)
|
|
2204
2213
|
agentSetupScript = interpolate(cachedTemplate.files.setupScript, ctx).result;
|
|
2214
|
+
if (!agentHeartbeatMd)
|
|
2215
|
+
agentHeartbeatMd = interpolate(cachedTemplate.files.heartbeatMd, ctx).result;
|
|
2205
2216
|
console.log(`[${role}] Applied template: ${templateId}`);
|
|
2206
2217
|
}
|
|
2207
2218
|
|
|
@@ -2226,6 +2237,8 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
|
|
|
2226
2237
|
if (!profile.claudeMd && agentClaudeMd) profileUpdate.claudeMd = agentClaudeMd;
|
|
2227
2238
|
if (!profile.setupScript && agentSetupScript)
|
|
2228
2239
|
profileUpdate.setupScript = agentSetupScript;
|
|
2240
|
+
if (!profile.heartbeatMd && agentHeartbeatMd)
|
|
2241
|
+
profileUpdate.heartbeatMd = agentHeartbeatMd;
|
|
2229
2242
|
|
|
2230
2243
|
await fetch(`${apiUrl}/api/agents/${agentId}/profile`, {
|
|
2231
2244
|
method: "PUT",
|
|
@@ -2364,6 +2377,16 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
|
|
|
2364
2377
|
}
|
|
2365
2378
|
}
|
|
2366
2379
|
|
|
2380
|
+
// Write HEARTBEAT.md to workspace (lead's periodic checklist)
|
|
2381
|
+
if (agentHeartbeatMd) {
|
|
2382
|
+
try {
|
|
2383
|
+
await Bun.write("/workspace/HEARTBEAT.md", agentHeartbeatMd);
|
|
2384
|
+
console.log(`[${role}] Wrote HEARTBEAT.md to workspace`);
|
|
2385
|
+
} catch (err) {
|
|
2386
|
+
console.warn(`[${role}] Could not write HEARTBEAT.md: ${(err as Error).message}`);
|
|
2387
|
+
}
|
|
2388
|
+
}
|
|
2389
|
+
|
|
2367
2390
|
// Write CLAUDE.md to workspace (agent-level instructions)
|
|
2368
2391
|
if (agentClaudeMd) {
|
|
2369
2392
|
try {
|
|
@@ -47,11 +47,12 @@ const STALE_CLEANUP_THRESHOLD_MINUTES = Number(process.env.HEARTBEAT_STALE_CLEAN
|
|
|
47
47
|
/** Max pool tasks to auto-assign per sweep */
|
|
48
48
|
const MAX_AUTO_ASSIGN_PER_SWEEP = Number(process.env.HEARTBEAT_MAX_AUTO_ASSIGN) || 5;
|
|
49
49
|
|
|
50
|
-
/**
|
|
51
|
-
const
|
|
52
|
-
Number(process.env.
|
|
50
|
+
/** Heartbeat checklist interval: how often to check HEARTBEAT.md (default: 30 min) */
const HEARTBEAT_CHECKLIST_INTERVAL_MS =
  Number(process.env.HEARTBEAT_CHECKLIST_INTERVAL_MS) || 30 * 60 * 1000;

/**
 * Interpret a raw environment variable value as an on/off switch.
 *
 * Unset, empty, "0", "false", "no" and "off" (case-insensitive, surrounding
 * whitespace ignored) mean off; every other value means on. A plain
 * `Boolean(value)` would treat the string "false" as on, because any
 * non-empty string is truthy.
 *
 * @param raw The raw value read from `process.env`, or undefined when unset.
 * @returns true when the flag should be considered enabled.
 */
function isEnvFlagEnabled(raw: string | undefined): boolean {
  const normalized = (raw ?? "").trim().toLowerCase();
  return !["", "0", "false", "no", "off"].includes(normalized);
}

/** Whether to disable the heartbeat checklist entirely */
const HEARTBEAT_CHECKLIST_DISABLE = isEnvFlagEnabled(process.env.HEARTBEAT_CHECKLIST_DISABLE);
|
|
55
56
|
|
|
56
57
|
// ============================================================================
|
|
57
58
|
// Types
|
|
@@ -69,8 +70,6 @@ export interface HeartbeatFindings {
|
|
|
69
70
|
inboxProcessing: number;
|
|
70
71
|
workflowRuns: number;
|
|
71
72
|
};
|
|
72
|
-
escalationNeeded: boolean;
|
|
73
|
-
escalationReason?: string;
|
|
74
73
|
}
|
|
75
74
|
|
|
76
75
|
// ============================================================================
|
|
@@ -78,11 +77,9 @@ export interface HeartbeatFindings {
|
|
|
78
77
|
// ============================================================================
|
|
79
78
|
|
|
80
79
|
let heartbeatInterval: ReturnType<typeof setInterval> | null = null;
|
|
80
|
+
let checklistInterval: ReturnType<typeof setInterval> | null = null;
|
|
81
81
|
let isSweeping = false;
|
|
82
82
|
|
|
83
|
-
/** Tracks last escalation time per escalation key to prevent spam */
|
|
84
|
-
const lastEscalationTime: Map<string, number> = new Map();
|
|
85
|
-
|
|
86
83
|
// ============================================================================
|
|
87
84
|
// Tier 1: Preflight Gate
|
|
88
85
|
// ============================================================================
|
|
@@ -133,10 +130,9 @@ export async function codeLevelTriage(): Promise<HeartbeatFindings> {
|
|
|
133
130
|
inboxProcessing: 0,
|
|
134
131
|
workflowRuns: 0,
|
|
135
132
|
},
|
|
136
|
-
escalationNeeded: false,
|
|
137
133
|
};
|
|
138
134
|
|
|
139
|
-
// 1. Detect and remediate stalled tasks (tiered: auto-fail dead workers
|
|
135
|
+
// 1. Detect and remediate stalled tasks (tiered: auto-fail dead workers)
|
|
140
136
|
detectAndRemediateStalledTasks(findings);
|
|
141
137
|
|
|
142
138
|
// 2. Check and fix worker health
|
|
@@ -148,9 +144,6 @@ export async function codeLevelTriage(): Promise<HeartbeatFindings> {
|
|
|
148
144
|
// 4. Cleanup stale resources (including workflow run recovery)
|
|
149
145
|
await cleanupStaleResources(findings);
|
|
150
146
|
|
|
151
|
-
// 5. Determine if escalation is needed
|
|
152
|
-
evaluateEscalation(findings);
|
|
153
|
-
|
|
154
147
|
return findings;
|
|
155
148
|
}
|
|
156
149
|
|
|
@@ -306,102 +299,150 @@ async function cleanupStaleResources(findings: HeartbeatFindings): Promise<void>
|
|
|
306
299
|
}
|
|
307
300
|
|
|
308
301
|
// ============================================================================
|
|
309
|
-
//
|
|
302
|
+
// Heartbeat Checklist (HEARTBEAT.md-based periodic check)
|
|
310
303
|
// ============================================================================
|
|
311
304
|
|
|
312
305
|
/**
 * Check if content is effectively empty (only headers, comments, empty items).
 * Returns true if there are no actionable items — the checklist should be skipped.
 *
 * HTML comments are removed wherever they occur (inline, whole-line, or
 * spanning several lines) before the remaining text is inspected, so text
 * that shares a line with a comment is still evaluated. A comment that is
 * never closed hides everything after it.
 *
 * @param content Raw markdown text (e.g. the lead's HEARTBEAT.md).
 * @returns true when nothing but blank lines, headers, comments and empty
 *   list/checkbox items remains.
 */
export function isEffectivelyEmpty(content: string): boolean {
  // Lazy match up to the first "-->"; "$" (no `m` flag) lets an unterminated
  // comment run to the end of the input.
  const withoutComments = content.replace(/<!--[\s\S]*?(?:-->|$)/g, "");

  for (const line of withoutComments.split("\n")) {
    const trimmed = line.trim();

    // Skip blank lines (including lines that held only a comment)
    if (trimmed === "") continue;

    // Skip markdown headers, including a bare "##" with no title text
    if (/^#{1,6}(\s|$)/.test(trimmed)) continue;

    // Skip empty list items (just a marker / unchecked box with no text)
    if (/^[-*+]\s*\[\s*\]\s*$/.test(trimmed)) continue;
    if (/^[-*+]\s*$/.test(trimmed)) continue;

    // Anything else is real content
    return false;
  }

  return true;
}
|
|
347
|
+
|
|
348
|
+
/**
 * Build the markdown status report embedded in the lead's checklist task.
 *
 * Sections, in order: task counters (always), stalled tasks (only when any
 * exist), agent status counts (always), and available work (only when there
 * are unassigned pool tasks or idle workers to report).
 *
 * @returns The report lines joined with "\n".
 */
export function gatherSystemStatus(): string {
  const taskStats = getTaskStats();
  const stalled = getStalledInProgressTasks(STALL_THRESHOLD_MINUTES);
  const allAgents = getAllAgents();
  const workersWithCapacity = getIdleWorkersWithCapacity();
  // NOTE(review): the argument looks like a row limit, in which case the
  // count reported below would be capped at 10 — confirm against the helper.
  const unassignedPool = getUnassignedPoolTasks(10);

  // Task overview
  const report: string[] = [
    "## Task Overview [auto-generated]",
    `- In Progress: ${taskStats.in_progress ?? 0}`,
    `- Pending: ${taskStats.pending ?? 0}`,
    `- Unassigned: ${taskStats.unassigned ?? 0}`,
    `- Completed (24h): ${taskStats.completed ?? 0}`,
    `- Failed (24h): ${taskStats.failed ?? 0}`,
  ];

  // Stalled tasks
  if (stalled.length > 0) {
    report.push("", "## Stalled Tasks [auto-generated]");
    for (const task of stalled) {
      const agentSlice = task.agentId?.slice(0, 8) ?? "unassigned";
      const shortId = task.id.slice(0, 8);
      const preview = task.task.slice(0, 60);
      report.push(
        `- [${shortId}] "${preview}" — assigned to ${agentSlice}, last update: ${task.lastUpdatedAt}`,
      );
    }
  }

  // Agent status
  const countWithStatus = (status: string): number =>
    allAgents.filter((agent) => agent.status === status).length;
  const idleCount = countWithStatus("idle");
  const busyCount = countWithStatus("busy");
  const offlineCount = countWithStatus("offline");
  report.push(
    "",
    "## Agent Status [auto-generated]",
    `- Online: ${idleCount + busyCount} (${idleCount} idle, ${busyCount} busy), Offline: ${offlineCount}`,
  );

  // Available work
  const hasPoolTasks = unassignedPool.length > 0;
  const hasIdleWorkers = workersWithCapacity.length > 0;
  if (hasPoolTasks || hasIdleWorkers) {
    report.push("", "## Available Work [auto-generated]");
    if (hasPoolTasks) {
      report.push(`- ${unassignedPool.length} unassigned pool task(s) waiting`);
    }
    if (hasIdleWorkers) {
      report.push(`- ${workersWithCapacity.length} idle worker(s) with capacity`);
    }
  }

  return report.join("\n");
}
|
|
384
404
|
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
405
|
+
/**
 * Create a "heartbeat-checklist" task for the lead from its HEARTBEAT.md.
 *
 * Returns without creating anything when: there is no lead agent, the lead
 * has no HEARTBEAT.md or it is effectively empty, the lead already has a
 * checklist task that is not completed/failed/cancelled, or the template
 * resolves as skipped.
 */
export async function checkHeartbeatChecklist(): Promise<void> {
  const leadAgent = getLeadAgent();
  if (!leadAgent) return;

  const standingOrders = leadAgent.heartbeatMd;
  if (!standingOrders || isEffectivelyEmpty(standingOrders)) return;

  // Dedup: at most one open heartbeat-checklist task per lead
  const openChecklistQuery = getDb().prepare<{ id: string }, [string]>(
    `SELECT id FROM agent_tasks
       WHERE agentId = ?
         AND taskType = 'heartbeat-checklist'
         AND status NOT IN ('completed', 'failed', 'cancelled')
       LIMIT 1`,
  );
  const openChecklist = openChecklistQuery.get(leadAgent.id);
  if (openChecklist) return;

  const rendered = resolveTemplate("heartbeat.checklist", {
    system_status: gatherSystemStatus(),
    heartbeat_content: standingOrders,
  });
  if (rendered.skipped) return;

  const taskOptions = {
    agentId: leadAgent.id,
    taskType: "heartbeat-checklist",
    tags: ["checklist", "auto-generated"],
    priority: 60,
  };
  createTaskExtended(rendered.text, taskOptions);

  console.log(`[Heartbeat] Checklist task created for lead ${leadAgent.name}`);
}
|
|
406
447
|
|
|
407
448
|
// ============================================================================
|
|
@@ -409,7 +450,7 @@ function hasActiveEscalationTask(leadAgentId: string, escalationKey: string): bo
|
|
|
409
450
|
// ============================================================================
|
|
410
451
|
|
|
411
452
|
/**
|
|
412
|
-
* Run a single heartbeat sweep (Tier 1 → Tier 2
|
|
453
|
+
* Run a single heartbeat sweep (Tier 1 → Tier 2).
|
|
413
454
|
*/
|
|
414
455
|
export async function runHeartbeatSweep(): Promise<void> {
|
|
415
456
|
if (isSweeping) {
|
|
@@ -432,7 +473,6 @@ export async function runHeartbeatSweep(): Promise<void> {
|
|
|
432
473
|
inboxProcessing: 0,
|
|
433
474
|
workflowRuns: 0,
|
|
434
475
|
},
|
|
435
|
-
escalationNeeded: false,
|
|
436
476
|
};
|
|
437
477
|
await cleanupStaleResources(cleanupOnlyFindings);
|
|
438
478
|
logFindings(cleanupOnlyFindings);
|
|
@@ -444,11 +484,6 @@ export async function runHeartbeatSweep(): Promise<void> {
|
|
|
444
484
|
|
|
445
485
|
// Log findings summary
|
|
446
486
|
logFindings(findings);
|
|
447
|
-
|
|
448
|
-
// Tier 3: Escalate if needed
|
|
449
|
-
if (findings.escalationNeeded) {
|
|
450
|
-
escalateToLead(findings);
|
|
451
|
-
}
|
|
452
487
|
} finally {
|
|
453
488
|
isSweeping = false;
|
|
454
489
|
}
|
|
@@ -508,6 +543,9 @@ export function startHeartbeat(intervalMs = DEFAULT_INTERVAL_MS): void {
|
|
|
508
543
|
heartbeatInterval = setInterval(() => {
|
|
509
544
|
runHeartbeatSweep();
|
|
510
545
|
}, intervalMs);
|
|
546
|
+
|
|
547
|
+
// Also start the checklist interval
|
|
548
|
+
startHeartbeatChecklist();
|
|
511
549
|
}
|
|
512
550
|
|
|
513
551
|
/**
|
|
@@ -520,11 +558,82 @@ export function stopHeartbeat(): void {
|
|
|
520
558
|
isSweeping = false;
|
|
521
559
|
console.log("[Heartbeat] Stopped");
|
|
522
560
|
}
|
|
561
|
+
stopHeartbeatChecklist();
|
|
562
|
+
}
|
|
563
|
+
|
|
564
|
+
/**
 * Create a one-off "boot-triage" task for the lead after a server restart.
 *
 * Uses the lead's HEARTBEAT.md as standing orders; when that is missing or
 * effectively empty, a placeholder line is substituted so the task is still
 * created. Skips when there is no lead, when the lead already has a
 * boot-triage task that is not completed/failed/cancelled, or when the
 * template resolves as skipped.
 */
export async function createBootTriageTask(): Promise<void> {
  const leadAgent = getLeadAgent();
  if (!leadAgent) return;

  const standingOrders = leadAgent.heartbeatMd ?? "";

  // Dedup: at most one open boot-triage task per lead
  const openTriageQuery = getDb().prepare<{ id: string }, [string]>(
    `SELECT id FROM agent_tasks
       WHERE agentId = ?
         AND taskType = 'boot-triage'
         AND status NOT IN ('completed', 'failed', 'cancelled')
       LIMIT 1`,
  );
  const openTriage = openTriageQuery.get(leadAgent.id);
  if (openTriage) return;

  const statusReport = gatherSystemStatus();

  let heartbeatContent = standingOrders;
  if (isEffectivelyEmpty(standingOrders)) {
    heartbeatContent = "_No standing orders configured._";
  }

  const rendered = resolveTemplate("heartbeat.boot-triage", {
    system_status: statusReport,
    heartbeat_content: heartbeatContent,
  });
  if (rendered.skipped) return;

  const taskOptions = {
    agentId: leadAgent.id,
    taskType: "boot-triage",
    tags: ["boot", "triage", "auto-generated"],
    priority: 70, // Higher than regular checklist (60)
  };
  createTaskExtended(rendered.text, taskOptions);

  console.log(`[Heartbeat] Boot triage task created for lead ${leadAgent.name}`);
}
|
|
606
|
+
|
|
607
|
+
/**
 * Start the heartbeat checklist polling loop (separate from the infrastructure sweep).
 *
 * No-op when the checklist is disabled via HEARTBEAT_CHECKLIST_DISABLE or the
 * loop is already running. A one-off boot triage task is scheduled 30s after
 * start; the recurring checklist first fires one full interval after start.
 *
 * Failures inside the scheduled async work are logged instead of surfacing
 * as unhandled promise rejections.
 *
 * @param intervalMs Delay between checklist checks, in milliseconds.
 */
export function startHeartbeatChecklist(intervalMs = HEARTBEAT_CHECKLIST_INTERVAL_MS): void {
  if (HEARTBEAT_CHECKLIST_DISABLE) {
    console.log("[Heartbeat] Checklist disabled via HEARTBEAT_CHECKLIST_DISABLE");
    return;
  }
  if (checklistInterval) {
    return; // Already running
  }

  console.log(`[Heartbeat] Checklist starting with ${intervalMs}ms interval`);

  // Recurring checklist starts from the second interval onward
  const handle = setInterval(() => {
    checkHeartbeatChecklist().catch((err: unknown) => {
      console.error("[Heartbeat] Checklist check failed:", err);
    });
  }, intervalMs);
  checklistInterval = handle;

  // Boot triage replaces the first checklist (30s delay to let system settle)
  setTimeout(() => {
    // The loop was stopped (or stopped and restarted) during the delay:
    // this start's boot triage is no longer wanted.
    if (checklistInterval !== handle) return;
    createBootTriageTask().catch((err: unknown) => {
      console.error("[Heartbeat] Boot triage failed:", err);
    });
  }, 30_000);
}
|
|
524
629
|
|
|
525
630
|
/**
 * Stop the heartbeat checklist polling loop.
 *
 * Clears the recurring interval and resets the stored handle; does nothing
 * (and logs nothing) when the loop is not running.
 */
export function stopHeartbeatChecklist(): void {
  if (!checklistInterval) return;

  clearInterval(checklistInterval);
  checklistInterval = null;
  console.log("[Heartbeat] Checklist stopped");
}
|
|
@@ -8,23 +8,78 @@
|
|
|
8
8
|
import { registerTemplate } from "../prompts/registry";
|
|
9
9
|
|
|
10
10
|
// ============================================================================
|
|
11
|
-
//
|
|
11
|
+
// Heartbeat checklist
|
|
12
12
|
// ============================================================================
|
|
13
13
|
|
|
14
14
|
registerTemplate({
|
|
15
|
-
eventType: "heartbeat.
|
|
15
|
+
eventType: "heartbeat.checklist",
|
|
16
16
|
header: "",
|
|
17
|
-
defaultBody: `Task Type:
|
|
18
|
-
Goal:
|
|
17
|
+
defaultBody: `Task Type: Heartbeat Checklist
|
|
18
|
+
Goal: Review system status and your standing orders, take action if needed.
|
|
19
19
|
|
|
20
|
-
|
|
21
|
-
{{
|
|
20
|
+
## Current System Status [auto-generated]
|
|
21
|
+
{{system_status}}
|
|
22
|
+
|
|
23
|
+
## Your Standing Orders (from HEARTBEAT.md)
|
|
24
|
+
{{heartbeat_content}}
|
|
25
|
+
|
|
26
|
+
## Instructions
|
|
27
|
+
1. Review the system status above for anything that needs attention (stalled tasks, idle workers with available work, anomalies).
|
|
28
|
+
2. Review your standing orders for any periodic checks or actions.
|
|
29
|
+
3. If something needs attention — take action now using your available tools (create tasks, post to Slack, cancel stuck tasks, etc.).
|
|
30
|
+
4. If everything looks healthy and no standing orders are actionable — complete this task with a brief "All clear" summary.
|
|
31
|
+
5. Do NOT create another heartbeat-checklist task — the system handles scheduling.`,
|
|
32
|
+
variables: [
|
|
33
|
+
{
|
|
34
|
+
name: "system_status",
|
|
35
|
+
description: "Auto-generated markdown section with current system status",
|
|
36
|
+
},
|
|
37
|
+
{
|
|
38
|
+
name: "heartbeat_content",
|
|
39
|
+
description: "The lead agent's HEARTBEAT.md standing orders",
|
|
40
|
+
},
|
|
41
|
+
],
|
|
42
|
+
category: "event",
|
|
43
|
+
});
|
|
44
|
+
|
|
45
|
+
// ============================================================================
|
|
46
|
+
// Boot triage (one-off after container restart)
|
|
47
|
+
// ============================================================================
|
|
48
|
+
|
|
49
|
+
registerTemplate({
|
|
50
|
+
eventType: "heartbeat.boot-triage",
|
|
51
|
+
header: "",
|
|
52
|
+
defaultBody: `Task Type: Boot Triage
|
|
53
|
+
Goal: The system just restarted — assess current state and triage based on your role.
|
|
54
|
+
|
|
55
|
+
## Boot Event [auto-generated]
|
|
56
|
+
The API server has just restarted (possible pod rotation or deployment). This is a one-off triage task — not a recurring checklist. Review the current state, identify anything that needs immediate attention, and take action.
|
|
57
|
+
|
|
58
|
+
## Current System Status [auto-generated]
|
|
59
|
+
{{system_status}}
|
|
60
|
+
|
|
61
|
+
## Your Standing Orders (from HEARTBEAT.md)
|
|
62
|
+
{{heartbeat_content}}
|
|
63
|
+
|
|
64
|
+
## Instructions
|
|
65
|
+
1. **Acknowledge the reboot** — note that the system just restarted and any in-flight work may have been interrupted.
|
|
66
|
+
2. Review the system status above. Pay special attention to:
|
|
67
|
+
- Tasks that were in-progress before the restart (they may have been auto-failed by the startup sweep)
|
|
68
|
+
- Workers that may need to re-register
|
|
69
|
+
- Any stalled or orphaned work
|
|
70
|
+
3. Review your standing orders for any checks that are relevant post-reboot.
|
|
71
|
+
4. Take action using your available tools (re-create failed tasks, notify affected parties, etc.).
|
|
72
|
+
5. Complete this task with a summary of what you found and what actions you took.
|
|
73
|
+
6. Do NOT create another boot-triage task — this is a one-off event.`,
|
|
22
74
|
variables: [
|
|
23
75
|
{
|
|
24
|
-
name: "
|
|
25
|
-
description: "
|
|
76
|
+
name: "system_status",
|
|
77
|
+
description: "Auto-generated markdown section with current system status",
|
|
78
|
+
},
|
|
79
|
+
{
|
|
80
|
+
name: "heartbeat_content",
|
|
81
|
+
description: "The lead agent's HEARTBEAT.md standing orders",
|
|
26
82
|
},
|
|
27
|
-
{ name: "escalation_marker", description: "Heartbeat escalation marker with key" },
|
|
28
83
|
],
|
|
29
84
|
category: "event",
|
|
30
85
|
});
|