@desplega.ai/agent-swarm 1.92.0 → 1.92.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/openapi.json +276 -3
- package/package.json +6 -6
- package/plugin/skills/pages/SKILL.md +5 -2
- package/src/be/db.ts +327 -20
- package/src/be/memory/constants.ts +2 -1
- package/src/be/memory/providers/openai-embedding.ts +2 -5
- package/src/be/memory/providers/sqlite-store.ts +293 -76
- package/src/be/memory/types.ts +35 -0
- package/src/be/migrations/084_script_run_journal_duration.sql +5 -0
- package/src/be/migrations/085_script_runs_kind.sql +9 -0
- package/src/be/migrations/086_pages_default_authed.sql +64 -0
- package/src/be/migrations/087_skill_files.sql +19 -0
- package/src/be/modelsdev-cache.json +264 -328
- package/src/be/seed-scripts/catalog/boot-triage.ts +221 -0
- package/src/be/seed-scripts/catalog/catalog-report.ts +457 -0
- package/src/be/seed-scripts/catalog/compound-insights.ts +94 -0
- package/src/be/seed-scripts/catalog/gh-pr-snapshot.ts +1 -1
- package/src/be/seed-scripts/catalog/memory-eval.ts +1059 -0
- package/src/be/seed-scripts/catalog/ops-catalog-audit.ts +34 -439
- package/src/be/seed-scripts/catalog/schedule-health.ts +78 -2
- package/src/be/seed-scripts/catalog/task-failure-audit.ts +48 -1
- package/src/be/seed-scripts/index.ts +32 -4
- package/src/be/seed-skills/index.ts +0 -7
- package/src/be/skill-sync.ts +91 -7
- package/src/commands/runner.ts +6 -2
- package/src/heartbeat/templates.ts +20 -16
- package/src/http/index.ts +41 -7
- package/src/http/mcp-user.ts +23 -0
- package/src/http/mcp.ts +58 -0
- package/src/http/memory.ts +58 -0
- package/src/http/pages.ts +1 -1
- package/src/http/script-runs.ts +2 -0
- package/src/http/scripts.ts +39 -2
- package/src/http/skills.ts +225 -0
- package/src/providers/claude-adapter.ts +56 -24
- package/src/script-workflows/workflow-ctx.ts +7 -3
- package/src/scripts-runtime/sdk-allowlist.ts +1 -0
- package/src/scripts-runtime/swarm-sdk.ts +13 -0
- package/src/scripts-runtime/types/stdlib.d.ts +1 -0
- package/src/scripts-runtime/types/swarm-sdk.d.ts +1 -0
- package/src/server.ts +2 -0
- package/src/tests/claude-adapter-binary.test.ts +135 -81
- package/src/tests/create-page-tool.test.ts +19 -2
- package/src/tests/heartbeat-checklist.test.ts +36 -0
- package/src/tests/mcp-transport-gc.test.ts +58 -0
- package/src/tests/memory-health-endpoint.test.ts +78 -0
- package/src/tests/memory-store.test.ts +221 -1
- package/src/tests/pages-http.test.ts +20 -2
- package/src/tests/pages-storage.test.ts +26 -0
- package/src/tests/scripts-mcp-e2e.test.ts +53 -0
- package/src/tests/seed-scripts.test.ts +123 -3
- package/src/tests/skill-files-http.test.ts +171 -0
- package/src/tests/skill-files.test.ts +162 -0
- package/src/tests/skill-get-file-tool.test.ts +110 -0
- package/src/tests/skill-sync.test.ts +125 -6
- package/src/tools/create-page.ts +2 -2
- package/src/tools/skills/index.ts +1 -0
- package/src/tools/skills/skill-get-file.ts +80 -0
- package/src/tools/tool-config.ts +2 -1
- package/src/types.ts +20 -0
- package/src/utils/internal-ai/complete-structured.ts +2 -2
- package/templates/schedules/daily-blocker-digest/content.md +68 -54
- package/templates/schedules/daily-compounding-reflection/content.md +4 -4
- package/templates/schedules/daily-hn-briefing/content.md +5 -5
- package/templates/schedules/daily-workflow-health-audit/content.md +6 -6
- package/templates/schedules/gtm-weekly-review/content.md +9 -9
- package/templates/schedules/weekly-dependabot-triage/content.md +24 -20
- package/templates/skills/agentmail-sending/content.md +6 -7
- package/templates/skills/desloppify/content.md +8 -9
- package/templates/skills/jira-interaction/content.md +25 -33
- package/templates/skills/kapso-whatsapp/content.md +29 -30
- package/templates/skills/linear-interaction/content.md +8 -9
- package/templates/skills/profile-corruption-escalation/content.md +44 -85
- package/templates/skills/sprite-cli/content.md +4 -5
- package/templates/skills/turso-interaction/content.md +14 -17
- package/templates/skills/workflow-iterate/content.md +38 -391
- package/templates/skills/x-api-interactions/content.md +4 -6
- package/templates/skills/scheduled-task-resilience/config.json +0 -14
- package/templates/skills/scheduled-task-resilience/content.md +0 -95
|
@@ -22,6 +22,9 @@ import { getScript, upsertScriptByName } from "../scripts/db";
|
|
|
22
22
|
import { extractArgsJsonSchema } from "../scripts/extract-schema";
|
|
23
23
|
import { typecheckScript } from "../scripts/typecheck";
|
|
24
24
|
import type { Seeder, SeedItem } from "../seed/types";
|
|
25
|
+
import bootTriageSrc from "./catalog/boot-triage.ts" with { type: "text" };
|
|
26
|
+
// @ts-expect-error Bun text imports return the raw source string for bundling standalone scripts.
|
|
27
|
+
import catalogReportSrc from "./catalog/catalog-report.ts" with { type: "text" };
|
|
25
28
|
import compoundInsightsSrc from "./catalog/compound-insights.ts" with { type: "text" };
|
|
26
29
|
import dateResolveSrc from "./catalog/date-resolve.ts" with { type: "text" };
|
|
27
30
|
import fetchReadableSrc from "./catalog/fetch-readable.ts" with { type: "text" };
|
|
@@ -30,6 +33,7 @@ import groupCountSrc from "./catalog/group-count.ts" with { type: "text" };
|
|
|
30
33
|
import jsonQuerySrc from "./catalog/json-query.ts" with { type: "text" };
|
|
31
34
|
import linearIssueSrc from "./catalog/linear-issue.ts" with { type: "text" };
|
|
32
35
|
import memoryDedupCheckSrc from "./catalog/memory-dedup-check.ts" with { type: "text" };
|
|
36
|
+
import memoryEvalSrc from "./catalog/memory-eval.ts" with { type: "text" };
|
|
33
37
|
import opsCatalogAuditSrc from "./catalog/ops-catalog-audit.ts" with { type: "text" };
|
|
34
38
|
import scheduleHealthSrc from "./catalog/schedule-health.ts" with { type: "text" };
|
|
35
39
|
import slackThreadFlattenSrc from "./catalog/slack-thread-flatten.ts" with { type: "text" };
|
|
@@ -50,6 +54,14 @@ export type SeedScript = {
|
|
|
50
54
|
// module's default export, so the cast restores the real shape.
|
|
51
55
|
const asText = (s: unknown): string => s as string;
|
|
52
56
|
|
|
57
|
+
const CATALOG_REPORT_IMPORT_RE = /^import\s+\{[^}]*\}\s+from "\.\/catalog-report";\n\n?/m;
|
|
58
|
+
|
|
59
|
+
/**
 * Inline the shared catalog-report helper into a seed script's source text.
 *
 * Scripts that import from "./catalog-report" get the helper's raw source
 * prepended and the first matching import statement stripped, so the stored
 * script no longer references a sibling module. Sources without that import
 * are returned untouched.
 *
 * @param source Raw source text of a catalog script.
 * @returns The helper source, a blank line, then `source` minus the import —
 *   or `source` as-is when it does not import the helper.
 */
function bundleCatalogReport(source: string): string {
  const helper = asText(catalogReportSrc);
  const importsHelper = CATALOG_REPORT_IMPORT_RE.test(source);

  if (importsHelper) {
    // The regex is not global, so only the first matching import is removed.
    const withoutImport = source.replace(CATALOG_REPORT_IMPORT_RE, "");
    return [helper, withoutImport].join("\n\n");
  }

  return source;
}
|
|
64
|
+
|
|
53
65
|
export const SEED_SCRIPTS: SeedScript[] = [
|
|
54
66
|
{
|
|
55
67
|
name: "gh-pr-snapshot",
|
|
@@ -103,7 +115,7 @@ export const SEED_SCRIPTS: SeedScript[] = [
|
|
|
103
115
|
"Scan recently failed swarm tasks and cluster them by failure reason, agent or schedule to surface recurring problems.",
|
|
104
116
|
intent:
|
|
105
117
|
"Find patterns in swarm task failures — which agent, schedule or error keeps breaking — for a reliability review.",
|
|
106
|
-
source: asText(taskFailureAuditSrc),
|
|
118
|
+
source: bundleCatalogReport(asText(taskFailureAuditSrc)),
|
|
107
119
|
},
|
|
108
120
|
{
|
|
109
121
|
name: "memory-dedup-check",
|
|
@@ -150,7 +162,7 @@ export const SEED_SCRIPTS: SeedScript[] = [
|
|
|
150
162
|
"Per-schedule failure rate check over recent tasks — flags schedules with failure rates above a configurable threshold.",
|
|
151
163
|
intent:
|
|
152
164
|
"Find unhealthy schedules that keep failing — for daily compounding, reliability reviews, or ops triage.",
|
|
153
|
-
source: asText(scheduleHealthSrc),
|
|
165
|
+
source: bundleCatalogReport(asText(scheduleHealthSrc)),
|
|
154
166
|
},
|
|
155
167
|
{
|
|
156
168
|
name: "tool-usage",
|
|
@@ -166,7 +178,15 @@ export const SEED_SCRIPTS: SeedScript[] = [
|
|
|
166
178
|
"All-in-one swarm-wide daily ops snapshot: task completion/failure summary, real failure clusters (excludes superseded/cancelled bookkeeping), schedule health flags, tool usage top-25, memory health/pollution stats, seed-script candidate tool triplets, and a per-agent breakdown. Aggregates across ALL agents via direct read-only SQL.",
|
|
167
179
|
intent:
|
|
168
180
|
"Single-call daily compounding Phase 0 helper — replaces ~25 raw tool roundtrips with one compressed JSON result covering every agent. For daily evolution, self-scripting candidates, ops reviews, or heartbeat context.",
|
|
169
|
-
source: asText(compoundInsightsSrc),
|
|
181
|
+
source: bundleCatalogReport(asText(compoundInsightsSrc)),
|
|
182
|
+
},
|
|
183
|
+
{
|
|
184
|
+
name: "memory-eval",
|
|
185
|
+
description:
|
|
186
|
+
"3-axis memory quality evaluation: carry-forward context (do follow-up tasks retrieve useful memories from prior tasks?), follow preferences (are CLAUDE.md/IDENTITY.md/SOUL.md/TOOLS.md memories retrieved and useful?), and stay current (what fraction of retrieved memories are fresh vs stale?). Outputs a baseline report to agent-fs + a swarm Page.",
|
|
187
|
+
intent:
|
|
188
|
+
"Measure memory system health across OpenAI Dreaming-inspired axes — before/after baseline for architecture changes, blog-post numbers, daily quality monitoring.",
|
|
189
|
+
source: asText(memoryEvalSrc),
|
|
170
190
|
},
|
|
171
191
|
{
|
|
172
192
|
name: "ops-catalog-audit",
|
|
@@ -174,7 +194,15 @@ export const SEED_SCRIPTS: SeedScript[] = [
|
|
|
174
194
|
"Audit-as-code catalog check for schedules, workflows, and prompt/template drift. Clusters actionable findings by goal and can publish an authed HTML report page.",
|
|
175
195
|
intent:
|
|
176
196
|
"Re-run the ops inventory audit in one call: duplicate/dead schedules, code-work routing risks, enabled workflow fixtures, structured-output gate gaps, prompt registry drift, stale hosts, and systemDefault skill duplicates.",
|
|
177
|
-
source: asText(opsCatalogAuditSrc),
|
|
197
|
+
source: bundleCatalogReport(asText(opsCatalogAuditSrc)),
|
|
198
|
+
},
|
|
199
|
+
{
|
|
200
|
+
name: "boot-triage",
|
|
201
|
+
description:
|
|
202
|
+
"Post-restart heartbeat triage snapshot: deploy restart PR context, recent real failures, stuck offline-agent work, orphaned tasks, and superseded tasks missing resume children.",
|
|
203
|
+
intent:
|
|
204
|
+
"Run immediately after a swarm restart to gather deterministic boot triage data in one read-only call before the Lead decides what to retry, cancel, or escalate.",
|
|
205
|
+
source: asText(bootTriageSrc),
|
|
178
206
|
},
|
|
179
207
|
];
|
|
180
208
|
|
|
@@ -25,12 +25,6 @@ import kvStorageContent from "../../../templates/skills/kv-storage/content.md" w
|
|
|
25
25
|
};
|
|
26
26
|
import pagesConfig from "../../../templates/skills/pages/config.json" with { type: "text" };
|
|
27
27
|
import pagesContent from "../../../templates/skills/pages/content.md" with { type: "text" };
|
|
28
|
-
import scheduledTaskResilienceConfig from "../../../templates/skills/scheduled-task-resilience/config.json" with {
|
|
29
|
-
type: "text",
|
|
30
|
-
};
|
|
31
|
-
import scheduledTaskResilienceContent from "../../../templates/skills/scheduled-task-resilience/content.md" with {
|
|
32
|
-
type: "text",
|
|
33
|
-
};
|
|
34
28
|
import scriptWorkflowsConfig from "../../../templates/skills/script-workflows/config.json" with {
|
|
35
29
|
type: "text",
|
|
36
30
|
};
|
|
@@ -77,7 +71,6 @@ const BUILT_IN_SKILL_SOURCES = [
|
|
|
77
71
|
{ config: artifactsConfig, body: artifactsContent },
|
|
78
72
|
{ config: kvStorageConfig, body: kvStorageContent },
|
|
79
73
|
{ config: pagesConfig, body: pagesContent },
|
|
80
|
-
{ config: scheduledTaskResilienceConfig, body: scheduledTaskResilienceContent },
|
|
81
74
|
{ config: scriptWorkflowsConfig, body: scriptWorkflowsContent },
|
|
82
75
|
{ config: swarmScriptsConfig, body: swarmScriptsContent },
|
|
83
76
|
{ config: workflowIterateConfig, body: workflowIterateContent },
|
package/src/be/skill-sync.ts
CHANGED
|
@@ -8,10 +8,11 @@
|
|
|
8
8
|
* This runs on the API side — workers call it via POST /api/skills/sync-filesystem.
|
|
9
9
|
*/
|
|
10
10
|
|
|
11
|
+
import type { Dirent } from "node:fs";
|
|
11
12
|
import { existsSync, mkdirSync, readdirSync, rmSync, writeFileSync } from "node:fs";
|
|
12
13
|
import { homedir } from "node:os";
|
|
13
|
-
import { join } from "node:path";
|
|
14
|
-
import { getAgentSkills } from "./db";
|
|
14
|
+
import { dirname, join } from "node:path";
|
|
15
|
+
import { getAgentSkills, getSkillFiles } from "./db";
|
|
15
16
|
|
|
16
17
|
export interface SkillSyncResult {
|
|
17
18
|
synced: number;
|
|
@@ -29,11 +30,68 @@ export interface SkillSyncResult {
|
|
|
29
30
|
*/
|
|
30
31
|
const SWARM_MARKER_FILE = ".swarm-managed";

/**
 * Delete files inside a swarm-managed skill directory that are no longer part
 * of the skill's current bundle, and prune directories left empty by that.
 *
 * Nothing is touched unless `skillDir` contains the swarm marker file. The
 * top-level `SKILL.md`, the marker file itself, and every path listed in
 * `currentRelativeFiles` are always kept.
 *
 * @param skillDir Absolute path of the skill directory to reconcile.
 * @param currentRelativeFiles Paths (relative to `skillDir`, "/"-separated)
 *   of the bundled files that must survive.
 * @returns Number of stale files removed (pruned directories are not counted).
 */
function reconcileManagedSkillFiles(skillDir: string, currentRelativeFiles: Set<string>): number {
  if (!existsSync(join(skillDir, SWARM_MARKER_FILE))) return 0;

  let removed = 0;

  // Returns true when `dir` still holds (or may hold) something after cleanup.
  const walk = (dir: string, relativeDir = ""): boolean => {
    let entries: Dirent[];
    try {
      entries = readdirSync(dir, { withFileTypes: true });
    } catch {
      // The directory could not be listed, so its contents are unknown.
      // Report it as non-empty: otherwise the caller would recursively
      // delete a subtree that may still contain current bundled files.
      return true;
    }

    let hasEntries = false;
    for (const entry of entries) {
      const relativePath = relativeDir ? `${relativeDir}/${entry.name}` : entry.name;
      const fullPath = join(dir, entry.name);

      if (entry.isDirectory()) {
        const childHasEntries = walk(fullPath, relativePath);
        if (!childHasEntries) {
          try {
            // Child was verified empty; `recursive` is required for rmSync
            // to remove a directory at all.
            rmSync(fullPath, { recursive: true, force: true });
          } catch {
            // Removal failed, so the directory is still present.
            hasEntries = true;
          }
        } else {
          hasEntries = true;
        }
        continue;
      }

      if (
        relativePath === "SKILL.md" ||
        relativePath === SWARM_MARKER_FILE ||
        currentRelativeFiles.has(relativePath)
      ) {
        hasEntries = true;
        continue;
      }

      try {
        rmSync(fullPath, { force: true });
        removed++;
      } catch {
        // Best-effort: a file that cannot be removed keeps its parent alive.
        hasEntries = true;
      }
    }

    return hasEntries;
  };

  walk(skillDir);
  return removed;
}
|
|
88
|
+
|
|
32
89
|
/**
|
|
33
90
|
* Sync agent's installed skills to the filesystem.
|
|
34
91
|
*
|
|
35
92
|
* For simple skills (content in DB): writes SKILL.md to ~/.claude/skills/<name>/
|
|
36
|
-
* For complex skills
|
|
93
|
+
* For DB-backed complex skills: writes SKILL.md plus bundled skill_files rows.
|
|
94
|
+
* Legacy complex skills without skill_files remain handled by npx in entrypoint.
|
|
37
95
|
*/
|
|
38
96
|
export function syncSkillsToFilesystem(
|
|
39
97
|
agentId: string,
|
|
@@ -44,6 +102,7 @@ export function syncSkillsToFilesystem(
|
|
|
44
102
|
const home = homeOverride ?? homedir();
|
|
45
103
|
const errors: string[] = [];
|
|
46
104
|
let synced = 0;
|
|
105
|
+
let removed = 0;
|
|
47
106
|
|
|
48
107
|
// Directories to write to
|
|
49
108
|
const skillDirs: string[] = [];
|
|
@@ -67,7 +126,8 @@ export function syncSkillsToFilesystem(
|
|
|
67
126
|
|
|
68
127
|
for (const skill of skills) {
|
|
69
128
|
if (!skill.isActive || !skill.isEnabled) continue;
|
|
70
|
-
|
|
129
|
+
const bundledFiles = skill.isComplex ? getSkillFiles(skill.id) : [];
|
|
130
|
+
if (skill.isComplex && bundledFiles.length === 0) continue; // Legacy complex skills handled by npx
|
|
71
131
|
if (!skill.content) continue;
|
|
72
132
|
|
|
73
133
|
// Sanitize skill name to prevent path traversal (strip /, .., and non-safe chars)
|
|
@@ -75,6 +135,9 @@ export function syncSkillsToFilesystem(
|
|
|
75
135
|
if (!safeName) continue;
|
|
76
136
|
|
|
77
137
|
writtenNames.add(safeName);
|
|
138
|
+
const currentBundledFilePaths = new Set(
|
|
139
|
+
bundledFiles.filter((file) => !file.isBinary).map((file) => file.path),
|
|
140
|
+
);
|
|
78
141
|
|
|
79
142
|
for (const baseDir of skillDirs) {
|
|
80
143
|
const skillDir = join(baseDir, safeName);
|
|
@@ -83,14 +146,36 @@ export function syncSkillsToFilesystem(
|
|
|
83
146
|
|
|
84
147
|
try {
|
|
85
148
|
mkdirSync(skillDir, { recursive: true });
|
|
149
|
+
removed += reconcileManagedSkillFiles(skillDir, currentBundledFilePaths);
|
|
86
150
|
writeFileSync(skillFile, skill.content, "utf-8");
|
|
87
151
|
writeFileSync(markerFile, "", "utf-8");
|
|
88
152
|
synced++;
|
|
89
153
|
} catch (err) {
|
|
90
|
-
|
|
91
|
-
|
|
154
|
+
const msg = err instanceof Error ? err.message : "Unknown error";
|
|
155
|
+
errors.push(`${skill.name} -> ${skillDir}: ${msg}`);
|
|
156
|
+
console.error(
|
|
157
|
+
`[skill-sync] Failed to write SKILL.md for ${skill.name} to ${skillDir}: ${msg}`,
|
|
92
158
|
);
|
|
93
159
|
}
|
|
160
|
+
|
|
161
|
+
for (const file of bundledFiles) {
|
|
162
|
+
if (file.isBinary) {
|
|
163
|
+
console.log(`[skill-sync] Skipping binary skill file ${skill.name}/${file.path}`);
|
|
164
|
+
continue;
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
const targetPath = join(skillDir, file.path);
|
|
168
|
+
try {
|
|
169
|
+
mkdirSync(dirname(targetPath), { recursive: true });
|
|
170
|
+
writeFileSync(targetPath, file.content, "utf-8");
|
|
171
|
+
} catch (err) {
|
|
172
|
+
const msg = err instanceof Error ? err.message : "Unknown error";
|
|
173
|
+
errors.push(`${skill.name}/${file.path} -> ${targetPath}: ${msg}`);
|
|
174
|
+
console.error(
|
|
175
|
+
`[skill-sync] Failed to write bundled file ${skill.name}/${file.path} to ${targetPath}: ${msg}`,
|
|
176
|
+
);
|
|
177
|
+
}
|
|
178
|
+
}
|
|
94
179
|
}
|
|
95
180
|
}
|
|
96
181
|
|
|
@@ -98,7 +183,6 @@ export function syncSkillsToFilesystem(
|
|
|
98
183
|
// present). Leaves user-installed personal skills alone — important on
|
|
99
184
|
// local dev where ~/.codex/skills holds skills the user installed
|
|
100
185
|
// outside the swarm.
|
|
101
|
-
let removed = 0;
|
|
102
186
|
for (const baseDir of skillDirs) {
|
|
103
187
|
if (!existsSync(baseDir)) continue;
|
|
104
188
|
|
package/src/commands/runner.ts
CHANGED
|
@@ -3127,6 +3127,7 @@ async function checkCompletedProcesses(
|
|
|
3127
3127
|
state: RunnerState,
|
|
3128
3128
|
role: string,
|
|
3129
3129
|
apiConfig?: ApiConfig,
|
|
3130
|
+
cancelledSignaled?: Set<string>,
|
|
3130
3131
|
): Promise<void> {
|
|
3131
3132
|
const completedTasks: Array<{
|
|
3132
3133
|
taskId: string;
|
|
@@ -3161,6 +3162,9 @@ async function checkCompletedProcesses(
|
|
|
3161
3162
|
// Remove completed tasks from the map and ensure they're marked as finished
|
|
3162
3163
|
for (const { taskId, result, cursorUpdates, workingDir, credentialInfo } of completedTasks) {
|
|
3163
3164
|
state.activeTasks.delete(taskId);
|
|
3165
|
+
vcsDetectedTasks.delete(taskId);
|
|
3166
|
+
vcsCheckTimestamps.delete(taskId);
|
|
3167
|
+
cancelledSignaled?.delete(taskId);
|
|
3164
3168
|
|
|
3165
3169
|
if (apiConfig) {
|
|
3166
3170
|
removeActiveSession(apiConfig, taskId);
|
|
@@ -4122,7 +4126,7 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
|
|
|
4122
4126
|
|
|
4123
4127
|
// Wait if at capacity (though unlikely on fresh startup)
|
|
4124
4128
|
while (state.activeTasks.size >= state.maxConcurrent) {
|
|
4125
|
-
await checkCompletedProcesses(state, role, apiConfig);
|
|
4129
|
+
await checkCompletedProcesses(state, role, apiConfig, cancelledSignaled);
|
|
4126
4130
|
await Bun.sleep(1000);
|
|
4127
4131
|
}
|
|
4128
4132
|
|
|
@@ -4341,7 +4345,7 @@ export async function runAgent(config: RunnerConfig, opts: RunnerOptions) {
|
|
|
4341
4345
|
await pingServer(apiConfig, role);
|
|
4342
4346
|
|
|
4343
4347
|
// Check for completed processes first and ensure tasks are marked as finished
|
|
4344
|
-
await checkCompletedProcesses(state, role, apiConfig);
|
|
4348
|
+
await checkCompletedProcesses(state, role, apiConfig, cancelledSignaled);
|
|
4345
4349
|
|
|
4346
4350
|
// Live HARNESS_PROVIDER reconciliation. Re-fetches `swarm_config` (overlaid
|
|
4347
4351
|
// on env) and swaps the adapter if the resolved provider changed —
|
|
@@ -27,17 +27,19 @@ Goal: Review system status and your standing orders, take action if needed.
|
|
|
27
27
|
|
|
28
28
|
## Instructions
|
|
29
29
|
1. **Read your HEARTBEAT.md** — run \`read /workspace/HEARTBEAT.md\` to get the latest standing orders (the snapshot above may be slightly stale).
|
|
30
|
-
2.
|
|
31
|
-
3. **
|
|
30
|
+
2. **Prune tracked HEARTBEAT items FIRST.** Active Blockers + Watch Items + Open Discussion combined must stay at ≤10 items; 20 is the absolute max only when genuinely super busy. This cap does not apply to evergreen Standing Orders / Governance / Playbook-index reference sections. Before adding anything, re-check every tracked item against its lift trigger and remove anything resolved, stale, or past its trigger date. Lift incident detail to memory; never keep it inline in HEARTBEAT.md. Every new tracked item must include an explicit lift trigger + date; if it has no removal condition, do not add it. If the tracked list is already at/over cap, prune before (or instead of) adding — the cap is binding.
|
|
31
|
+
3. **Run seeded heartbeat data-gathering.** Use \`script-run\` with global script \`Heartbeat Audit\` and pass the current HEARTBEAT.md text as \`heartbeatMarkdown\`. It covers Rules #10/#13/#15/#16/#17: resolved stale PRs, pool-target risk schedules, schedule/provider failure clusters, and whether daily-blocker-digest ran today. The Slack thread-reply check (Rule #11) stays Lead-side; the script runtime has no Slack token.
|
|
32
|
+
4. Review the system status above plus the \`Heartbeat Audit\` result for anything that needs attention (stalled tasks, idle workers with available work, anomalies).
|
|
33
|
+
5. **CRITICAL — Reboot failure triage:** Failures with reason "worker session not found" or "worker session heartbeat is stale" indicate tasks that were INTERRUPTED by a server restart. These are NOT "expected auto-cleanup" — they represent work that was lost mid-execution. For each such failure:
|
|
32
34
|
- Check what the task was (via \`get-task-details\` with the task ID from the failure)
|
|
33
35
|
- If a retry task was auto-created (tagged \`reboot-retry\`), verify it is progressing
|
|
34
36
|
- If no retry exists and the work is still needed, re-create the task
|
|
35
37
|
- Do NOT dismiss these as "expected" or "auto-cleanup"
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
38
|
+
6. Review your standing orders for any periodic checks or actions.
|
|
39
|
+
7. If something needs attention — take action now using your available tools (create tasks, post to Slack, cancel stuck tasks, etc.).
|
|
40
|
+
8. If everything looks healthy and no standing orders are actionable — complete this task with a brief "All clear" summary. You may NOT say "All clear" if reboot-related failures exist that haven't been triaged.
|
|
41
|
+
9. Do NOT create another heartbeat-checklist task — the system handles scheduling.
|
|
42
|
+
10. **Update HEARTBEAT.md only after pruning.** Keep it current, but keep tracked items capped: remove resolved items, add only dated lift-triggered items, and lift detail to memory instead of growing the file.`,
|
|
41
43
|
variables: [
|
|
42
44
|
{
|
|
43
45
|
name: "system_status",
|
|
@@ -73,22 +75,24 @@ The API server has just restarted (deployment, pod rotation, or crash). An aggre
|
|
|
73
75
|
{{heartbeat_content}}
|
|
74
76
|
|
|
75
77
|
## Instructions
|
|
76
|
-
1. **
|
|
78
|
+
1. **Prune tracked HEARTBEAT items FIRST.** Active Blockers + Watch Items + Open Discussion combined must stay at ≤10 items; 20 is the absolute max only when genuinely super busy. This cap does not apply to evergreen Standing Orders / Governance / Playbook-index reference sections. Before adding anything, re-check every tracked item against its lift trigger and remove anything resolved, stale, or past its trigger date. Lift incident detail to memory; never keep it inline in HEARTBEAT.md. Every new tracked item must include an explicit lift trigger + date; if it has no removal condition, do not add it. If the tracked list is already at/over cap, prune before (or instead of) adding — the cap is binding.
|
|
79
|
+
2. **Run seeded boot triage.** Use \`script-run\` with global script \`boot-triage\` to gather deploy-restart PR context, recent real failures, stuck offline-agent work, orphaned pending/offered tasks, and superseded tasks missing resume children in one read-only call.
|
|
80
|
+
3. **Triage reboot-interrupted work FIRST.** If the "Reboot-Interrupted Work" section above or the \`boot-triage\` result lists tasks:
|
|
77
81
|
- For each task: verify the retry is progressing via \`get-task-details\` with the retry task ID
|
|
78
82
|
- If a retry failed or is stuck, re-create the task manually
|
|
79
83
|
- If the work is no longer needed, cancel the retry task
|
|
80
84
|
- You MUST address every item — do NOT skip this section
|
|
81
|
-
|
|
85
|
+
4. **Verify supersede + resume worked end-to-end.** Worker crashes / OOMs are recovered via supersede (parent → \`superseded\`) + a fresh \`taskType=resume\` child created by the heartbeat sweep (DES-523). Sanity check:
|
|
82
86
|
- List recent \`superseded\` tasks: \`list-tasks status=superseded\` (last ~hour).
|
|
83
87
|
- For each, confirm a child task with \`taskType=resume\` and a non-terminal status exists. If a superseded task is missing its resume child, the work is silently dropped — recreate the task manually.
|
|
84
88
|
- Look for \`in_progress\` tasks older than 5 min on agents that show as offline — the sweep should have caught them. If any remain, recreate as needed.
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
89
|
+
5. **Check orphaned tasks.** If the "Orphaned Tasks" section or \`boot-triage\` result lists pending/offered tasks assigned to offline workers, re-assign or cancel them.
|
|
90
|
+
6. Review agent status — are all expected workers online? If not, note which are missing.
|
|
91
|
+
7. Review your standing orders for any post-reboot checks.
|
|
92
|
+
8. Take action using your available tools.
|
|
93
|
+
9. Complete this task with a summary of what you found and what actions you took. Include the status of each reboot-interrupted task.
|
|
94
|
+
10. Do NOT create another boot-triage task — this is a one-off event.
|
|
95
|
+
11. **Update HEARTBEAT.md only after pruning.** If the reboot revealed a pattern worth monitoring, add it only as a dated lift-triggered tracked item and only if the cap still holds after pruning. Lift incident detail to memory instead of growing HEARTBEAT.md.`,
|
|
92
96
|
variables: [
|
|
93
97
|
{
|
|
94
98
|
name: "system_status",
|
package/src/http/index.ts
CHANGED
|
@@ -43,12 +43,17 @@ import { handleHeartbeat } from "./heartbeat";
|
|
|
43
43
|
import { handleInboxState } from "./inbox-state";
|
|
44
44
|
import { handleIntegrations } from "./integrations";
|
|
45
45
|
import { handleKv } from "./kv";
|
|
46
|
-
import {
|
|
46
|
+
import {
|
|
47
|
+
closeIdleMcpTransports,
|
|
48
|
+
DEFAULT_MCP_TRANSPORT_IDLE_TIMEOUT_MS,
|
|
49
|
+
handleMcp,
|
|
50
|
+
type McpTransportActivity,
|
|
51
|
+
} from "./mcp";
|
|
47
52
|
import { handleMcpBridge } from "./mcp-bridge";
|
|
48
53
|
import { handleMcpOAuth, startMcpOAuthPendingGc, stopMcpOAuthPendingGc } from "./mcp-oauth";
|
|
49
54
|
import { handleMcpServers } from "./mcp-servers";
|
|
50
|
-
import { handleMcpUser } from "./mcp-user";
|
|
51
|
-
import { handleMemory } from "./memory";
|
|
55
|
+
import { closeIdleMcpUserTransports, handleMcpUser } from "./mcp-user";
|
|
56
|
+
import { handleMemory, startMemoryGc, stopMemoryGc } from "./memory";
|
|
52
57
|
import { handleMetrics } from "./metrics";
|
|
53
58
|
import { handlePageProxy } from "./page-proxy";
|
|
54
59
|
import { handlePages } from "./pages";
|
|
@@ -99,12 +104,15 @@ const globalState = globalThis as typeof globalThis & {
|
|
|
99
104
|
__transports?: Record<string, StreamableHTTPServerTransport>;
|
|
100
105
|
__transportsUser?: Record<string, StreamableHTTPServerTransport>;
|
|
101
106
|
__sessionUsers?: Record<string, string>;
|
|
107
|
+
__transportActivity?: McpTransportActivity;
|
|
108
|
+
__transportActivityUser?: McpTransportActivity;
|
|
102
109
|
__sigintRegistered?: boolean;
|
|
103
110
|
__apiGcInterval?: ReturnType<typeof setInterval>;
|
|
104
111
|
__runId?: string;
|
|
105
112
|
};
|
|
106
113
|
|
|
107
114
|
const API_GC_INTERVAL_MS = 5 * 60 * 1000;
|
|
115
|
+
const MCP_TRANSPORT_IDLE_TIMEOUT_MS = DEFAULT_MCP_TRANSPORT_IDLE_TIMEOUT_MS;
|
|
108
116
|
|
|
109
117
|
type GcCapableGlobal = typeof globalThis & { gc?: () => void };
|
|
110
118
|
|
|
@@ -130,11 +138,25 @@ function startApiGcInterval() {
|
|
|
130
138
|
|
|
131
139
|
const gc = (globalThis as GcCapableGlobal).gc;
|
|
132
140
|
if (typeof gc !== "function") {
|
|
133
|
-
console.log("[HTTP] Explicit GC unavailable;
|
|
134
|
-
return;
|
|
141
|
+
console.log("[HTTP] Explicit GC unavailable; idle MCP transport sweeps remain enabled");
|
|
135
142
|
}
|
|
136
143
|
|
|
137
144
|
const interval = setInterval(() => {
|
|
145
|
+
const closedOwnerTransports = closeIdleMcpTransports(transports, transportActivity, {
|
|
146
|
+
idleTimeoutMs: MCP_TRANSPORT_IDLE_TIMEOUT_MS,
|
|
147
|
+
label: "MCP",
|
|
148
|
+
});
|
|
149
|
+
const closedUserTransports = closeIdleMcpUserTransports(
|
|
150
|
+
transportsUser,
|
|
151
|
+
sessionUsers,
|
|
152
|
+
transportActivityUser,
|
|
153
|
+
{ idleTimeoutMs: MCP_TRANSPORT_IDLE_TIMEOUT_MS },
|
|
154
|
+
);
|
|
155
|
+
if (closedOwnerTransports > 0 || closedUserTransports > 0) {
|
|
156
|
+
console.log(
|
|
157
|
+
`[HTTP] Closed ${closedOwnerTransports} owner MCP and ${closedUserTransports} user MCP idle transport(s)`,
|
|
158
|
+
);
|
|
159
|
+
}
|
|
138
160
|
scheduleApiGc("periodic API sweep");
|
|
139
161
|
}, API_GC_INTERVAL_MS);
|
|
140
162
|
interval.unref?.();
|
|
@@ -151,6 +173,8 @@ const transports: Record<string, StreamableHTTPServerTransport> = globalState.__
|
|
|
151
173
|
const transportsUser: Record<string, StreamableHTTPServerTransport> =
|
|
152
174
|
globalState.__transportsUser ?? {};
|
|
153
175
|
const sessionUsers: Record<string, string> = globalState.__sessionUsers ?? {};
|
|
176
|
+
const transportActivity: McpTransportActivity = globalState.__transportActivity ?? {};
|
|
177
|
+
const transportActivityUser: McpTransportActivity = globalState.__transportActivityUser ?? {};
|
|
154
178
|
|
|
155
179
|
const httpServer = createHttpServer(async (req, res) => {
|
|
156
180
|
const startTime = performance.now();
|
|
@@ -291,8 +315,8 @@ const httpServer = createHttpServer(async (req, res) => {
|
|
|
291
315
|
() => handleSessions(req, res, pathSegments, queryParams),
|
|
292
316
|
() => handleInboxState(req, res, pathSegments, queryParams),
|
|
293
317
|
() => handleTaskTemplates(req, res, pathSegments, queryParams),
|
|
294
|
-
() => handleMcp(req, res, transports),
|
|
295
|
-
() => handleMcpUser(req, res, transportsUser, sessionUsers),
|
|
318
|
+
() => handleMcp(req, res, transports, transportActivity),
|
|
319
|
+
() => handleMcpUser(req, res, transportsUser, sessionUsers, transportActivityUser),
|
|
296
320
|
];
|
|
297
321
|
|
|
298
322
|
try {
|
|
@@ -334,6 +358,8 @@ globalState.__httpServer = httpServer;
|
|
|
334
358
|
globalState.__transports = transports;
|
|
335
359
|
globalState.__transportsUser = transportsUser;
|
|
336
360
|
globalState.__sessionUsers = sessionUsers;
|
|
361
|
+
globalState.__transportActivity = transportActivity;
|
|
362
|
+
globalState.__transportActivityUser = transportActivityUser;
|
|
337
363
|
|
|
338
364
|
async function shutdown() {
|
|
339
365
|
console.log("Shutting down HTTP server...");
|
|
@@ -362,6 +388,9 @@ async function shutdown() {
|
|
|
362
388
|
// Stop MCP OAuth pending-session garbage collector
|
|
363
389
|
stopMcpOAuthPendingGc();
|
|
364
390
|
|
|
391
|
+
// Stop memory expired-row garbage collector
|
|
392
|
+
stopMemoryGc();
|
|
393
|
+
|
|
365
394
|
if (globalState.__apiGcInterval) {
|
|
366
395
|
clearInterval(globalState.__apiGcInterval);
|
|
367
396
|
delete globalState.__apiGcInterval;
|
|
@@ -372,6 +401,7 @@ async function shutdown() {
|
|
|
372
401
|
console.log(`[HTTP] Closing transport ${id}`);
|
|
373
402
|
transport.close();
|
|
374
403
|
delete transports[id];
|
|
404
|
+
delete transportActivity[id];
|
|
375
405
|
}
|
|
376
406
|
|
|
377
407
|
for (const [id, transport] of Object.entries(transportsUser)) {
|
|
@@ -379,6 +409,7 @@ async function shutdown() {
|
|
|
379
409
|
transport.close();
|
|
380
410
|
delete transportsUser[id];
|
|
381
411
|
delete sessionUsers[id];
|
|
412
|
+
delete transportActivityUser[id];
|
|
382
413
|
}
|
|
383
414
|
|
|
384
415
|
// Close all active connections forcefully
|
|
@@ -522,6 +553,9 @@ httpServer
|
|
|
522
553
|
|
|
523
554
|
// Start MCP OAuth pending-session garbage collector (5-min tick)
|
|
524
555
|
startMcpOAuthPendingGc();
|
|
556
|
+
|
|
557
|
+
// Start expired-memory garbage collector (1-hour tick, immediate first run)
|
|
558
|
+
startMemoryGc();
|
|
525
559
|
})
|
|
526
560
|
.on("error", (err) => {
|
|
527
561
|
console.error("HTTP Server Error:", err);
|
package/src/http/mcp-user.ts
CHANGED
|
@@ -5,6 +5,7 @@ import { isInitializeRequest } from "@modelcontextprotocol/sdk/types.js";
|
|
|
5
5
|
import { resolveUserByToken } from "@/be/users";
|
|
6
6
|
import { createUserServer } from "@/server-user";
|
|
7
7
|
import type { User } from "@/types";
|
|
8
|
+
import { closeIdleMcpTransports, type McpTransportActivity, markMcpTransportActivity } from "./mcp";
|
|
8
9
|
|
|
9
10
|
function unauthorized(res: ServerResponse): true {
|
|
10
11
|
res.writeHead(401, { "Content-Type": "application/json" });
|
|
@@ -32,6 +33,7 @@ export async function handleMcpUser(
|
|
|
32
33
|
res: ServerResponse,
|
|
33
34
|
transports: Record<string, StreamableHTTPServerTransport>,
|
|
34
35
|
sessionUsers: Record<string, string>,
|
|
36
|
+
sessionActivity: McpTransportActivity = {},
|
|
35
37
|
): Promise<boolean> {
|
|
36
38
|
const sessionId = req.headers["mcp-session-id"] as string | undefined;
|
|
37
39
|
|
|
@@ -57,16 +59,19 @@ export async function handleMcpUser(
|
|
|
57
59
|
|
|
58
60
|
if (sessionId && transports[sessionId]) {
|
|
59
61
|
transport = transports[sessionId];
|
|
62
|
+
markMcpTransportActivity(sessionActivity, sessionId);
|
|
60
63
|
} else if (!sessionId && isInitializeRequest(body)) {
|
|
61
64
|
transport = new StreamableHTTPServerTransport({
|
|
62
65
|
sessionIdGenerator: () => randomUUID(),
|
|
63
66
|
onsessioninitialized: (id) => {
|
|
64
67
|
transports[id] = transport;
|
|
65
68
|
sessionUsers[id] = user.id;
|
|
69
|
+
markMcpTransportActivity(sessionActivity, id);
|
|
66
70
|
},
|
|
67
71
|
onsessionclosed: (id) => {
|
|
68
72
|
delete transports[id];
|
|
69
73
|
delete sessionUsers[id];
|
|
74
|
+
delete sessionActivity[id];
|
|
70
75
|
},
|
|
71
76
|
});
|
|
72
77
|
|
|
@@ -74,6 +79,7 @@ export async function handleMcpUser(
|
|
|
74
79
|
if (transport.sessionId) {
|
|
75
80
|
delete transports[transport.sessionId];
|
|
76
81
|
delete sessionUsers[transport.sessionId];
|
|
82
|
+
delete sessionActivity[transport.sessionId];
|
|
77
83
|
}
|
|
78
84
|
};
|
|
79
85
|
|
|
@@ -92,11 +98,13 @@ export async function handleMcpUser(
|
|
|
92
98
|
}
|
|
93
99
|
|
|
94
100
|
await transport.handleRequest(req, res, body);
|
|
101
|
+
markMcpTransportActivity(sessionActivity, transport.sessionId);
|
|
95
102
|
return true;
|
|
96
103
|
}
|
|
97
104
|
|
|
98
105
|
if (req.method === "GET" || req.method === "DELETE") {
|
|
99
106
|
if (sessionId && transports[sessionId]) {
|
|
107
|
+
markMcpTransportActivity(sessionActivity, sessionId);
|
|
100
108
|
await transports[sessionId].handleRequest(req, res);
|
|
101
109
|
return true;
|
|
102
110
|
}
|
|
@@ -109,3 +117,18 @@ export async function handleMcpUser(
|
|
|
109
117
|
res.end("Method not allowed");
|
|
110
118
|
return true;
|
|
111
119
|
}
|
|
120
|
+
|
|
121
|
+
/**
 * Idle-transport sweep for the user MCP endpoint.
 *
 * Thin wrapper: forwards to `closeIdleMcpTransports` (imported from "./mcp")
 * with the label "user MCP" and an `onClose` hook that deletes the session's
 * entry from `sessionUsers`. The idle test and the closing of transports are
 * done by that helper, not here.
 *
 * @param transports - Session id → transport map, passed to the helper as-is.
 * @param sessionUsers - Session id → user id map; the entry for an id is
 *   deleted when the helper invokes `onClose` for it.
 * @param sessionActivity - Activity record passed to the helper as-is.
 * @param options - Optional `now` / `idleTimeoutMs`, forwarded unchanged;
 *   their exact semantics are defined by the helper.
 * @returns The helper's return value — presumably the number of transports
 *   closed (TODO confirm against `closeIdleMcpTransports`).
 */
export function closeIdleMcpUserTransports(
|
|
122
|
+
transports: Record<string, StreamableHTTPServerTransport>,
|
|
123
|
+
sessionUsers: Record<string, string>,
|
|
124
|
+
sessionActivity: McpTransportActivity,
|
|
125
|
+
options: { now?: number; idleTimeoutMs?: number } = {},
|
|
126
|
+
): number {
|
|
127
|
+
return closeIdleMcpTransports(transports, sessionActivity, {
|
|
128
|
+
// Spread first so the label and onClose set below override any same-named keys.
...options,
|
|
129
|
+
label: "user MCP",
|
|
130
|
+
// Called by the helper with a session id (presumably once per transport it closes — confirm).
onClose: (id) => {
|
|
131
|
+
// Keep sessionUsers in step with the transports map: drop this session's user entry.
delete sessionUsers[id];
|
|
132
|
+
},
|
|
133
|
+
});
|
|
134
|
+
}
|