@cleocode/core 2026.3.57 → 2026.3.59
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agents/agent-registry.d.ts +206 -0
- package/dist/agents/agent-registry.d.ts.map +1 -0
- package/dist/agents/agent-schema.d.ts.map +1 -1
- package/dist/agents/execution-learning.d.ts +223 -0
- package/dist/agents/execution-learning.d.ts.map +1 -0
- package/dist/agents/health-monitor.d.ts +161 -0
- package/dist/agents/health-monitor.d.ts.map +1 -0
- package/dist/agents/index.d.ts +4 -1
- package/dist/agents/index.d.ts.map +1 -1
- package/dist/agents/retry.d.ts +57 -4
- package/dist/agents/retry.d.ts.map +1 -1
- package/dist/backfill/index.d.ts +83 -0
- package/dist/backfill/index.d.ts.map +1 -0
- package/dist/bootstrap.d.ts +1 -1
- package/dist/config.d.ts +47 -0
- package/dist/config.d.ts.map +1 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +6985 -5068
- package/dist/index.js.map +4 -4
- package/dist/intelligence/adaptive-validation.d.ts +151 -0
- package/dist/intelligence/adaptive-validation.d.ts.map +1 -0
- package/dist/intelligence/impact.d.ts +34 -1
- package/dist/intelligence/impact.d.ts.map +1 -1
- package/dist/intelligence/index.d.ts +7 -2
- package/dist/intelligence/index.d.ts.map +1 -1
- package/dist/intelligence/types.d.ts +60 -0
- package/dist/intelligence/types.d.ts.map +1 -1
- package/dist/internal.d.ts +8 -4
- package/dist/internal.d.ts.map +1 -1
- package/dist/lib/index.d.ts +10 -0
- package/dist/lib/index.d.ts.map +1 -0
- package/dist/lib/retry.d.ts +128 -0
- package/dist/lib/retry.d.ts.map +1 -0
- package/dist/nexus/sharing/index.d.ts +48 -2
- package/dist/nexus/sharing/index.d.ts.map +1 -1
- package/dist/sessions/session-enforcement.d.ts.map +1 -1
- package/dist/stats/index.d.ts +1 -0
- package/dist/stats/index.d.ts.map +1 -1
- package/dist/stats/workflow-telemetry.d.ts +89 -0
- package/dist/stats/workflow-telemetry.d.ts.map +1 -0
- package/dist/store/brain-schema.d.ts.map +1 -1
- package/dist/store/converters.d.ts.map +1 -1
- package/dist/store/cross-db-cleanup.d.ts +93 -0
- package/dist/store/cross-db-cleanup.d.ts.map +1 -0
- package/dist/store/db-helpers.d.ts.map +1 -1
- package/dist/store/migration-sqlite.d.ts.map +1 -1
- package/dist/store/sqlite-data-accessor.d.ts.map +1 -1
- package/dist/store/sqlite.d.ts.map +1 -1
- package/dist/store/task-store.d.ts.map +1 -1
- package/dist/store/tasks-schema.d.ts +18 -3
- package/dist/store/tasks-schema.d.ts.map +1 -1
- package/dist/store/validation-schemas.d.ts +32 -0
- package/dist/store/validation-schemas.d.ts.map +1 -1
- package/dist/tasks/add.d.ts +10 -1
- package/dist/tasks/add.d.ts.map +1 -1
- package/dist/tasks/complete.d.ts.map +1 -1
- package/dist/tasks/enforcement.d.ts +22 -0
- package/dist/tasks/enforcement.d.ts.map +1 -0
- package/dist/tasks/epic-enforcement.d.ts +199 -0
- package/dist/tasks/epic-enforcement.d.ts.map +1 -0
- package/dist/tasks/index.d.ts +1 -1
- package/dist/tasks/index.d.ts.map +1 -1
- package/dist/tasks/pipeline-stage.d.ts +181 -0
- package/dist/tasks/pipeline-stage.d.ts.map +1 -0
- package/dist/tasks/update.d.ts +2 -0
- package/dist/tasks/update.d.ts.map +1 -1
- package/migrations/drizzle-brain/20260321000001_t033-brain-indexes/migration.sql +12 -0
- package/migrations/drizzle-brain/20260321000001_t033-brain-indexes/snapshot.json +1232 -0
- package/migrations/drizzle-tasks/20260321000000_t033-connection-health/migration.sql +518 -0
- package/migrations/drizzle-tasks/20260321000000_t033-connection-health/snapshot.json +4312 -0
- package/migrations/drizzle-tasks/20260321000002_t060-pipeline-stage-binding/migration.sql +82 -0
- package/migrations/drizzle-tasks/20260321000002_t060-pipeline-stage-binding/snapshot.json +9 -0
- package/package.json +5 -5
- package/schemas/config.schema.json +37 -1547
- package/src/__tests__/sharing.test.ts +24 -0
- package/src/agents/__tests__/agent-registry.test.ts +351 -0
- package/src/agents/__tests__/execution-learning.test.ts +684 -0
- package/src/agents/__tests__/health-monitor.test.ts +332 -0
- package/src/agents/__tests__/registry.test.ts +30 -2
- package/src/agents/agent-registry.ts +394 -0
- package/src/agents/agent-schema.ts +5 -0
- package/src/agents/execution-learning.ts +675 -0
- package/src/agents/health-monitor.ts +279 -0
- package/src/agents/index.ts +37 -1
- package/src/agents/retry.ts +57 -4
- package/src/backfill/index.ts +309 -0
- package/src/bootstrap.ts +1 -1
- package/src/config.ts +126 -0
- package/src/index.ts +8 -1
- package/src/intelligence/__tests__/adaptive-validation.test.ts +694 -0
- package/src/intelligence/__tests__/impact.test.ts +165 -1
- package/src/intelligence/adaptive-validation.ts +764 -0
- package/src/intelligence/impact.ts +203 -0
- package/src/intelligence/index.ts +19 -0
- package/src/intelligence/types.ts +76 -0
- package/src/internal.ts +39 -0
- package/src/lib/__tests__/retry.test.ts +321 -0
- package/src/lib/index.ts +16 -0
- package/src/lib/retry.ts +224 -0
- package/src/lifecycle/__tests__/chain-store.test.ts +7 -0
- package/src/lifecycle/__tests__/tessera-engine.test.ts +52 -0
- package/src/nexus/sharing/index.ts +142 -2
- package/src/sessions/__tests__/session-edge-cases.test.ts +24 -1
- package/src/sessions/session-enforcement.ts +13 -2
- package/src/stats/index.ts +7 -0
- package/src/stats/workflow-telemetry.ts +502 -0
- package/src/store/__tests__/migration-safety.test.ts +3 -0
- package/src/store/__tests__/session-store.test.ts +132 -1
- package/src/store/__tests__/task-store.test.ts +22 -1
- package/src/store/__tests__/test-db-helper.ts +29 -2
- package/src/store/brain-schema.ts +4 -1
- package/src/store/converters.ts +2 -0
- package/src/store/cross-db-cleanup.ts +192 -0
- package/src/store/db-helpers.ts +2 -0
- package/src/store/migration-sqlite.ts +6 -0
- package/src/store/sqlite-data-accessor.ts +20 -28
- package/src/store/sqlite.ts +14 -2
- package/src/store/task-store.ts +6 -0
- package/src/store/tasks-schema.ts +59 -20
- package/src/tasks/__tests__/add.test.ts +16 -0
- package/src/tasks/__tests__/complete-unblocks.test.ts +10 -1
- package/src/tasks/__tests__/complete.test.ts +11 -2
- package/src/tasks/__tests__/epic-enforcement.test.ts +909 -0
- package/src/tasks/__tests__/minimal-test.test.ts +28 -0
- package/src/tasks/__tests__/pipeline-stage.test.ts +403 -0
- package/src/tasks/__tests__/update.test.ts +40 -6
- package/src/tasks/add.ts +128 -2
- package/src/tasks/complete.ts +29 -17
- package/src/tasks/enforcement.ts +127 -0
- package/src/tasks/epic-enforcement.ts +364 -0
- package/src/tasks/index.ts +1 -0
- package/src/tasks/pipeline-stage.ts +293 -0
- package/src/tasks/update.ts +62 -0
- package/templates/config.template.json +34 -111
- package/templates/global-config.template.json +24 -40
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Agent Health Monitoring -- Heartbeat and crash detection for live agent instances.
|
|
3
|
+
*
|
|
4
|
+
* Provides the public-facing health API specified by T039:
|
|
5
|
+
* - `recordHeartbeat` — update last_heartbeat for a live agent
|
|
6
|
+
* - `checkAgentHealth` — check health of a specific agent by ID
|
|
7
|
+
* - `detectStaleAgents` — find agents whose heartbeat is older than threshold
|
|
8
|
+
* - `detectCrashedAgents` — find active agents with no heartbeat for >3 min and mark them crashed
|
|
9
|
+
*
|
|
10
|
+
* These functions delegate to the lower-level `registry.ts` primitives
|
|
11
|
+
* (`heartbeat`, `listAgentInstances`, `markCrashed`) and add the
|
|
12
|
+
* named, task-spec-aligned surface required for T039.
|
|
13
|
+
*
|
|
14
|
+
* @module agents/health-monitor
|
|
15
|
+
* @task T039
|
|
16
|
+
* @epic T038
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
import type { AgentInstanceRow, AgentInstanceStatus } from './agent-schema.js';
|
|
20
|
+
import { heartbeat, listAgentInstances, markCrashed } from './registry.js';
|
|
21
|
+
|
|
22
|
+
// ============================================================================
|
|
23
|
+
// Constants
|
|
24
|
+
// ============================================================================
|
|
25
|
+
|
|
26
|
+
/**
 * Interval, in milliseconds, at which a live agent is expected to send a
 * heartbeat (30 seconds, per the BRAIN spec).
 */
export const HEARTBEAT_INTERVAL_MS = 30_000;

/**
 * Default staleness threshold, in milliseconds: an agent that has gone
 * 3 minutes without a heartbeat is treated as stale.
 */
export const STALE_THRESHOLD_MS = 3 * 60 * 1_000;
|
|
31
|
+
|
|
32
|
+
/**
 * Agent statuses that count as "alive" for health-check purposes.
 * Used both as the status filter when listing candidates for staleness and
 * as the membership test when a health snapshot is built.
 */
const ALIVE_STATUSES: AgentInstanceStatus[] = [
  'starting',
  'active',
  'idle',
];
|
|
34
|
+
|
|
35
|
+
// ============================================================================
|
|
36
|
+
// Types
|
|
37
|
+
// ============================================================================
|
|
38
|
+
|
|
39
|
+
/**
 * Point-in-time health snapshot of a single agent instance, evaluated
 * against a staleness threshold.
 */
export interface AgentHealthStatus {
  /** ID of the agent instance this snapshot describes. */
  agentId: string;
  /** Status stored on the agent's database row when it was read. */
  status: AgentInstanceStatus;
  /** Timestamp (ISO string) of the last recorded heartbeat, copied from the row. */
  lastHeartbeat: string;
  /** Milliseconds elapsed since `lastHeartbeat`, measured when the snapshot was built. */
  heartbeatAgeMs: number;
  /** `true` when the agent is in an alive status and is not stale. */
  healthy: boolean;
  /** `true` when the agent is in an alive status and `heartbeatAgeMs` exceeds `thresholdMs`. */
  stale: boolean;
  /** Staleness threshold (ms) that `healthy` and `stale` were evaluated against. */
  thresholdMs: number;
}
|
|
58
|
+
|
|
59
|
+
// ============================================================================
|
|
60
|
+
// recordHeartbeat
|
|
61
|
+
// ============================================================================
|
|
62
|
+
|
|
63
|
+
/**
 * Signal that an agent instance is still alive.
 *
 * Thin wrapper that gives the registry's `heartbeat` primitive the name
 * required by the T039 task spec: the arguments are forwarded unchanged and
 * whatever the registry resolves to is handed back to the caller.
 *
 * Long-running agents are expected to call this once every
 * {@link HEARTBEAT_INTERVAL_MS} (30 s) from their main loop.
 *
 * @param agentId - The agent instance ID (e.g. `agt_20260322120000_a1b2c3`)
 * @param cwd - Working directory used to resolve the tasks.db path (optional)
 * @returns The status reported by the registry, or `null` when the registry
 *   reports none
 *
 * @remarks
 * The handling of agents already in a terminal state (`stopped` / `crashed`)
 * is decided entirely by `heartbeat` in `registry.ts`.
 * NOTE(review): earlier documentation described both "returns `null`" and
 * "returns the existing status" for terminal agents — confirm against
 * `registry.heartbeat`. Callers that want to react to an external shutdown
 * should treat `null` and terminal statuses alike, as in the example.
 *
 * @example
 * ```ts
 * // Inside the agent's main loop:
 * const heartbeatTimer = setInterval(async () => {
 *   const status = await recordHeartbeat(agentId);
 *   if (status === 'stopped' || status === null) {
 *     // Orchestrator shut us down — exit cleanly
 *     clearInterval(heartbeatTimer);
 *     process.exit(0);
 *   }
 * }, HEARTBEAT_INTERVAL_MS);
 * ```
 */
export async function recordHeartbeat(
  agentId: string,
  cwd?: string,
): Promise<AgentInstanceStatus | null> {
  const reportedStatus = await heartbeat(agentId, cwd);
  return reportedStatus;
}
|
|
101
|
+
|
|
102
|
+
// ============================================================================
|
|
103
|
+
// checkAgentHealth
|
|
104
|
+
// ============================================================================
|
|
105
|
+
|
|
106
|
+
/**
 * Look up one agent instance by ID and report its health.
 *
 * Lists the agent instances, picks the first one whose `id` equals
 * `agentId`, and converts it into an {@link AgentHealthStatus} evaluated
 * against `thresholdMs`.
 *
 * @param agentId - The agent instance ID to check
 * @param thresholdMs - Staleness threshold in milliseconds (default: 3 minutes)
 * @param cwd - Working directory used to resolve the tasks.db path (optional)
 * @returns The agent's {@link AgentHealthStatus}, or `null` when no listed
 *   instance has that ID
 *
 * @remarks
 * The lookup is a linear scan over the listing returned by
 * `listAgentInstances`; no per-ID query is issued.
 *
 * @example
 * ```ts
 * const health = await checkAgentHealth('agt_20260322120000_a1b2c3');
 * if (health && health.stale) {
 *   console.log(`Agent stale for ${health.heartbeatAgeMs}ms — presumed crashed`);
 * }
 * ```
 */
export async function checkAgentHealth(
  agentId: string,
  thresholdMs: number = STALE_THRESHOLD_MS,
  cwd?: string,
): Promise<AgentHealthStatus | null> {
  // `undefined` filter — presumably lists instances of every status; verify in registry.ts.
  const instances = await listAgentInstances(undefined, cwd);

  for (const instance of instances) {
    if (instance.id === agentId) {
      return buildHealthStatus(instance, thresholdMs);
    }
  }

  return null;
}
|
|
143
|
+
|
|
144
|
+
// ============================================================================
|
|
145
|
+
// detectStaleAgents
|
|
146
|
+
// ============================================================================
|
|
147
|
+
|
|
148
|
+
/**
 * List every alive agent whose last heartbeat is older than `thresholdMs`.
 *
 * Only agents with status `starting`, `active`, or `idle` are requested from
 * the registry; each one is turned into a health snapshot and kept when that
 * snapshot is flagged stale. Staleness is a precursor to crash detection —
 * a stale agent may still recover if it is merely under heavy load.
 *
 * This function is read-only: it never changes an agent's status.
 *
 * @param thresholdMs - Staleness threshold in ms (default: 3 minutes / 180 000 ms)
 * @param cwd - Working directory used to resolve the tasks.db path (optional)
 * @returns One {@link AgentHealthStatus} per stale agent, ordered by
 *   heartbeat age descending (most-stale first)
 *
 * @remarks
 * The default threshold matches the crash-detection window specified in T039:
 * "timeout detection after 3 minutes".
 *
 * @example
 * ```ts
 * const stale = await detectStaleAgents();
 * for (const s of stale) {
 *   console.log(`${s.agentId} has been stale for ${s.heartbeatAgeMs / 1000}s`);
 * }
 * ```
 */
export async function detectStaleAgents(
  thresholdMs: number = STALE_THRESHOLD_MS,
  cwd?: string,
): Promise<AgentHealthStatus[]> {
  const liveAgents = await listAgentInstances({ status: ALIVE_STATUSES }, cwd);

  const staleReports: AgentHealthStatus[] = [];
  for (const liveAgent of liveAgents) {
    const report = buildHealthStatus(liveAgent, thresholdMs);
    if (report.stale) {
      staleReports.push(report);
    }
  }

  // Largest heartbeat age first.
  staleReports.sort((left, right) => right.heartbeatAgeMs - left.heartbeatAgeMs);
  return staleReports;
}
|
|
186
|
+
|
|
187
|
+
// ============================================================================
|
|
188
|
+
// detectCrashedAgents
|
|
189
|
+
// ============================================================================
|
|
190
|
+
|
|
191
|
+
/**
 * Find agents with status `active` whose heartbeat has been silent for
 * longer than `thresholdMs`, and mark them as `crashed` in the database.
 *
 * An agent is considered crashed when it:
 *   1. Has status `active` (not `idle`, `starting`, `stopped`, or `crashed`)
 *   2. Has a last heartbeat that parses to a time more than `thresholdMs`
 *      milliseconds in the past
 *
 * Each detected agent is immediately passed to {@link markCrashed} together
 * with a "Heartbeat timeout" reason. (Per the registry's documentation this
 * also increments the error count and writes to `agent_error_log` — verify
 * in `registry.ts`.)
 *
 * @param thresholdMs - Crash threshold in ms (default: 3 minutes / 180 000 ms)
 * @param cwd - Working directory used to resolve the tasks.db path (optional)
 * @returns The rows returned by {@link markCrashed} for each agent that was
 *   just marked, sorted by `lastHeartbeat` ascending (oldest first).
 *
 * @remarks
 * This function is WRITE-side: it mutates the database. Callers should run
 * it on a schedule (e.g. every 60 s) from an orchestrator or health watchdog.
 * For a read-only view, use {@link detectStaleAgents} instead.
 *
 * Heartbeats are compared as epoch milliseconds, the same way the read-side
 * health snapshot computes heartbeat age, so both detectors agree on which
 * agents are past the threshold regardless of the exact string format of the
 * stored timestamp. A heartbeat that cannot be parsed is skipped rather than
 * marked crashed.
 *
 * @example
 * ```ts
 * // Inside an orchestrator health watchdog:
 * const crashed = await detectCrashedAgents();
 * if (crashed.length > 0) {
 *   logger.warn({ crashed: crashed.map(a => a.id) }, 'Agents marked crashed');
 * }
 * ```
 */
export async function detectCrashedAgents(
  thresholdMs: number = STALE_THRESHOLD_MS,
  cwd?: string,
): Promise<AgentInstanceRow[]> {
  // Only consider agents that are explicitly 'active' — idle/starting agents
  // may not yet have established a regular heartbeat interval.
  const activeAgents = await listAgentInstances({ status: 'active' }, cwd);

  // Numeric cutoff instead of an ISO string: avoids relying on lexicographic
  // ordering of timestamp strings and cannot throw for a non-finite threshold.
  const cutoffMs = Date.now() - thresholdMs;

  // Loop-invariant, so build the reason once.
  const timeoutReason = `Heartbeat timeout — no heartbeat for >${Math.round(thresholdMs / 1000)}s`;

  const crashed: AgentInstanceRow[] = [];

  for (const agent of activeAgents) {
    const lastHeartbeatMs = new Date(agent.lastHeartbeat).getTime();

    // NaN (unparseable timestamp) compares false here and is therefore skipped.
    if (!(lastHeartbeatMs < cutoffMs)) continue;

    // Writes are issued one at a time, in listing order.
    const updated = await markCrashed(agent.id, timeoutReason, cwd);
    if (updated) {
      crashed.push(updated);
    }
  }

  // Sort oldest-heartbeat first (most severely stale)
  crashed.sort((a, b) => {
    const aHb = a.lastHeartbeat ?? '';
    const bHb = b.lastHeartbeat ?? '';
    return aHb < bHb ? -1 : aHb > bHb ? 1 : 0;
  });

  return crashed;
}
|
|
254
|
+
|
|
255
|
+
// ============================================================================
|
|
256
|
+
// Internal helpers
|
|
257
|
+
// ============================================================================
|
|
258
|
+
|
|
259
|
+
/**
 * Convert a raw agent row into an {@link AgentHealthStatus}.
 *
 * Heartbeat age is measured against the current time. Only agents in an
 * alive status can be `stale` or `healthy`; for any other status both flags
 * are `false`.
 *
 * @param agent - Agent row to evaluate
 * @param thresholdMs - Staleness threshold in milliseconds
 * @returns Health snapshot for `agent`
 */
function buildHealthStatus(agent: AgentInstanceRow, thresholdMs: number): AgentHealthStatus {
  const status = agent.status as AgentInstanceStatus;

  const beatAtMs = new Date(agent.lastHeartbeat).getTime();
  const heartbeatAgeMs = Date.now() - beatAtMs;

  const alive = ALIVE_STATUSES.includes(status);
  const stale = alive && heartbeatAgeMs > thresholdMs;
  const healthy = alive && !stale;

  return {
    agentId: agent.id,
    status,
    lastHeartbeat: agent.lastHeartbeat,
    heartbeatAgeMs,
    healthy,
    stale,
    thresholdMs,
  };
}
|
package/src/agents/index.ts
CHANGED
|
@@ -11,6 +11,17 @@
|
|
|
11
11
|
* @module agents
|
|
12
12
|
*/
|
|
13
13
|
|
|
14
|
+
// Load-balancing registry: task-count capacity, specializations, performance recording
|
|
15
|
+
export {
|
|
16
|
+
type AgentCapacity,
|
|
17
|
+
type AgentPerformanceMetrics,
|
|
18
|
+
getAgentCapacity,
|
|
19
|
+
getAgentSpecializations,
|
|
20
|
+
getAgentsByCapacity,
|
|
21
|
+
MAX_TASKS_PER_AGENT,
|
|
22
|
+
recordAgentPerformance,
|
|
23
|
+
updateAgentSpecializations,
|
|
24
|
+
} from './agent-registry.js';
|
|
14
25
|
// Schema & types
|
|
15
26
|
export {
|
|
16
27
|
AGENT_INSTANCE_STATUSES,
|
|
@@ -34,10 +45,35 @@ export {
|
|
|
34
45
|
isOverloaded,
|
|
35
46
|
updateCapacity,
|
|
36
47
|
} from './capacity.js';
|
|
48
|
+
// Execution learning, failure pattern tracking, and self-healing
|
|
49
|
+
export {
|
|
50
|
+
type AgentExecutionEvent,
|
|
51
|
+
type AgentExecutionOutcome,
|
|
52
|
+
type AgentPerformanceSummary,
|
|
53
|
+
getAgentPerformanceHistory,
|
|
54
|
+
getSelfHealingSuggestions,
|
|
55
|
+
type HealingSuggestion,
|
|
56
|
+
processAgentLifecycleEvent,
|
|
57
|
+
recordAgentExecution,
|
|
58
|
+
recordFailurePattern,
|
|
59
|
+
storeHealingStrategy,
|
|
60
|
+
} from './execution-learning.js';
|
|
61
|
+
// Health monitoring (T039)
|
|
62
|
+
export {
|
|
63
|
+
type AgentHealthStatus,
|
|
64
|
+
checkAgentHealth,
|
|
65
|
+
detectCrashedAgents,
|
|
66
|
+
detectStaleAgents,
|
|
67
|
+
HEARTBEAT_INTERVAL_MS,
|
|
68
|
+
recordHeartbeat,
|
|
69
|
+
STALE_THRESHOLD_MS,
|
|
70
|
+
} from './health-monitor.js';
|
|
37
71
|
// Registry (CRUD, heartbeat, health, errors)
|
|
72
|
+
// Note: registry.checkAgentHealth (thresholdMs, cwd) -> AgentInstanceRow[] is exported
|
|
73
|
+
// as findStaleAgentRows to avoid conflict with health-monitor.checkAgentHealth (T039).
|
|
38
74
|
export {
|
|
39
75
|
type AgentHealthReport,
|
|
40
|
-
checkAgentHealth,
|
|
76
|
+
checkAgentHealth as findStaleAgentRows,
|
|
41
77
|
classifyError,
|
|
42
78
|
deregisterAgent,
|
|
43
79
|
generateAgentId,
|
package/src/agents/retry.ts
CHANGED
|
@@ -47,6 +47,17 @@ export const DEFAULT_RETRY_POLICY: Readonly<RetryPolicy> = Object.freeze({
|
|
|
47
47
|
|
|
48
48
|
/**
|
|
49
49
|
* Create a retry policy by merging overrides with the default policy.
|
|
50
|
+
*
|
|
51
|
+
* @remarks
|
|
52
|
+
* Unspecified fields fall back to {@link DEFAULT_RETRY_POLICY}.
|
|
53
|
+
*
|
|
54
|
+
* @param overrides - Partial policy to merge with defaults
|
|
55
|
+
* @returns A complete RetryPolicy
|
|
56
|
+
*
|
|
57
|
+
* @example
|
|
58
|
+
* ```ts
|
|
59
|
+
* const policy = createRetryPolicy({ maxRetries: 5 });
|
|
60
|
+
* ```
|
|
50
61
|
*/
|
|
51
62
|
export function createRetryPolicy(overrides?: Partial<RetryPolicy>): RetryPolicy {
|
|
52
63
|
return { ...DEFAULT_RETRY_POLICY, ...overrides };
|
|
@@ -55,8 +66,19 @@ export function createRetryPolicy(overrides?: Partial<RetryPolicy>): RetryPolicy
|
|
|
55
66
|
/**
|
|
56
67
|
* Calculate the delay for a given retry attempt using exponential backoff.
|
|
57
68
|
*
|
|
58
|
-
*
|
|
69
|
+
* @remarks
|
|
70
|
+
* Formula: `min(baseDelay * multiplier^attempt, maxDelay) + jitter`.
|
|
59
71
|
* Jitter adds 0-25% randomness to prevent thundering herd.
|
|
72
|
+
*
|
|
73
|
+
* @param attempt - Zero-based attempt index
|
|
74
|
+
* @param policy - Retry policy with delay configuration
|
|
75
|
+
* @returns Delay in milliseconds before the next attempt
|
|
76
|
+
*
|
|
77
|
+
* @example
|
|
78
|
+
* ```ts
|
|
79
|
+
* const delay = calculateDelay(1, createRetryPolicy());
|
|
80
|
+
* // => ~2000ms (with jitter)
|
|
81
|
+
* ```
|
|
60
82
|
*/
|
|
61
83
|
export function calculateDelay(attempt: number, policy: RetryPolicy): number {
|
|
62
84
|
const exponentialDelay = policy.baseDelayMs * policy.backoffMultiplier ** attempt;
|
|
@@ -73,6 +95,20 @@ export function calculateDelay(attempt: number, policy: RetryPolicy): number {
|
|
|
73
95
|
/**
|
|
74
96
|
* Determine whether an error should be retried based on its classification
|
|
75
97
|
* and the retry policy.
|
|
98
|
+
*
|
|
99
|
+
* @remarks
|
|
100
|
+
* Permanent errors are never retried. Retriable errors are always retried
|
|
101
|
+
* (within attempt limits). Unknown errors defer to `policy.retryOnUnknown`.
|
|
102
|
+
*
|
|
103
|
+
* @param error - The caught error to classify
|
|
104
|
+
* @param attempt - Current attempt number (0-based)
|
|
105
|
+
* @param policy - Retry policy with limits and classification rules
|
|
106
|
+
* @returns True if the error should be retried
|
|
107
|
+
*
|
|
108
|
+
* @example
|
|
109
|
+
* ```ts
|
|
110
|
+
* if (shouldRetry(err, attempt, policy)) { /* retry *\/ }
|
|
111
|
+
* ```
|
|
76
112
|
*/
|
|
77
113
|
export function shouldRetry(error: unknown, attempt: number, policy: RetryPolicy): boolean {
|
|
78
114
|
if (attempt >= policy.maxRetries) return false;
|
|
@@ -102,13 +138,20 @@ export interface RetryResult<T> {
|
|
|
102
138
|
/**
|
|
103
139
|
* Wrap an async function with retry logic using configurable exponential backoff.
|
|
104
140
|
*
|
|
105
|
-
*
|
|
106
|
-
*
|
|
107
|
-
*
|
|
141
|
+
* @remarks
|
|
142
|
+
* Agent-specific variant that integrates with error classification from the
|
|
143
|
+
* agent registry. For a dependency-free generic retry, use `lib/retry.ts`.
|
|
108
144
|
*
|
|
145
|
+
* @typeParam T - The resolved type of the async function
|
|
109
146
|
* @param fn - The async function to execute with retries
|
|
110
147
|
* @param policy - Retry policy (uses DEFAULT_RETRY_POLICY if not provided)
|
|
111
148
|
* @returns The result of the operation with retry metadata
|
|
149
|
+
*
|
|
150
|
+
* @example
|
|
151
|
+
* ```ts
|
|
152
|
+
* const result = await withRetry(() => fetchAgentTask(agentId));
|
|
153
|
+
* if (!result.success) console.error(result.error);
|
|
154
|
+
* ```
|
|
112
155
|
*/
|
|
113
156
|
export async function withRetry<T>(
|
|
114
157
|
fn: () => Promise<T>,
|
|
@@ -176,9 +219,19 @@ export interface AgentRecoveryResult {
|
|
|
176
219
|
* classified as 'permanent' are abandoned. Agents with retriable errors
|
|
177
220
|
* are reset to 'starting' for the orchestration layer to re-assign.
|
|
178
221
|
*
|
|
222
|
+
* @remarks
|
|
223
|
+
* Two-phase process: first detects stale agents via heartbeat threshold,
|
|
224
|
+
* then evaluates each crashed agent's error history for recoverability.
|
|
225
|
+
*
|
|
179
226
|
* @param thresholdMs - Heartbeat threshold for crash detection (default: 30000)
|
|
180
227
|
* @param cwd - Working directory
|
|
181
228
|
* @returns Recovery results for each crashed agent
|
|
229
|
+
*
|
|
230
|
+
* @example
|
|
231
|
+
* ```ts
|
|
232
|
+
* const results = await recoverCrashedAgents(60_000);
|
|
233
|
+
* results.filter(r => r.recovered).forEach(r => console.log(r.agentId));
|
|
234
|
+
* ```
|
|
182
235
|
*/
|
|
183
236
|
export async function recoverCrashedAgents(
|
|
184
237
|
thresholdMs: number = 30_000,
|