@adaptic/maestro 1.9.0 → 1.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -43,7 +43,7 @@
|
|
|
43
43
|
* logger optional fn({ ts, level, …rest }) → void for tests.
|
|
44
44
|
*/
|
|
45
45
|
|
|
46
|
-
import { existsSync, readFileSync, writeFileSync } from "node:fs";
|
|
46
|
+
import { existsSync, readFileSync, writeFileSync, mkdirSync, appendFileSync, openSync, closeSync, statSync, unlinkSync } from "node:fs";
|
|
47
47
|
import { join } from "node:path";
|
|
48
48
|
import { spawn } from "node:child_process";
|
|
49
49
|
import { homedir } from "node:os";
|
|
@@ -75,6 +75,17 @@ const DEFAULT_SPAWN_TIMEOUT_MS = 30 * 60_000;
|
|
|
75
75
|
// preferable to thrashing Claude / hitting usage limits.
|
|
76
76
|
const MAX_CONCURRENT_SUB_SESSIONS = 1;
|
|
77
77
|
|
|
78
|
+
// Retry policy. Most cadence failures are systemic (broken prompt, bad
|
|
79
|
+
// auth, transient API errors) — 5 retries don't help, they just amplify
|
|
80
|
+
// the burn. 2 attempts (one retry) with escalating back-off is the right balance.
|
|
81
|
+
const DEFAULT_MAX_ATTEMPTS = 2;
|
|
82
|
+
const BACKOFF_SCHEDULE_MS = [0, 30_000, 120_000]; // 1st retry +30s, 2nd retry +2m
|
|
83
|
+
|
|
84
|
+
// Circuit breaker — when 3 same-cadence failures land in a row, stop
|
|
85
|
+
// spawning that cadence for 30 minutes. Prevents launchd-rate runaway.
|
|
86
|
+
const CIRCUIT_OPEN_THRESHOLD = 3;
|
|
87
|
+
const CIRCUIT_OPEN_DURATION_MS = 30 * 60_000;
|
|
88
|
+
|
|
78
89
|
// ---------------------------------------------------------------------------
|
|
79
90
|
// Helpers
|
|
80
91
|
// ---------------------------------------------------------------------------
|
|
@@ -129,8 +140,13 @@ function resolveClaudeBin() {
|
|
|
129
140
|
|
|
130
141
|
/**
|
|
131
142
|
* Spawn a sub-session running the cadence's trigger prompt and resolve
|
|
132
|
-
* with { exit_code, durationMs }. Reads the prompt at call
|
|
133
|
-
* latest version (possibly upgraded between ticks) is always
|
|
143
|
+
* with { exit_code, durationMs, stderr_tail }. Reads the prompt at call
|
|
144
|
+
* time so the latest version (possibly upgraded between ticks) is always
|
|
145
|
+
* used.
|
|
146
|
+
*
|
|
147
|
+
* Robustness: stdout + stderr are tee'd to logs/cadence-bus/subsessions/
|
|
148
|
+
* so non-zero exits remain diagnosable after the fact. The last ~4 KB of
|
|
149
|
+
* stderr is also captured in-memory and surfaced on the failure event.
|
|
134
150
|
*/
|
|
135
151
|
function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {
|
|
136
152
|
return new Promise((resolveOut) => {
|
|
@@ -169,12 +185,27 @@ function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {
|
|
|
169
185
|
};
|
|
170
186
|
const started = Date.now();
|
|
171
187
|
|
|
172
|
-
log
|
|
188
|
+
// Per-run log file. Pattern is short enough to be tail-friendly.
|
|
189
|
+
const logsDir = join(agentRoot, "logs", "cadence-bus", "subsessions");
|
|
190
|
+
mkdirSync(logsDir, { recursive: true });
|
|
191
|
+
const date = new Date().toISOString().slice(0, 10);
|
|
192
|
+
const stamp = new Date().toISOString().replace(/[:.]/g, "-");
|
|
193
|
+
const stdoutPath = join(logsDir, `${date}-${cadence}-${stamp}.stdout.log`);
|
|
194
|
+
const stderrPath = join(logsDir, `${date}-${cadence}-${stamp}.stderr.log`);
|
|
195
|
+
const stdoutFd = openSync(stdoutPath, "a");
|
|
196
|
+
const stderrFd = openSync(stderrPath, "a");
|
|
197
|
+
|
|
198
|
+
log({ level: "info", stage: "subsession_spawn", cadence, bin, stdout: stdoutPath, stderr: stderrPath });
|
|
173
199
|
|
|
174
200
|
let child;
|
|
175
201
|
try {
|
|
176
|
-
|
|
202
|
+
// stdio:
|
|
203
|
+
// 0 ignore (claude --print reads prompt from argv, not stdin)
|
|
204
|
+
// 1 → file (capture stdout for later inspection)
|
|
205
|
+
// 2 → file (capture stderr — critical for diagnosing exit-1)
|
|
206
|
+
child = spawn(bin, args, { cwd: agentRoot, env, stdio: ["ignore", stdoutFd, stderrFd] });
|
|
177
207
|
} catch (err) {
|
|
208
|
+
try { closeSync(stdoutFd); closeSync(stderrFd); } catch { /* */ }
|
|
178
209
|
resolveOut({ ok: false, exit_code: -4, error: `spawn failed: ${err.message}` });
|
|
179
210
|
return;
|
|
180
211
|
}
|
|
@@ -188,8 +219,22 @@ function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {
|
|
|
188
219
|
|
|
189
220
|
child.on("exit", (code, signal) => {
|
|
190
221
|
clearTimeout(timer);
|
|
222
|
+
try { closeSync(stdoutFd); closeSync(stderrFd); } catch { /* */ }
|
|
191
223
|
const durationMs = Date.now() - started;
|
|
192
224
|
const exit_code = typeof code === "number" ? code : (signal ? -1 : -5);
|
|
225
|
+
|
|
226
|
+
// Pull tail of stderr (and stdout if stderr empty) for the failure
|
|
227
|
+
// surface. Best-effort; note that readFileSync loads the whole file before slicing.
|
|
228
|
+
let stderrTail = "";
|
|
229
|
+
try {
|
|
230
|
+
const body = readFileSync(stderrPath, "utf-8");
|
|
231
|
+
stderrTail = body.slice(-4096);
|
|
232
|
+
if (!stderrTail.trim()) {
|
|
233
|
+
const so = readFileSync(stdoutPath, "utf-8");
|
|
234
|
+
stderrTail = so.slice(-4096);
|
|
235
|
+
}
|
|
236
|
+
} catch { /* file may not exist if spawn ENOENT before fd-redirect */ }
|
|
237
|
+
|
|
193
238
|
// Record cost-ledger row. Token counts are 0 until we parse the
|
|
194
239
|
// session's JSON output; for now exit-code + duration are enough
|
|
195
240
|
// to spot pathological retry loops.
|
|
@@ -208,16 +253,29 @@ function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {
|
|
|
208
253
|
], { stdio: "ignore", env: { ...env, AGENT_ROOT: agentRoot } }).unref();
|
|
209
254
|
}
|
|
210
255
|
} catch { /* cost tracking is best-effort */ }
|
|
256
|
+
|
|
257
|
+
// Clean up empty log files so the directory doesn't accumulate
|
|
258
|
+
// hundreds of zero-byte successes.
|
|
259
|
+
try {
|
|
260
|
+
|
|
261
|
+
if (statSync(stdoutPath).size === 0) unlinkSync(stdoutPath);
|
|
262
|
+
if (statSync(stderrPath).size === 0) unlinkSync(stderrPath);
|
|
263
|
+
} catch { /* */ }
|
|
264
|
+
|
|
211
265
|
resolveOut({
|
|
212
266
|
ok: exit_code === 0,
|
|
213
267
|
exit_code,
|
|
214
268
|
signal: signal || null,
|
|
215
269
|
duration_ms: durationMs,
|
|
270
|
+
stderr_tail: stderrTail || null,
|
|
271
|
+
stdout_path: stdoutPath,
|
|
272
|
+
stderr_path: stderrPath,
|
|
216
273
|
});
|
|
217
274
|
});
|
|
218
275
|
|
|
219
276
|
child.on("error", (err) => {
|
|
220
277
|
clearTimeout(timer);
|
|
278
|
+
try { closeSync(stdoutFd); closeSync(stderrFd); } catch { /* */ }
|
|
221
279
|
const durationMs = Date.now() - started;
|
|
222
280
|
resolveOut({ ok: false, exit_code: -6, error: err.message, duration_ms: durationMs });
|
|
223
281
|
});
|
|
@@ -242,6 +300,11 @@ export function startConsumer(opts = {}) {
|
|
|
242
300
|
const maxSpawnMs = opts.maxSpawnMs ?? DEFAULT_SPAWN_TIMEOUT_MS;
|
|
243
301
|
const spawnSession = opts.spawnSession || realSpawnSession;
|
|
244
302
|
const userLogger = opts.logger;
|
|
303
|
+
// Test / tuning hooks for the reliability layer.
|
|
304
|
+
const backoffSchedule = opts.backoffSchedule || BACKOFF_SCHEDULE_MS;
|
|
305
|
+
const circuitThreshold = opts.circuitThreshold ?? CIRCUIT_OPEN_THRESHOLD;
|
|
306
|
+
const circuitDurationMs = opts.circuitDurationMs ?? CIRCUIT_OPEN_DURATION_MS;
|
|
307
|
+
const maxAttempts = opts.maxAttempts ?? DEFAULT_MAX_ATTEMPTS;
|
|
245
308
|
|
|
246
309
|
const stats = {
|
|
247
310
|
started_at: new Date().toISOString(),
|
|
@@ -249,6 +312,8 @@ export function startConsumer(opts = {}) {
|
|
|
249
312
|
inline: 0,
|
|
250
313
|
escalated: 0,
|
|
251
314
|
skipped_emergency_stop: 0,
|
|
315
|
+
skipped_circuit_open: 0,
|
|
316
|
+
skipped_backoff: 0,
|
|
252
317
|
dlq: 0,
|
|
253
318
|
retries: 0,
|
|
254
319
|
spawn_failures: 0,
|
|
@@ -261,6 +326,75 @@ export function startConsumer(opts = {}) {
|
|
|
261
326
|
let timers = [];
|
|
262
327
|
let activeSubSessions = 0;
|
|
263
328
|
|
|
329
|
+
// Per-cadence reliability state. Tracks consecutive failure count and
|
|
330
|
+
// the earliest moment we'll allow another spawn for that cadence.
|
|
331
|
+
// Persists nothing — circuit state is in-memory only. On daemon restart
|
|
332
|
+
// we get a fresh slate; that's intentional (operators expect a restart
|
|
333
|
+
// to mean "try again now").
|
|
334
|
+
const cadenceState = new Map(); // cadence → { failures, openUntil, nextAllowedAt }
|
|
335
|
+
|
|
336
|
+
/**
 * Return the reliability record for `cadence`, creating a zeroed record on
 * first use. The returned object is the live entry stored in `cadenceState`,
 * so callers mutate it in place.
 *
 * @param cadence - key identifying the cadence in `cadenceState`.
 * @returns {{ failures: number, openUntil: number, nextAllowedAt: number }}
 */
function getCadenceState(cadence) {
  const existing = cadenceState.get(cadence);
  if (existing) return existing;

  const fresh = { failures: 0, openUntil: 0, nextAllowedAt: 0 };
  cadenceState.set(cadence, fresh);
  return fresh;
}
|
|
341
|
+
|
|
342
|
+
/**
 * Reset a cadence's reliability record after a successful sub-session:
 * the failure streak, the circuit deadline and the back-off deadline are
 * all zeroed.
 *
 * @param cadence - key identifying the cadence whose record is reset.
 */
function recordSubsessionSuccess(cadence) {
  Object.assign(getCadenceState(cadence), {
    failures: 0,
    openUntil: 0,
    nextAllowedAt: 0,
  });
}
|
|
348
|
+
|
|
349
|
+
/**
 * Register one more consecutive failure for `cadence`.
 *
 * Bumps the failure streak, schedules the next allowed spawn from
 * `backoffSchedule`, and opens the circuit (logging and writing the
 * snapshot file) once the streak reaches `circuitThreshold`.
 *
 * @param cadence - key identifying the cadence that just failed.
 */
function recordSubsessionFailure(cadence) {
  const state = getCadenceState(cadence);
  // One timestamp for the whole update so the back-off deadline, the
  // circuit deadline and the logged `open_until` all agree.
  const now = Date.now();
  state.failures += 1;

  // The streak count indexes the schedule and is clamped to its last slot.
  // The lower clamp plus the finite check keep an empty (or malformed)
  // override schedule from turning `nextAllowedAt` into NaN.
  const lastSlot = backoffSchedule.length - 1;
  const slot = Math.max(0, Math.min(state.failures, lastSlot));
  const delayMs = Number(backoffSchedule[slot]);
  state.nextAllowedAt = now + (Number.isFinite(delayMs) ? delayMs : 0);

  if (state.failures >= circuitThreshold) {
    state.openUntil = now + circuitDurationMs;
    log({
      level: "error",
      stage: "circuit_opened",
      cadence,
      failures: state.failures,
      open_until: new Date(state.openUntil).toISOString(),
    });
    writeCircuitFile();
  }
}
|
|
361
|
+
|
|
362
|
+
/**
 * Write the snapshot of currently-open circuits to
 * `<agentRoot>/state/cadence-bus/circuit-open.json`, or remove that file
 * when no circuit is open. Best-effort: filesystem errors are swallowed
 * so a snapshot failure never interrupts event processing.
 */
function writeCircuitFile() {
  // Single timestamp so the "is it open?" test and `generated` agree.
  const now = Date.now();

  // Object.fromEntries defines own properties, so an unusual cadence name
  // (e.g. "__proto__") still appears in the snapshot.
  const open = Object.fromEntries(
    [...cadenceState]
      .filter(([, s]) => s.openUntil > now)
      .map(([cad, s]) => [
        cad,
        { failures: s.failures, open_until: new Date(s.openUntil).toISOString() },
      ]),
  );

  const dir = join(agentRoot, "state", "cadence-bus");
  const path = join(dir, "circuit-open.json");
  try {
    if (Object.keys(open).length === 0) {
      // Remove the file when nothing is open; a missing file is fine.
      try { unlinkSync(path); } catch { /* */ }
      return;
    }
    // Ensure the directory exists; otherwise the write fails with ENOENT
    // and the snapshot would be silently dropped by the catch below.
    mkdirSync(dir, { recursive: true });
    const snapshot = { generated: new Date(now).toISOString(), open };
    writeFileSync(path, JSON.stringify(snapshot, null, 2) + "\n");
  } catch { /* best-effort */ }
}
|
|
382
|
+
|
|
383
|
+
/**
 * Decide whether a sub-session may be spawned for `cadence` right now.
 *
 * Checks, in order: an open circuit, then a pending back-off window. If
 * neither blocks and a previously-opened circuit has expired, the circuit
 * is closed (streak reset, event logged, snapshot file rewritten).
 *
 * @param cadence - key identifying the cadence to check.
 * @returns {{ allowed: true } | { allowed: false, reason: "circuit-open" | "backoff", retry_at: number }}
 *   `retry_at` is an epoch-millisecond timestamp.
 */
function isCadenceAllowed(cadence) {
  const state = getCadenceState(cadence);
  const now = Date.now();
  const { openUntil, nextAllowedAt } = state;

  if (openUntil > now) {
    return { allowed: false, reason: "circuit-open", retry_at: openUntil };
  }
  if (nextAllowedAt > now) {
    return { allowed: false, reason: "backoff", retry_at: nextAllowedAt };
  }

  // A non-zero deadline that is no longer in the future means the circuit
  // was open and has now expired: close it.
  const circuitExpired = openUntil && openUntil <= now;
  if (circuitExpired) {
    state.openUntil = 0;
    state.failures = 0;
    log({ level: "info", stage: "circuit_closed", cadence });
    writeCircuitFile();
  }
  return { allowed: true };
}
|
|
397
|
+
|
|
264
398
|
function log(entry) {
|
|
265
399
|
const enriched = { ts: new Date().toISOString(), ...entry };
|
|
266
400
|
logBusEvent(agentRoot, enriched);
|
|
@@ -280,6 +414,32 @@ export function startConsumer(opts = {}) {
|
|
|
280
414
|
}
|
|
281
415
|
|
|
282
416
|
async function escalate(event) {
|
|
417
|
+
// Circuit-breaker / back-off gate. If this cadence is currently held
|
|
418
|
+
// back, requeue without spawning. The event keeps its attempt count
|
|
419
|
+
// because the failure was upstream (not a per-event problem).
|
|
420
|
+
const gate = isCadenceAllowed(event.cadence);
|
|
421
|
+
if (!gate.allowed) {
|
|
422
|
+
log({
|
|
423
|
+
level: "warn",
|
|
424
|
+
stage: gate.reason === "circuit-open" ? "skipped_circuit_open" : "skipped_backoff",
|
|
425
|
+
id: event.id,
|
|
426
|
+
cadence: event.cadence,
|
|
427
|
+
retry_at: new Date(gate.retry_at).toISOString(),
|
|
428
|
+
});
|
|
429
|
+
if (gate.reason === "circuit-open") stats.skipped_circuit_open += 1;
|
|
430
|
+
else stats.skipped_backoff += 1;
|
|
431
|
+
// Put the event back in inbox WITHOUT bumping attempts so it doesn't
|
|
432
|
+
// burn its retry budget while the circuit is open.
|
|
433
|
+
const paths2 = getBusPaths(agentRoot);
|
|
434
|
+
try {
|
|
435
|
+
const event2 = { ...event, attempts: Math.max(0, (event.attempts || 1) - 1) };
|
|
436
|
+
writeFileSync(join(paths2.inbox, `${event.id}.json`), JSON.stringify(event2, null, 2) + "\n");
|
|
437
|
+
|
|
438
|
+
try { unlinkSync(join(paths2.claimed, `${event.id}.json`)); } catch { /* */ }
|
|
439
|
+
} catch { /* best-effort */ }
|
|
440
|
+
return { ok: false, decision: gate.reason };
|
|
441
|
+
}
|
|
442
|
+
|
|
283
443
|
if (activeSubSessions >= MAX_CONCURRENT_SUB_SESSIONS) {
|
|
284
444
|
// Re-queue and try again next tick. Single-owner cadence consumer
|
|
285
445
|
// means this can only happen when a prior tick is still running —
|
|
@@ -291,7 +451,15 @@ export function startConsumer(opts = {}) {
|
|
|
291
451
|
cadence: event.cadence,
|
|
292
452
|
active_subsessions: activeSubSessions,
|
|
293
453
|
});
|
|
294
|
-
|
|
454
|
+
// Re-queue without burning the retry budget — concurrent-spawn isn't
|
|
455
|
+
// a per-event failure.
|
|
456
|
+
const paths2 = getBusPaths(agentRoot);
|
|
457
|
+
try {
|
|
458
|
+
const event2 = { ...event, attempts: Math.max(0, (event.attempts || 1) - 1) };
|
|
459
|
+
writeFileSync(join(paths2.inbox, `${event.id}.json`), JSON.stringify(event2, null, 2) + "\n");
|
|
460
|
+
|
|
461
|
+
try { unlinkSync(join(paths2.claimed, `${event.id}.json`)); } catch { /* */ }
|
|
462
|
+
} catch { /* best-effort */ }
|
|
295
463
|
stats.retries += 1;
|
|
296
464
|
return { ok: false, decision: "deferred" };
|
|
297
465
|
}
|
|
@@ -334,14 +502,31 @@ export function startConsumer(opts = {}) {
|
|
|
334
502
|
prompt: promptPath,
|
|
335
503
|
exit_code: result.exit_code,
|
|
336
504
|
duration_ms: result.duration_ms,
|
|
505
|
+
stdout_path: result.stdout_path || null,
|
|
506
|
+
stderr_path: result.stderr_path || null,
|
|
337
507
|
});
|
|
508
|
+
recordSubsessionSuccess(event.cadence);
|
|
338
509
|
stats.escalated += 1;
|
|
339
510
|
stats.last_decision = "escalated";
|
|
340
511
|
return { ok: true, decision: "escalated", exit_code: result.exit_code };
|
|
341
512
|
}
|
|
342
|
-
|
|
513
|
+
// Failure path: log + cap retries low. The exact stderr tail comes
|
|
514
|
+
// from the spawn helper so we never DLQ "blind" again.
|
|
515
|
+
const stderrTail = (result.stderr_tail || "").trim().split("\n").slice(-3).join(" | ");
|
|
516
|
+
log({
|
|
517
|
+
level: "error",
|
|
518
|
+
stage: "subsession_failed",
|
|
519
|
+
id: event.id,
|
|
520
|
+
cadence: event.cadence,
|
|
521
|
+
exit_code: result.exit_code,
|
|
522
|
+
duration_ms: result.duration_ms,
|
|
523
|
+
error: result.error || stderrTail || `exit ${result.exit_code}`,
|
|
524
|
+
stderr_path: result.stderr_path || null,
|
|
525
|
+
});
|
|
343
526
|
stats.spawn_failures += 1;
|
|
344
|
-
|
|
527
|
+
recordSubsessionFailure(event.cadence);
|
|
528
|
+
const reason = result.error || (stderrTail ? `exit ${result.exit_code}: ${stderrTail}` : `exit ${result.exit_code}`);
|
|
529
|
+
const outcome = failTick(agentRoot, event.id, reason, { maxAttempts });
|
|
345
530
|
if (outcome?.destination === "dlq") stats.dlq += 1;
|
|
346
531
|
else stats.retries += 1;
|
|
347
532
|
return { ok: false, decision: outcome?.destination || "failed" };
|
|
@@ -428,19 +613,38 @@ export function startConsumer(opts = {}) {
|
|
|
428
613
|
recoverStaleClaims(agentRoot);
|
|
429
614
|
|
|
430
615
|
let processed = 0;
|
|
431
|
-
|
|
432
|
-
//
|
|
616
|
+
let escalatedThisTick = 0;
|
|
617
|
+
// Drain inline events as much as the consumer can in one tick; cap
|
|
618
|
+
// sub-session escalations at 1 per tick so a fast-failing cadence
|
|
619
|
+
// can't burn a whole minute's worth of retries inside a single poll.
|
|
620
|
+
// The next poll (DEFAULT_POLL_MS later) will pick up where we left off.
|
|
433
621
|
while (!stopping) {
|
|
434
622
|
const claim = claimNextTick(agentRoot);
|
|
435
623
|
if (!claim) break;
|
|
436
624
|
const event = claim.event;
|
|
437
625
|
activeTick = event.id;
|
|
626
|
+
let didEscalate = false;
|
|
438
627
|
try {
|
|
628
|
+
const def = getCadenceDef(event.cadence);
|
|
629
|
+
const willEscalate = !def || (def.mode !== "inline" && (def.mode !== "guarded" || true));
|
|
630
|
+
// Roughly: if it's not a registry-inline cadence, we MAY escalate.
|
|
631
|
+
// We don't yet know if the guard will say inline; processEvent
|
|
632
|
+
// will tell us via stats. Use the escalated stats delta as the
|
|
633
|
+
// signal that an actual sub-session ran this iteration.
|
|
634
|
+
const before = stats.escalated + stats.spawn_failures + stats.skipped_circuit_open + stats.skipped_backoff;
|
|
439
635
|
await processEvent(event);
|
|
636
|
+
const after = stats.escalated + stats.spawn_failures + stats.skipped_circuit_open + stats.skipped_backoff;
|
|
637
|
+
if (after > before) didEscalate = true;
|
|
638
|
+
// Silence unused var warning.
|
|
639
|
+
void willEscalate;
|
|
440
640
|
} finally {
|
|
441
641
|
activeTick = null;
|
|
442
642
|
}
|
|
443
643
|
processed += 1;
|
|
644
|
+
if (didEscalate) escalatedThisTick += 1;
|
|
645
|
+
// Hard cap: at most ONE escalation attempt (spawn or gated skip) per tick. Inline ticks
|
|
646
|
+
// keep draining freely (they're cheap).
|
|
647
|
+
if (escalatedThisTick >= 1) break;
|
|
444
648
|
if (processed >= 16) break; // soft batch cap
|
|
445
649
|
}
|
|
446
650
|
return { processed };
|
|
@@ -210,9 +210,16 @@ test("unknown cadence with no prompt file DLQ's immediately", async () => {
|
|
|
210
210
|
test("spawn failure retries within the budget, then DLQs", async () => {
|
|
211
211
|
const root = await makeAgentRoot();
|
|
212
212
|
plantPrompt(root, "weekly-strategic-memo");
|
|
213
|
+
// Disable back-off + raise circuit threshold so the test exercises the
|
|
214
|
+
// retry-then-DLQ path without waiting for back-off windows. The
|
|
215
|
+
// real defaults (30s/2m back-off, 3-failure circuit) are exercised by
|
|
216
|
+
// dedicated tests below.
|
|
213
217
|
const consumer = startConsumer({
|
|
214
218
|
agentRoot: root,
|
|
215
219
|
pollMs: 25,
|
|
220
|
+
backoffSchedule: [0, 0, 0],
|
|
221
|
+
circuitThreshold: 999,
|
|
222
|
+
maxAttempts: 2,
|
|
216
223
|
spawnSession: async () => ({ ok: false, exit_code: 1, error: "always-fail", duration_ms: 1 }),
|
|
217
224
|
});
|
|
218
225
|
try {
|
|
@@ -226,6 +233,68 @@ test("spawn failure retries within the budget, then DLQs", async () => {
|
|
|
226
233
|
}
|
|
227
234
|
});
|
|
228
235
|
|
|
236
|
+
// Verifies that once `circuitThreshold` consecutive failures are recorded,
// remaining events for the cadence are skipped instead of spawned.
test("circuit breaker opens after consecutive failures and blocks further spawns", async () => {
  const root = await makeAgentRoot();
  plantPrompt(root, "weekly-strategic-memo");

  let spawns = 0;
  const alwaysFail = async () => {
    spawns += 1;
    return { ok: false, exit_code: 1, error: "fail", duration_ms: 1 };
  };
  const consumer = startConsumer({
    agentRoot: root,
    pollMs: 20,
    backoffSchedule: [0, 0, 0],
    circuitThreshold: 2,
    circuitDurationMs: 60_000, // 1 min — outlasts the assertion window below
    maxAttempts: 1, // every event DLQs on its first failure, keeping retry counts out of the picture
    spawnSession: alwaysFail,
  });

  try {
    // Five events queued; the circuit should open after two failures and
    // hold back the remainder.
    let remaining = 5;
    while (remaining-- > 0) {
      enqueueTick({ cadence: "weekly-strategic-memo", source: "launchd", agentRoot: root });
    }

    const circuitIsOpen = () => consumer.getStats().skipped_circuit_open >= 1;
    const opened = await waitFor(circuitIsOpen, { timeoutMs: 10_000 });
    assert.ok(opened, `circuit should open; stats=${JSON.stringify(consumer.getStats())}`);

    // With the circuit open, the spawn counter must stay flat.
    const spawnsAtOpen = spawns;
    await new Promise((resolve) => setTimeout(resolve, 500));
    assert.equal(spawns, spawnsAtOpen, `spawns must stop once circuit opens (was ${spawnsAtOpen}, now ${spawns})`);
  } finally {
    await consumer.stop();
    await rmRoot(root);
  }
});
|
|
265
|
+
|
|
266
|
+
// Verifies that after a failed spawn the next event for the same cadence is
// held back until the configured back-off window has elapsed, and that
// spawning resumes afterwards.
test("back-off skips re-spawning until the cooldown elapses", async () => {
  const root = await makeAgentRoot();
  plantPrompt(root, "weekly-strategic-memo");
  let spawnCount = 0;
  const consumer = startConsumer({
    agentRoot: root,
    pollMs: 20,
    backoffSchedule: [0, 300, 300], // 300ms cooldown after each failure
    circuitThreshold: 999, // keep the circuit breaker out of this test
    maxAttempts: 1,
    spawnSession: async () => {
      spawnCount += 1;
      return { ok: false, exit_code: 1, error: "fail", duration_ms: 1 };
    },
  });
  try {
    // Enqueue 2 events back-to-back. The 1st triggers a spawn (fails). The
    // 2nd should be held back by the 300ms back-off window.
    enqueueTick({ cadence: "weekly-strategic-memo", source: "launchd", agentRoot: root });
    enqueueTick({ cadence: "weekly-strategic-memo", source: "launchd", agentRoot: root });

    // Assert the precondition explicitly so a timeout here is reported as
    // such rather than as a confusing skipped_backoff failure further down.
    const firstSpawned = await waitFor(() => spawnCount >= 1, { timeoutMs: 5_000 });
    assert.ok(firstSpawned, `first event should spawn; stats=${JSON.stringify(consumer.getStats())}`);
    const spawnsBeforeWait = spawnCount;

    // During the back-off window no new spawn should fire.
    await new Promise((r) => setTimeout(r, 150));
    assert.equal(spawnCount, spawnsBeforeWait, `spawns must wait for back-off (was ${spawnsBeforeWait}, now ${spawnCount})`);
    assert.ok(consumer.getStats().skipped_backoff >= 1, "skipped_backoff should be recorded");

    // After the window passes, the next event should be processed.
    await waitFor(() => spawnCount > spawnsBeforeWait, { timeoutMs: 5_000 });
    assert.ok(spawnCount > spawnsBeforeWait, "spawning resumes after back-off");
  } finally {
    await consumer.stop();
    await rmRoot(root);
  }
});
|
|
297
|
+
|
|
229
298
|
// ---------------------------------------------------------------------------
|
|
230
299
|
// Emergency stop
|
|
231
300
|
// ---------------------------------------------------------------------------
|