@adaptic/maestro 1.9.0 → 1.9.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +21 -3
- package/bin/maestro.mjs +37 -0
- package/package.json +1 -1
- package/scripts/daemon/cadence-consumer.mjs +228 -10
- package/scripts/daemon/cadence-consumer.test.mjs +69 -0
package/.env.example
CHANGED
|
@@ -16,11 +16,29 @@
|
|
|
16
16
|
# The agent's reasoning engines. At minimum you need Anthropic (Claude).
|
|
17
17
|
#
|
|
18
18
|
|
|
19
|
-
# REQUIRED — Primary reasoning engine
|
|
20
|
-
#
|
|
21
|
-
#
|
|
19
|
+
# REQUIRED — Primary reasoning engine. Two ways to authenticate:
|
|
20
|
+
#
|
|
21
|
+
# Option A — API key (pay-per-token)
|
|
22
|
+
# Set ANTHROPIC_API_KEY below to a valid sk-ant-api03-... key.
|
|
23
|
+
# Get one: https://console.anthropic.com/settings/keys
|
|
24
|
+
#
|
|
25
|
+
# Option B — Claude Code subscription (Pro/Max, OAuth via Keychain)
|
|
26
|
+
# LEAVE ANTHROPIC_API_KEY EMPTY *and* set MAESTRO_PREFER_SUBSCRIPTION_AUTH=1.
|
|
27
|
+
# This tells the cadence consumer to strip ANTHROPIC_API_KEY from every
|
|
28
|
+
# sub-session spawn so claude --print falls back to the keychain OAuth
|
|
29
|
+
# token. Most agents on a Mac mini with a Claude Code subscription
|
|
30
|
+
# should use this option — routine cadence ticks cost zero API credits.
|
|
31
|
+
#
|
|
32
|
+
# Doctor validates the key against api.anthropic.com on every run; an
|
|
33
|
+
# invalid key here will cascade 401s through every sub-session spawn.
|
|
22
34
|
ANTHROPIC_API_KEY=
|
|
23
35
|
|
|
36
|
+
# OPTIONAL — When set to 1, the cadence consumer strips ANTHROPIC_API_KEY
|
|
37
|
+
# from every claude --print sub-session env so claude falls back to
|
|
38
|
+
# Claude Code subscription auth (Keychain OAuth). Use this when the
|
|
39
|
+
# agent's Mac has a Claude Code Pro/Max subscription.
|
|
40
|
+
MAESTRO_PREFER_SUBSCRIPTION_AUTH=
|
|
41
|
+
|
|
24
42
|
# OPTIONAL — Supplemental model access (GPT-4, embeddings)
|
|
25
43
|
# Get your key: https://platform.openai.com/api-keys
|
|
26
44
|
# Subscription: OpenAI API plan (pay-per-token)
|
package/bin/maestro.mjs
CHANGED
|
@@ -1462,6 +1462,43 @@ function doctor() {
|
|
|
1462
1462
|
check("ANTHROPIC_API_KEY", true);
|
|
1463
1463
|
check("SLACK_USER_TOKEN", false);
|
|
1464
1464
|
check("GMAIL_APP_PASSWORD", false);
|
|
1465
|
+
|
|
1466
|
+
// Auth validity: if ANTHROPIC_API_KEY is set, ping the API to
|
|
1467
|
+
// verify it works. An invalid key in .env will silently be sent
|
|
1468
|
+
// to every `claude --print` sub-session and cause cascading 401s
|
|
1469
|
+
// (exactly the ravi-ai inbox-processor runaway). Better to catch
|
|
1470
|
+
// it here. Skips the check if the user opted out via
|
|
1471
|
+
// MAESTRO_PREFER_SUBSCRIPTION_AUTH=1 (subscription wins).
|
|
1472
|
+
const keyMatch = env.match(/^ANTHROPIC_API_KEY=(.+)$/m);
|
|
1473
|
+
const preferSubsMatch = env.match(/^MAESTRO_PREFER_SUBSCRIPTION_AUTH=(.+)$/m);
|
|
1474
|
+
const preferSubs = preferSubsMatch && /^1|true|yes$/i.test(preferSubsMatch[1].trim());
|
|
1475
|
+
if (keyMatch && !preferSubs) {
|
|
1476
|
+
const key = keyMatch[1].trim().replace(/^"|"$/g, "");
|
|
1477
|
+
try {
|
|
1478
|
+
const result = spawnSync("curl", [
|
|
1479
|
+
"-s", "-o", "/dev/null", "-w", "%{http_code}",
|
|
1480
|
+
"-X", "POST",
|
|
1481
|
+
"-H", `x-api-key: ${key}`,
|
|
1482
|
+
"-H", "anthropic-version: 2023-06-01",
|
|
1483
|
+
"-H", "content-type: application/json",
|
|
1484
|
+
"--max-time", "8",
|
|
1485
|
+
"https://api.anthropic.com/v1/messages",
|
|
1486
|
+
"-d", JSON.stringify({ model: "claude-haiku-4-5", max_tokens: 5, messages: [{ role: "user", content: "ping" }] }),
|
|
1487
|
+
], { encoding: "utf-8" });
|
|
1488
|
+
const code = (result.stdout || "").trim();
|
|
1489
|
+
if (code === "200") ok("ANTHROPIC_API_KEY validated against api.anthropic.com");
|
|
1490
|
+
else if (code === "401") {
|
|
1491
|
+
warn(`ANTHROPIC_API_KEY is INVALID (HTTP 401 from api.anthropic.com).`);
|
|
1492
|
+
warn(` This will cause every sub-session spawn to fail. Either:`);
|
|
1493
|
+
warn(` 1. Replace the key in .env with a valid one, OR`);
|
|
1494
|
+
warn(` 2. Set MAESTRO_PREFER_SUBSCRIPTION_AUTH=1 in .env to use Claude Code subscription auth.`);
|
|
1495
|
+
issues++;
|
|
1496
|
+
} else if (code) warn(`ANTHROPIC_API_KEY check returned HTTP ${code} (expected 200)`);
|
|
1497
|
+
else warn(`ANTHROPIC_API_KEY check skipped (no network / curl missing)`);
|
|
1498
|
+
} catch { warn("ANTHROPIC_API_KEY check failed (curl error)"); }
|
|
1499
|
+
} else if (preferSubs) {
|
|
1500
|
+
ok("MAESTRO_PREFER_SUBSCRIPTION_AUTH=1 — using Claude Code subscription (Keychain OAuth)");
|
|
1501
|
+
}
|
|
1465
1502
|
} else {
|
|
1466
1503
|
fail(".env file not found — copy from .env.example");
|
|
1467
1504
|
issues++;
|
package/package.json
CHANGED
|
@@ -43,7 +43,7 @@
|
|
|
43
43
|
* logger optional fn({ ts, level, …rest }) → void for tests.
|
|
44
44
|
*/
|
|
45
45
|
|
|
46
|
-
import { existsSync, readFileSync, writeFileSync } from "node:fs";
|
|
46
|
+
import { existsSync, readFileSync, writeFileSync, mkdirSync, appendFileSync, openSync, closeSync, statSync, unlinkSync } from "node:fs";
|
|
47
47
|
import { join } from "node:path";
|
|
48
48
|
import { spawn } from "node:child_process";
|
|
49
49
|
import { homedir } from "node:os";
|
|
@@ -75,6 +75,17 @@ const DEFAULT_SPAWN_TIMEOUT_MS = 30 * 60_000;
|
|
|
75
75
|
// preferable to thrashing Claude / hitting usage limits.
|
|
76
76
|
const MAX_CONCURRENT_SUB_SESSIONS = 1;
|
|
77
77
|
|
|
78
|
+
// Retry policy. Most cadence failures are systemic (broken prompt, bad
|
|
79
|
+
// auth, transient API errors) — 5 retries doesn't help, it just amplifies
|
|
80
|
+
// the burn. 2 attempts (i.e. one retry) with exponential back-off is the right balance.
|
|
81
|
+
const DEFAULT_MAX_ATTEMPTS = 2;
|
|
82
|
+
const BACKOFF_SCHEDULE_MS = [0, 30_000, 120_000]; // 1st retry +30s, 2nd retry +2m
|
|
83
|
+
|
|
84
|
+
// Circuit breaker — when 3 same-cadence failures land in a row, stop
|
|
85
|
+
// spawning that cadence for 30 minutes. Prevents launchd-rate runaway.
|
|
86
|
+
const CIRCUIT_OPEN_THRESHOLD = 3;
|
|
87
|
+
const CIRCUIT_OPEN_DURATION_MS = 30 * 60_000;
|
|
88
|
+
|
|
78
89
|
// ---------------------------------------------------------------------------
|
|
79
90
|
// Helpers
|
|
80
91
|
// ---------------------------------------------------------------------------
|
|
@@ -129,8 +140,13 @@ function resolveClaudeBin() {
|
|
|
129
140
|
|
|
130
141
|
/**
|
|
131
142
|
* Spawn a sub-session running the cadence's trigger prompt and resolve
|
|
132
|
-
* with { exit_code, durationMs }. Reads the prompt at call
|
|
133
|
-
* latest version (possibly upgraded between ticks) is always
|
|
143
|
+
* with { exit_code, durationMs, stderr_tail }. Reads the prompt at call
|
|
144
|
+
* time so the latest version (possibly upgraded between ticks) is always
|
|
145
|
+
* used.
|
|
146
|
+
*
|
|
147
|
+
* Robustness: stdout + stderr are tee'd to logs/cadence-bus/subsessions/
|
|
148
|
+
* so non-zero exits remain diagnosable after the fact. The last ~4 KB of
|
|
149
|
+
* stderr is also captured in-memory and surfaced on the failure event.
|
|
134
150
|
*/
|
|
135
151
|
function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {
|
|
136
152
|
return new Promise((resolveOut) => {
|
|
@@ -167,14 +183,43 @@ function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {
|
|
|
167
183
|
AGENT_DIR: agentRoot,
|
|
168
184
|
PATH: augmentedPath,
|
|
169
185
|
};
|
|
186
|
+
// Auth handling. Claude Code authenticates via macOS Keychain
|
|
187
|
+
// (OAuth from the user's Pro/Max subscription) when no API key is
|
|
188
|
+
// set, OR via the ANTHROPIC_API_KEY env var when one is present.
|
|
189
|
+
// If the env key is present BUT looks like a placeholder / empty
|
|
190
|
+
// string, we strip it so claude can fall back to Keychain OAuth.
|
|
191
|
+
// Set MAESTRO_PREFER_SUBSCRIPTION_AUTH=1 in .env to always strip
|
|
192
|
+
// the API key (force subscription auth) — useful when the agent
|
|
193
|
+
// owns a Claude Code Pro/Max subscription and shouldn't burn API
|
|
194
|
+
// credits for routine ticks.
|
|
195
|
+
const preferSubscription = process.env.MAESTRO_PREFER_SUBSCRIPTION_AUTH === "1";
|
|
196
|
+
const apiKey = env.ANTHROPIC_API_KEY || "";
|
|
197
|
+
if (preferSubscription || !apiKey.trim() || /^(your-api-key|placeholder|xxx+|sk-ant-xxx)/i.test(apiKey)) {
|
|
198
|
+
delete env.ANTHROPIC_API_KEY;
|
|
199
|
+
}
|
|
170
200
|
const started = Date.now();
|
|
171
201
|
|
|
172
|
-
log
|
|
202
|
+
// Per-run log file. Pattern is short enough to be tail-friendly.
|
|
203
|
+
const logsDir = join(agentRoot, "logs", "cadence-bus", "subsessions");
|
|
204
|
+
mkdirSync(logsDir, { recursive: true });
|
|
205
|
+
const date = new Date().toISOString().slice(0, 10);
|
|
206
|
+
const stamp = new Date().toISOString().replace(/[:.]/g, "-");
|
|
207
|
+
const stdoutPath = join(logsDir, `${date}-${cadence}-${stamp}.stdout.log`);
|
|
208
|
+
const stderrPath = join(logsDir, `${date}-${cadence}-${stamp}.stderr.log`);
|
|
209
|
+
const stdoutFd = openSync(stdoutPath, "a");
|
|
210
|
+
const stderrFd = openSync(stderrPath, "a");
|
|
211
|
+
|
|
212
|
+
log({ level: "info", stage: "subsession_spawn", cadence, bin, stdout: stdoutPath, stderr: stderrPath });
|
|
173
213
|
|
|
174
214
|
let child;
|
|
175
215
|
try {
|
|
176
|
-
|
|
216
|
+
// stdio:
|
|
217
|
+
// 0 → ignore (claude --print reads prompt from argv, not stdin)
|
|
218
|
+
// 1 → file (capture stdout for later inspection)
|
|
219
|
+
// 2 → file (capture stderr — critical for diagnosing exit-1)
|
|
220
|
+
child = spawn(bin, args, { cwd: agentRoot, env, stdio: ["ignore", stdoutFd, stderrFd] });
|
|
177
221
|
} catch (err) {
|
|
222
|
+
try { closeSync(stdoutFd); closeSync(stderrFd); } catch { /* */ }
|
|
178
223
|
resolveOut({ ok: false, exit_code: -4, error: `spawn failed: ${err.message}` });
|
|
179
224
|
return;
|
|
180
225
|
}
|
|
@@ -188,8 +233,22 @@ function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {
|
|
|
188
233
|
|
|
189
234
|
child.on("exit", (code, signal) => {
|
|
190
235
|
clearTimeout(timer);
|
|
236
|
+
try { closeSync(stdoutFd); closeSync(stderrFd); } catch { /* */ }
|
|
191
237
|
const durationMs = Date.now() - started;
|
|
192
238
|
const exit_code = typeof code === "number" ? code : (signal ? -1 : -5);
|
|
239
|
+
|
|
240
|
+
// Pull tail of stderr (and stdout if stderr empty) for the failure
|
|
241
|
+
// surface. Best-effort: the whole file is read, then sliced; read errors are ignored.
|
|
242
|
+
let stderrTail = "";
|
|
243
|
+
try {
|
|
244
|
+
const body = readFileSync(stderrPath, "utf-8");
|
|
245
|
+
stderrTail = body.slice(-4096);
|
|
246
|
+
if (!stderrTail.trim()) {
|
|
247
|
+
const so = readFileSync(stdoutPath, "utf-8");
|
|
248
|
+
stderrTail = so.slice(-4096);
|
|
249
|
+
}
|
|
250
|
+
} catch { /* file may not exist if spawn ENOENT before fd-redirect */ }
|
|
251
|
+
|
|
193
252
|
// Record cost-ledger row. Token counts are 0 until we parse the
|
|
194
253
|
// session's JSON output; for now exit-code + duration are enough
|
|
195
254
|
// to spot pathological retry loops.
|
|
@@ -208,16 +267,29 @@ function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {
|
|
|
208
267
|
], { stdio: "ignore", env: { ...env, AGENT_ROOT: agentRoot } }).unref();
|
|
209
268
|
}
|
|
210
269
|
} catch { /* cost tracking is best-effort */ }
|
|
270
|
+
|
|
271
|
+
// Clean up empty log files so the directory doesn't accumulate
|
|
272
|
+
// hundreds of zero-byte successes.
|
|
273
|
+
try {
|
|
274
|
+
|
|
275
|
+
if (statSync(stdoutPath).size === 0) unlinkSync(stdoutPath);
|
|
276
|
+
if (statSync(stderrPath).size === 0) unlinkSync(stderrPath);
|
|
277
|
+
} catch { /* */ }
|
|
278
|
+
|
|
211
279
|
resolveOut({
|
|
212
280
|
ok: exit_code === 0,
|
|
213
281
|
exit_code,
|
|
214
282
|
signal: signal || null,
|
|
215
283
|
duration_ms: durationMs,
|
|
284
|
+
stderr_tail: stderrTail || null,
|
|
285
|
+
stdout_path: stdoutPath,
|
|
286
|
+
stderr_path: stderrPath,
|
|
216
287
|
});
|
|
217
288
|
});
|
|
218
289
|
|
|
219
290
|
child.on("error", (err) => {
|
|
220
291
|
clearTimeout(timer);
|
|
292
|
+
try { closeSync(stdoutFd); closeSync(stderrFd); } catch { /* */ }
|
|
221
293
|
const durationMs = Date.now() - started;
|
|
222
294
|
resolveOut({ ok: false, exit_code: -6, error: err.message, duration_ms: durationMs });
|
|
223
295
|
});
|
|
@@ -242,6 +314,11 @@ export function startConsumer(opts = {}) {
|
|
|
242
314
|
const maxSpawnMs = opts.maxSpawnMs ?? DEFAULT_SPAWN_TIMEOUT_MS;
|
|
243
315
|
const spawnSession = opts.spawnSession || realSpawnSession;
|
|
244
316
|
const userLogger = opts.logger;
|
|
317
|
+
// Test / tuning hooks for the reliability layer.
|
|
318
|
+
const backoffSchedule = opts.backoffSchedule || BACKOFF_SCHEDULE_MS;
|
|
319
|
+
const circuitThreshold = opts.circuitThreshold ?? CIRCUIT_OPEN_THRESHOLD;
|
|
320
|
+
const circuitDurationMs = opts.circuitDurationMs ?? CIRCUIT_OPEN_DURATION_MS;
|
|
321
|
+
const maxAttempts = opts.maxAttempts ?? DEFAULT_MAX_ATTEMPTS;
|
|
245
322
|
|
|
246
323
|
const stats = {
|
|
247
324
|
started_at: new Date().toISOString(),
|
|
@@ -249,6 +326,8 @@ export function startConsumer(opts = {}) {
|
|
|
249
326
|
inline: 0,
|
|
250
327
|
escalated: 0,
|
|
251
328
|
skipped_emergency_stop: 0,
|
|
329
|
+
skipped_circuit_open: 0,
|
|
330
|
+
skipped_backoff: 0,
|
|
252
331
|
dlq: 0,
|
|
253
332
|
retries: 0,
|
|
254
333
|
spawn_failures: 0,
|
|
@@ -261,6 +340,75 @@ export function startConsumer(opts = {}) {
|
|
|
261
340
|
let timers = [];
|
|
262
341
|
let activeSubSessions = 0;
|
|
263
342
|
|
|
343
|
+
// Per-cadence reliability state. Tracks consecutive failure count and
|
|
344
|
+
// the earliest moment we'll allow another spawn for that cadence.
|
|
345
|
+
// Nothing is reloaded — circuit state is in-memory only (the circuit-open.json snapshot is for operators, not read back here). On daemon restart
|
|
346
|
+
// we get a fresh slate; that's intentional (operators expect a restart
|
|
347
|
+
// to mean "try again now").
|
|
348
|
+
const cadenceState = new Map(); // cadence → { failures, openUntil, nextAllowedAt }
|
|
349
|
+
|
|
350
|
+
function getCadenceState(cadence) {
|
|
351
|
+
let s = cadenceState.get(cadence);
|
|
352
|
+
if (!s) { s = { failures: 0, openUntil: 0, nextAllowedAt: 0 }; cadenceState.set(cadence, s); }
|
|
353
|
+
return s;
|
|
354
|
+
}
|
|
355
|
+
|
|
356
|
+
function recordSubsessionSuccess(cadence) {
|
|
357
|
+
const s = getCadenceState(cadence);
|
|
358
|
+
s.failures = 0;
|
|
359
|
+
s.openUntil = 0;
|
|
360
|
+
s.nextAllowedAt = 0;
|
|
361
|
+
}
|
|
362
|
+
|
|
363
|
+
function recordSubsessionFailure(cadence) {
|
|
364
|
+
const s = getCadenceState(cadence);
|
|
365
|
+
s.failures += 1;
|
|
366
|
+
// Exponential back-off honouring the (test-overridable) schedule.
|
|
367
|
+
const idx = Math.min(s.failures, backoffSchedule.length - 1);
|
|
368
|
+
s.nextAllowedAt = Date.now() + backoffSchedule[idx];
|
|
369
|
+
if (s.failures >= circuitThreshold) {
|
|
370
|
+
s.openUntil = Date.now() + circuitDurationMs;
|
|
371
|
+
log({ level: "error", stage: "circuit_opened", cadence, failures: s.failures, open_until: new Date(s.openUntil).toISOString() });
|
|
372
|
+
writeCircuitFile();
|
|
373
|
+
}
|
|
374
|
+
}
|
|
375
|
+
|
|
376
|
+
function writeCircuitFile() {
|
|
377
|
+
// Persist the open-circuit snapshot so doctor + the operator can see
|
|
378
|
+
// which cadences are currently held back without scraping logs.
|
|
379
|
+
const open = {};
|
|
380
|
+
for (const [cad, s] of cadenceState.entries()) {
|
|
381
|
+
if (s.openUntil > Date.now()) {
|
|
382
|
+
open[cad] = { failures: s.failures, open_until: new Date(s.openUntil).toISOString() };
|
|
383
|
+
}
|
|
384
|
+
}
|
|
385
|
+
const path = join(agentRoot, "state/cadence-bus/circuit-open.json");
|
|
386
|
+
try {
|
|
387
|
+
if (Object.keys(open).length === 0) {
|
|
388
|
+
// Remove the file when nothing is open.
|
|
389
|
+
|
|
390
|
+
try { unlinkSync(path); } catch { /* */ }
|
|
391
|
+
} else {
|
|
392
|
+
writeFileSync(path, JSON.stringify({ generated: new Date().toISOString(), open }, null, 2) + "\n");
|
|
393
|
+
}
|
|
394
|
+
} catch { /* best-effort */ }
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
function isCadenceAllowed(cadence) {
|
|
398
|
+
const s = getCadenceState(cadence);
|
|
399
|
+
const now = Date.now();
|
|
400
|
+
if (s.openUntil > now) return { allowed: false, reason: "circuit-open", retry_at: s.openUntil };
|
|
401
|
+
if (s.nextAllowedAt > now) return { allowed: false, reason: "backoff", retry_at: s.nextAllowedAt };
|
|
402
|
+
// Circuit closes automatically when openUntil passes.
|
|
403
|
+
if (s.openUntil && s.openUntil <= now) {
|
|
404
|
+
s.openUntil = 0;
|
|
405
|
+
s.failures = 0;
|
|
406
|
+
log({ level: "info", stage: "circuit_closed", cadence });
|
|
407
|
+
writeCircuitFile();
|
|
408
|
+
}
|
|
409
|
+
return { allowed: true };
|
|
410
|
+
}
|
|
411
|
+
|
|
264
412
|
function log(entry) {
|
|
265
413
|
const enriched = { ts: new Date().toISOString(), ...entry };
|
|
266
414
|
logBusEvent(agentRoot, enriched);
|
|
@@ -280,6 +428,32 @@ export function startConsumer(opts = {}) {
|
|
|
280
428
|
}
|
|
281
429
|
|
|
282
430
|
async function escalate(event) {
|
|
431
|
+
// Circuit-breaker / back-off gate. If this cadence is currently held
|
|
432
|
+
// back, requeue without spawning. The event keeps its attempt count
|
|
433
|
+
// because the failure was upstream (not a per-event problem).
|
|
434
|
+
const gate = isCadenceAllowed(event.cadence);
|
|
435
|
+
if (!gate.allowed) {
|
|
436
|
+
log({
|
|
437
|
+
level: "warn",
|
|
438
|
+
stage: gate.reason === "circuit-open" ? "skipped_circuit_open" : "skipped_backoff",
|
|
439
|
+
id: event.id,
|
|
440
|
+
cadence: event.cadence,
|
|
441
|
+
retry_at: new Date(gate.retry_at).toISOString(),
|
|
442
|
+
});
|
|
443
|
+
if (gate.reason === "circuit-open") stats.skipped_circuit_open += 1;
|
|
444
|
+
else stats.skipped_backoff += 1;
|
|
445
|
+
// Put the event back in inbox WITHOUT bumping attempts so it doesn't
|
|
446
|
+
// burn its retry budget while the circuit is open.
|
|
447
|
+
const paths2 = getBusPaths(agentRoot);
|
|
448
|
+
try {
|
|
449
|
+
const event2 = { ...event, attempts: Math.max(0, (event.attempts || 1) - 1) };
|
|
450
|
+
writeFileSync(join(paths2.inbox, `${event.id}.json`), JSON.stringify(event2, null, 2) + "\n");
|
|
451
|
+
|
|
452
|
+
try { unlinkSync(join(paths2.claimed, `${event.id}.json`)); } catch { /* */ }
|
|
453
|
+
} catch { /* best-effort */ }
|
|
454
|
+
return { ok: false, decision: gate.reason };
|
|
455
|
+
}
|
|
456
|
+
|
|
283
457
|
if (activeSubSessions >= MAX_CONCURRENT_SUB_SESSIONS) {
|
|
284
458
|
// Re-queue and try again next tick. Single-owner cadence consumer
|
|
285
459
|
// means this can only happen when a prior tick is still running —
|
|
@@ -291,7 +465,15 @@ export function startConsumer(opts = {}) {
|
|
|
291
465
|
cadence: event.cadence,
|
|
292
466
|
active_subsessions: activeSubSessions,
|
|
293
467
|
});
|
|
294
|
-
|
|
468
|
+
// Re-queue without burning the retry budget — concurrent-spawn isn't
|
|
469
|
+
// a per-event failure.
|
|
470
|
+
const paths2 = getBusPaths(agentRoot);
|
|
471
|
+
try {
|
|
472
|
+
const event2 = { ...event, attempts: Math.max(0, (event.attempts || 1) - 1) };
|
|
473
|
+
writeFileSync(join(paths2.inbox, `${event.id}.json`), JSON.stringify(event2, null, 2) + "\n");
|
|
474
|
+
|
|
475
|
+
try { unlinkSync(join(paths2.claimed, `${event.id}.json`)); } catch { /* */ }
|
|
476
|
+
} catch { /* best-effort */ }
|
|
295
477
|
stats.retries += 1;
|
|
296
478
|
return { ok: false, decision: "deferred" };
|
|
297
479
|
}
|
|
@@ -334,14 +516,31 @@ export function startConsumer(opts = {}) {
|
|
|
334
516
|
prompt: promptPath,
|
|
335
517
|
exit_code: result.exit_code,
|
|
336
518
|
duration_ms: result.duration_ms,
|
|
519
|
+
stdout_path: result.stdout_path || null,
|
|
520
|
+
stderr_path: result.stderr_path || null,
|
|
337
521
|
});
|
|
522
|
+
recordSubsessionSuccess(event.cadence);
|
|
338
523
|
stats.escalated += 1;
|
|
339
524
|
stats.last_decision = "escalated";
|
|
340
525
|
return { ok: true, decision: "escalated", exit_code: result.exit_code };
|
|
341
526
|
}
|
|
342
|
-
|
|
527
|
+
// Failure path: log + cap retries low. The exact stderr tail comes
|
|
528
|
+
// from the spawn helper so we never DLQ "blind" again.
|
|
529
|
+
const stderrTail = (result.stderr_tail || "").trim().split("\n").slice(-3).join(" | ");
|
|
530
|
+
log({
|
|
531
|
+
level: "error",
|
|
532
|
+
stage: "subsession_failed",
|
|
533
|
+
id: event.id,
|
|
534
|
+
cadence: event.cadence,
|
|
535
|
+
exit_code: result.exit_code,
|
|
536
|
+
duration_ms: result.duration_ms,
|
|
537
|
+
error: result.error || stderrTail || `exit ${result.exit_code}`,
|
|
538
|
+
stderr_path: result.stderr_path || null,
|
|
539
|
+
});
|
|
343
540
|
stats.spawn_failures += 1;
|
|
344
|
-
|
|
541
|
+
recordSubsessionFailure(event.cadence);
|
|
542
|
+
const reason = result.error || (stderrTail ? `exit ${result.exit_code}: ${stderrTail}` : `exit ${result.exit_code}`);
|
|
543
|
+
const outcome = failTick(agentRoot, event.id, reason, { maxAttempts });
|
|
345
544
|
if (outcome?.destination === "dlq") stats.dlq += 1;
|
|
346
545
|
else stats.retries += 1;
|
|
347
546
|
return { ok: false, decision: outcome?.destination || "failed" };
|
|
@@ -428,19 +627,38 @@ export function startConsumer(opts = {}) {
|
|
|
428
627
|
recoverStaleClaims(agentRoot);
|
|
429
628
|
|
|
430
629
|
let processed = 0;
|
|
431
|
-
|
|
432
|
-
//
|
|
630
|
+
let escalatedThisTick = 0;
|
|
631
|
+
// Drain inline events as much as the consumer can in one tick; cap
|
|
632
|
+
// sub-session escalations at 1 per tick so a fast-failing cadence
|
|
633
|
+
// can't burn a whole minute's worth of retries inside a single poll.
|
|
634
|
+
// The next poll (DEFAULT_POLL_MS later) will pick up where we left off.
|
|
433
635
|
while (!stopping) {
|
|
434
636
|
const claim = claimNextTick(agentRoot);
|
|
435
637
|
if (!claim) break;
|
|
436
638
|
const event = claim.event;
|
|
437
639
|
activeTick = event.id;
|
|
640
|
+
let didEscalate = false;
|
|
438
641
|
try {
|
|
642
|
+
const def = getCadenceDef(event.cadence);
|
|
643
|
+
const willEscalate = !def || (def.mode !== "inline" && (def.mode !== "guarded" || true));
|
|
644
|
+
// Roughly: if it's not a registry-inline cadence, we MAY escalate.
|
|
645
|
+
// We don't yet know if the guard will say inline; processEvent
|
|
646
|
+
// will tell us via stats. Use the escalated stats delta as the
|
|
647
|
+
// signal that an actual sub-session ran this iteration.
|
|
648
|
+
const before = stats.escalated + stats.spawn_failures + stats.skipped_circuit_open + stats.skipped_backoff;
|
|
439
649
|
await processEvent(event);
|
|
650
|
+
const after = stats.escalated + stats.spawn_failures + stats.skipped_circuit_open + stats.skipped_backoff;
|
|
651
|
+
if (after > before) didEscalate = true;
|
|
652
|
+
// Silence unused var warning.
|
|
653
|
+
void willEscalate;
|
|
440
654
|
} finally {
|
|
441
655
|
activeTick = null;
|
|
442
656
|
}
|
|
443
657
|
processed += 1;
|
|
658
|
+
if (didEscalate) escalatedThisTick += 1;
|
|
659
|
+
// Hard cap: at most ONE sub-session spawn per tick. Inline ticks
|
|
660
|
+
// keep draining freely (they're cheap).
|
|
661
|
+
if (escalatedThisTick >= 1) break;
|
|
444
662
|
if (processed >= 16) break; // soft batch cap
|
|
445
663
|
}
|
|
446
664
|
return { processed };
|
|
@@ -210,9 +210,16 @@ test("unknown cadence with no prompt file DLQ's immediately", async () => {
|
|
|
210
210
|
test("spawn failure retries within the budget, then DLQs", async () => {
|
|
211
211
|
const root = await makeAgentRoot();
|
|
212
212
|
plantPrompt(root, "weekly-strategic-memo");
|
|
213
|
+
// Disable back-off + raise circuit threshold so the test exercises the
|
|
214
|
+
// retry-then-DLQ path without waiting for back-off windows. The
|
|
215
|
+
// back-off and circuit-breaker paths are exercised (with shortened timings) by
|
|
216
|
+
// dedicated tests below.
|
|
213
217
|
const consumer = startConsumer({
|
|
214
218
|
agentRoot: root,
|
|
215
219
|
pollMs: 25,
|
|
220
|
+
backoffSchedule: [0, 0, 0],
|
|
221
|
+
circuitThreshold: 999,
|
|
222
|
+
maxAttempts: 2,
|
|
216
223
|
spawnSession: async () => ({ ok: false, exit_code: 1, error: "always-fail", duration_ms: 1 }),
|
|
217
224
|
});
|
|
218
225
|
try {
|
|
@@ -226,6 +233,68 @@ test("spawn failure retries within the budget, then DLQs", async () => {
|
|
|
226
233
|
}
|
|
227
234
|
});
|
|
228
235
|
|
|
236
|
+
test("circuit breaker opens after consecutive failures and blocks further spawns", async () => {
|
|
237
|
+
const root = await makeAgentRoot();
|
|
238
|
+
plantPrompt(root, "weekly-strategic-memo");
|
|
239
|
+
let spawnCount = 0;
|
|
240
|
+
const consumer = startConsumer({
|
|
241
|
+
agentRoot: root,
|
|
242
|
+
pollMs: 20,
|
|
243
|
+
backoffSchedule: [0, 0, 0],
|
|
244
|
+
circuitThreshold: 2,
|
|
245
|
+
circuitDurationMs: 60_000, // 1 min — long enough for the assertion window
|
|
246
|
+
maxAttempts: 1, // each event DLQs on first failure so we don't conflate retry-counts
|
|
247
|
+
spawnSession: async () => { spawnCount++; return { ok: false, exit_code: 1, error: "fail", duration_ms: 1 }; },
|
|
248
|
+
});
|
|
249
|
+
try {
|
|
250
|
+
// Enqueue 5 events; circuit should open after 2 failures, blocking the rest.
|
|
251
|
+
for (let i = 0; i < 5; i++) {
|
|
252
|
+
enqueueTick({ cadence: "weekly-strategic-memo", source: "launchd", agentRoot: root });
|
|
253
|
+
}
|
|
254
|
+
const opened = await waitFor(() => consumer.getStats().skipped_circuit_open >= 1, { timeoutMs: 10_000 });
|
|
255
|
+
assert.ok(opened, `circuit should open; stats=${JSON.stringify(consumer.getStats())}`);
|
|
256
|
+
// Spawn count must NOT keep climbing once the circuit is open.
|
|
257
|
+
const spawnsAtOpen = spawnCount;
|
|
258
|
+
await new Promise((r) => setTimeout(r, 500));
|
|
259
|
+
assert.equal(spawnCount, spawnsAtOpen, `spawns must stop once circuit opens (was ${spawnsAtOpen}, now ${spawnCount})`);
|
|
260
|
+
} finally {
|
|
261
|
+
await consumer.stop();
|
|
262
|
+
await rmRoot(root);
|
|
263
|
+
}
|
|
264
|
+
});
|
|
265
|
+
|
|
266
|
+
test("back-off skips re-spawning until the cooldown elapses", async () => {
|
|
267
|
+
const root = await makeAgentRoot();
|
|
268
|
+
plantPrompt(root, "weekly-strategic-memo");
|
|
269
|
+
let spawnCount = 0;
|
|
270
|
+
const consumer = startConsumer({
|
|
271
|
+
agentRoot: root,
|
|
272
|
+
pollMs: 20,
|
|
273
|
+
backoffSchedule: [0, 300, 300], // 300ms cooldown after each failure
|
|
274
|
+
circuitThreshold: 999,
|
|
275
|
+
maxAttempts: 1,
|
|
276
|
+
spawnSession: async () => { spawnCount++; return { ok: false, exit_code: 1, error: "fail", duration_ms: 1 }; },
|
|
277
|
+
});
|
|
278
|
+
try {
|
|
279
|
+
// Enqueue 2 events back-to-back. The 1st triggers a spawn (fails). The
|
|
280
|
+
// 2nd should be held back by the 300ms back-off window.
|
|
281
|
+
enqueueTick({ cadence: "weekly-strategic-memo", source: "launchd", agentRoot: root });
|
|
282
|
+
enqueueTick({ cadence: "weekly-strategic-memo", source: "launchd", agentRoot: root });
|
|
283
|
+
await waitFor(() => spawnCount >= 1, { timeoutMs: 5_000 });
|
|
284
|
+
const spawnsBeforeWait = spawnCount;
|
|
285
|
+
// During the back-off window no new spawn should fire.
|
|
286
|
+
await new Promise((r) => setTimeout(r, 150));
|
|
287
|
+
assert.ok(spawnCount === spawnsBeforeWait, `spawns must wait for back-off (was ${spawnsBeforeWait}, now ${spawnCount})`);
|
|
288
|
+
assert.ok(consumer.getStats().skipped_backoff >= 1, "skipped_backoff should be recorded");
|
|
289
|
+
// After the window passes, the next event should be processed.
|
|
290
|
+
await waitFor(() => spawnCount > spawnsBeforeWait, { timeoutMs: 5_000 });
|
|
291
|
+
assert.ok(spawnCount > spawnsBeforeWait, "spawning resumes after back-off");
|
|
292
|
+
} finally {
|
|
293
|
+
await consumer.stop();
|
|
294
|
+
await rmRoot(root);
|
|
295
|
+
}
|
|
296
|
+
});
|
|
297
|
+
|
|
229
298
|
// ---------------------------------------------------------------------------
|
|
230
299
|
// Emergency stop
|
|
231
300
|
// ---------------------------------------------------------------------------
|