@adaptic/maestro 1.8.4 → 1.9.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.json +11 -0
- package/agents/engineering-oversight/agent.md +44 -0
- package/agents/github-operator/agent.md +38 -0
- package/agents/inbox-processor/agent.md +39 -0
- package/bin/maestro.mjs +302 -4
- package/framework-features.json +107 -0
- package/lib/feature-init.mjs +297 -0
- package/package.json +5 -2
- package/scaffold/config/known-agents.json +57 -8
- package/scripts/cost/track-claude-usage.mjs +154 -0
- package/scripts/daemon/cadence-consumer.mjs +287 -12
- package/scripts/daemon/cadence-consumer.test.mjs +69 -0
- package/scripts/decisions/capture-decision.mjs +116 -0
- package/scripts/emergency-stop.sh +56 -19
- package/scripts/hooks/session-start-banner.sh +79 -0
- package/scripts/maintenance/backup-to-cloud.sh +124 -0
- package/scripts/rag/ingest.mjs +111 -0
- package/scripts/rag/search.mjs +119 -0
- package/scripts/resume-operations.sh +50 -13
- package/scripts/setup/init-backup.mjs +54 -0
- package/scripts/setup/init-cadence-bus.mjs +60 -0
- package/scripts/setup/init-cost-tracking.mjs +45 -0
- package/scripts/setup/init-decision-capture.mjs +66 -0
- package/scripts/setup/init-known-agents.mjs +57 -0
- package/scripts/setup/init-memory-executive.mjs +45 -0
- package/scripts/setup/init-rag.mjs +103 -0
- package/scripts/setup/init-session-router.mjs +38 -0
|
@@ -43,9 +43,10 @@
|
|
|
43
43
|
* logger optional fn({ ts, level, …rest }) → void for tests.
|
|
44
44
|
*/
|
|
45
45
|
|
|
46
|
-
import { existsSync, readFileSync, writeFileSync } from "node:fs";
|
|
46
|
+
import { existsSync, readFileSync, writeFileSync, mkdirSync, appendFileSync, openSync, closeSync, statSync, unlinkSync } from "node:fs";
|
|
47
47
|
import { join } from "node:path";
|
|
48
48
|
import { spawn } from "node:child_process";
|
|
49
|
+
import { homedir } from "node:os";
|
|
49
50
|
|
|
50
51
|
import {
|
|
51
52
|
ensureBusDirs,
|
|
@@ -74,6 +75,17 @@ const DEFAULT_SPAWN_TIMEOUT_MS = 30 * 60_000;
|
|
|
74
75
|
// preferable to thrashing Claude / hitting usage limits.
|
|
75
76
|
const MAX_CONCURRENT_SUB_SESSIONS = 1;
|
|
76
77
|
|
|
78
|
+
// Retry policy. Most cadence failures are systemic (broken prompt, bad
// auth, transient API errors) — 5 retries doesn't help, it just amplifies
// the burn. A small attempt budget with escalating back-off is the right
// balance.
const DEFAULT_MAX_ATTEMPTS = 2;
// Cool-down applied after a failure, indexed by the cadence's consecutive
// failure count and clamped to the last entry: 1st failure +30s, 2nd and
// later +2m. Slot 0 is a placeholder — the counter is already >= 1 when the
// schedule is consulted. Frozen because this same array instance is shared
// by every consumer that does not pass its own `backoffSchedule`.
const BACKOFF_SCHEDULE_MS = Object.freeze([0, 30_000, 120_000]);

// Circuit breaker — when 3 same-cadence failures land in a row, stop
// spawning that cadence for 30 minutes. Prevents launchd-rate runaway.
const CIRCUIT_OPEN_THRESHOLD = 3;
const CIRCUIT_OPEN_DURATION_MS = 30 * 60_000;
|
|
88
|
+
|
|
77
89
|
// ---------------------------------------------------------------------------
|
|
78
90
|
// Helpers
|
|
79
91
|
// ---------------------------------------------------------------------------
|
|
@@ -92,10 +104,49 @@ function defaultLogger(entry) {
|
|
|
92
104
|
}
|
|
93
105
|
}
|
|
94
106
|
|
|
107
|
+
/**
 * Resolve an absolute path to the Claude CLI. launchd's bare environment
 * does NOT include /Users/<u>/.local/bin or homebrew on PATH, so a plain
 * `spawn('claude', …)` fails with ENOENT — which is exactly what was
 * stuck in ravi-ai's DLQ. This resolver returns the first usable
 * candidate among:
 *
 *   1. $CLAUDE_BIN env var (if set)
 *   2. ~/.local/bin/claude           (default Claude Code install path)
 *   3. /opt/homebrew/bin/claude      (homebrew on Apple Silicon)
 *   4. /usr/local/bin/claude         (homebrew on Intel)
 *   5. /usr/bin/claude
 *
 * "Usable" means the path resolves (symlinks followed) to a regular file
 * with at least one execute bit set; the execute-bit test is skipped on
 * Windows, where mode bits do not describe executability. Directories and
 * non-executable files are passed over.
 *
 * A successful lookup is cached for the life of the process. When nothing
 * is found the bare name "claude" is returned WITHOUT caching, so the
 * spawn's own ENOENT stays informative and a binary installed later is
 * picked up on the next call.
 *
 * @returns {string} Absolute path to the CLI, or "claude" as a last resort.
 */
let _resolvedClaude = null;
function resolveClaudeBin() {
  if (_resolvedClaude) return _resolvedClaude;

  // True when `candidate` is a regular, executable file.
  const isUsable = (candidate) => {
    try {
      const info = statSync(candidate);
      if (!info.isFile()) return false;
      return process.platform === "win32" || (info.mode & 0o111) !== 0;
    } catch {
      // Missing or unreadable path — simply not a usable candidate.
      return false;
    }
  };

  const candidates = [
    process.env.CLAUDE_BIN,
    join(homedir(), ".local/bin/claude"),
    "/opt/homebrew/bin/claude",
    "/usr/local/bin/claude",
    "/usr/bin/claude",
  ].filter(Boolean);

  const found = candidates.find(isUsable);
  if (found) {
    _resolvedClaude = found;
    return found;
  }
  return "claude"; // last-resort; spawn will report ENOENT
}
|
|
140
|
+
|
|
95
141
|
/**
|
|
96
142
|
* Spawn a sub-session running the cadence's trigger prompt and resolve
|
|
97
|
-
* with { exit_code, durationMs }. Reads the prompt at call
|
|
98
|
-
* latest version (possibly upgraded between ticks) is always
|
|
143
|
+
* with { exit_code, durationMs, stderr_tail }. Reads the prompt at call
|
|
144
|
+
* time so the latest version (possibly upgraded between ticks) is always
|
|
145
|
+
* used.
|
|
146
|
+
*
|
|
147
|
+
* Robustness: stdout + stderr are tee'd to logs/cadence-bus/subsessions/
|
|
148
|
+
* so non-zero exits remain diagnosable after the fact. The last ~4 KB of
|
|
149
|
+
* stderr is also captured in-memory and surfaced on the failure event.
|
|
99
150
|
*/
|
|
100
151
|
function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {
|
|
101
152
|
return new Promise((resolveOut) => {
|
|
@@ -111,17 +162,50 @@ function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {
|
|
|
111
162
|
return;
|
|
112
163
|
}
|
|
113
164
|
|
|
114
|
-
const bin =
|
|
165
|
+
const bin = resolveClaudeBin();
|
|
115
166
|
const args = ["--print", "--dangerously-skip-permissions", body];
|
|
116
|
-
|
|
167
|
+
// Augment PATH so any tool the subsession invokes (jq, node, etc.)
|
|
168
|
+
// can still be found. launchd's bare env strips /opt/homebrew/bin etc.
|
|
169
|
+
const augmentedPath = [
|
|
170
|
+
process.env.PATH || "",
|
|
171
|
+
`${homedir()}/.local/bin`,
|
|
172
|
+
"/opt/homebrew/bin",
|
|
173
|
+
"/opt/homebrew/sbin",
|
|
174
|
+
"/usr/local/bin",
|
|
175
|
+
"/usr/bin",
|
|
176
|
+
"/bin",
|
|
177
|
+
"/usr/sbin",
|
|
178
|
+
"/sbin",
|
|
179
|
+
].filter(Boolean).join(":");
|
|
180
|
+
const env = {
|
|
181
|
+
...process.env,
|
|
182
|
+
AGENT_ROOT: agentRoot,
|
|
183
|
+
AGENT_DIR: agentRoot,
|
|
184
|
+
PATH: augmentedPath,
|
|
185
|
+
};
|
|
117
186
|
const started = Date.now();
|
|
118
187
|
|
|
119
|
-
log
|
|
188
|
+
// Per-run log file. Pattern is short enough to be tail-friendly.
|
|
189
|
+
const logsDir = join(agentRoot, "logs", "cadence-bus", "subsessions");
|
|
190
|
+
mkdirSync(logsDir, { recursive: true });
|
|
191
|
+
const date = new Date().toISOString().slice(0, 10);
|
|
192
|
+
const stamp = new Date().toISOString().replace(/[:.]/g, "-");
|
|
193
|
+
const stdoutPath = join(logsDir, `${date}-${cadence}-${stamp}.stdout.log`);
|
|
194
|
+
const stderrPath = join(logsDir, `${date}-${cadence}-${stamp}.stderr.log`);
|
|
195
|
+
const stdoutFd = openSync(stdoutPath, "a");
|
|
196
|
+
const stderrFd = openSync(stderrPath, "a");
|
|
197
|
+
|
|
198
|
+
log({ level: "info", stage: "subsession_spawn", cadence, bin, stdout: stdoutPath, stderr: stderrPath });
|
|
120
199
|
|
|
121
200
|
let child;
|
|
122
201
|
try {
|
|
123
|
-
|
|
202
|
+
// stdio:
|
|
203
|
+
// 0 ignore (claude --print reads prompt from argv, not stdin)
|
|
204
|
+
// 1 → file (capture stdout for later inspection)
|
|
205
|
+
// 2 → file (capture stderr — critical for diagnosing exit-1)
|
|
206
|
+
child = spawn(bin, args, { cwd: agentRoot, env, stdio: ["ignore", stdoutFd, stderrFd] });
|
|
124
207
|
} catch (err) {
|
|
208
|
+
try { closeSync(stdoutFd); closeSync(stderrFd); } catch { /* */ }
|
|
125
209
|
resolveOut({ ok: false, exit_code: -4, error: `spawn failed: ${err.message}` });
|
|
126
210
|
return;
|
|
127
211
|
}
|
|
@@ -135,18 +219,63 @@ function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {
|
|
|
135
219
|
|
|
136
220
|
child.on("exit", (code, signal) => {
|
|
137
221
|
clearTimeout(timer);
|
|
222
|
+
try { closeSync(stdoutFd); closeSync(stderrFd); } catch { /* */ }
|
|
138
223
|
const durationMs = Date.now() - started;
|
|
139
224
|
const exit_code = typeof code === "number" ? code : (signal ? -1 : -5);
|
|
225
|
+
|
|
226
|
+
// Pull tail of stderr (and stdout if stderr empty) for the failure
|
|
227
|
+
// surface. Best-effort; we never block on file size.
|
|
228
|
+
let stderrTail = "";
|
|
229
|
+
try {
|
|
230
|
+
const body = readFileSync(stderrPath, "utf-8");
|
|
231
|
+
stderrTail = body.slice(-4096);
|
|
232
|
+
if (!stderrTail.trim()) {
|
|
233
|
+
const so = readFileSync(stdoutPath, "utf-8");
|
|
234
|
+
stderrTail = so.slice(-4096);
|
|
235
|
+
}
|
|
236
|
+
} catch { /* file may not exist if spawn ENOENT before fd-redirect */ }
|
|
237
|
+
|
|
238
|
+
// Record cost-ledger row. Token counts are 0 until we parse the
|
|
239
|
+
// session's JSON output; for now exit-code + duration are enough
|
|
240
|
+
// to spot pathological retry loops.
|
|
241
|
+
try {
|
|
242
|
+
const trackerPath = join(agentRoot, "scripts/cost/track-claude-usage.mjs");
|
|
243
|
+
if (existsSync(trackerPath)) {
|
|
244
|
+
spawn(process.execPath, [
|
|
245
|
+
trackerPath, "record",
|
|
246
|
+
"--cadence", cadence,
|
|
247
|
+
"--source", "cadence-consumer",
|
|
248
|
+
"--model", "sonnet",
|
|
249
|
+
"--duration-ms", String(durationMs),
|
|
250
|
+
"--input-tokens", "0",
|
|
251
|
+
"--output-tokens", "0",
|
|
252
|
+
"--exit", String(exit_code),
|
|
253
|
+
], { stdio: "ignore", env: { ...env, AGENT_ROOT: agentRoot } }).unref();
|
|
254
|
+
}
|
|
255
|
+
} catch { /* cost tracking is best-effort */ }
|
|
256
|
+
|
|
257
|
+
// Clean up empty log files so the directory doesn't accumulate
|
|
258
|
+
// hundreds of zero-byte successes.
|
|
259
|
+
try {
|
|
260
|
+
|
|
261
|
+
if (statSync(stdoutPath).size === 0) unlinkSync(stdoutPath);
|
|
262
|
+
if (statSync(stderrPath).size === 0) unlinkSync(stderrPath);
|
|
263
|
+
} catch { /* */ }
|
|
264
|
+
|
|
140
265
|
resolveOut({
|
|
141
266
|
ok: exit_code === 0,
|
|
142
267
|
exit_code,
|
|
143
268
|
signal: signal || null,
|
|
144
269
|
duration_ms: durationMs,
|
|
270
|
+
stderr_tail: stderrTail || null,
|
|
271
|
+
stdout_path: stdoutPath,
|
|
272
|
+
stderr_path: stderrPath,
|
|
145
273
|
});
|
|
146
274
|
});
|
|
147
275
|
|
|
148
276
|
child.on("error", (err) => {
|
|
149
277
|
clearTimeout(timer);
|
|
278
|
+
try { closeSync(stdoutFd); closeSync(stderrFd); } catch { /* */ }
|
|
150
279
|
const durationMs = Date.now() - started;
|
|
151
280
|
resolveOut({ ok: false, exit_code: -6, error: err.message, duration_ms: durationMs });
|
|
152
281
|
});
|
|
@@ -171,6 +300,11 @@ export function startConsumer(opts = {}) {
|
|
|
171
300
|
const maxSpawnMs = opts.maxSpawnMs ?? DEFAULT_SPAWN_TIMEOUT_MS;
|
|
172
301
|
const spawnSession = opts.spawnSession || realSpawnSession;
|
|
173
302
|
const userLogger = opts.logger;
|
|
303
|
+
// Test / tuning hooks for the reliability layer.
|
|
304
|
+
const backoffSchedule = opts.backoffSchedule || BACKOFF_SCHEDULE_MS;
|
|
305
|
+
const circuitThreshold = opts.circuitThreshold ?? CIRCUIT_OPEN_THRESHOLD;
|
|
306
|
+
const circuitDurationMs = opts.circuitDurationMs ?? CIRCUIT_OPEN_DURATION_MS;
|
|
307
|
+
const maxAttempts = opts.maxAttempts ?? DEFAULT_MAX_ATTEMPTS;
|
|
174
308
|
|
|
175
309
|
const stats = {
|
|
176
310
|
started_at: new Date().toISOString(),
|
|
@@ -178,6 +312,8 @@ export function startConsumer(opts = {}) {
|
|
|
178
312
|
inline: 0,
|
|
179
313
|
escalated: 0,
|
|
180
314
|
skipped_emergency_stop: 0,
|
|
315
|
+
skipped_circuit_open: 0,
|
|
316
|
+
skipped_backoff: 0,
|
|
181
317
|
dlq: 0,
|
|
182
318
|
retries: 0,
|
|
183
319
|
spawn_failures: 0,
|
|
@@ -190,6 +326,75 @@ export function startConsumer(opts = {}) {
|
|
|
190
326
|
let timers = [];
|
|
191
327
|
let activeSubSessions = 0;
|
|
192
328
|
|
|
329
|
+
// Per-cadence reliability state. Tracks consecutive failure count and
|
|
330
|
+
// the earliest moment we'll allow another spawn for that cadence.
|
|
331
|
+
// Persists nothing — circuit state is in-memory only. On daemon restart
|
|
332
|
+
// we get a fresh slate; that's intentional (operators expect a restart
|
|
333
|
+
// to mean "try again now").
|
|
334
|
+
const cadenceState = new Map(); // cadence → { failures, openUntil, nextAllowedAt }
|
|
335
|
+
|
|
336
|
+
/**
 * Look up the mutable reliability record for `cadence` in `cadenceState`,
 * inserting a zeroed record the first time a cadence is seen.
 *
 * @param {string} cadence
 * @returns {{ failures: number, openUntil: number, nextAllowedAt: number }}
 */
function getCadenceState(cadence) {
  const existing = cadenceState.get(cadence);
  if (existing) return existing;

  const fresh = { failures: 0, openUntil: 0, nextAllowedAt: 0 };
  cadenceState.set(cadence, fresh);
  return fresh;
}
|
|
341
|
+
|
|
342
|
+
/**
 * Wipe the failure bookkeeping for `cadence`: the consecutive-failure
 * counter, the circuit-open deadline and the back-off deadline all return
 * to zero.
 *
 * @param {string} cadence
 */
function recordSubsessionSuccess(cadence) {
  Object.assign(getCadenceState(cadence), {
    failures: 0,
    openUntil: 0,
    nextAllowedAt: 0,
  });
}
|
|
348
|
+
|
|
349
|
+
/**
 * Register one more consecutive failure for `cadence`.
 *
 * Sets the earliest time the cadence may spawn again from the
 * (test-overridable) `backoffSchedule`, and — once the failure count
 * reaches `circuitThreshold` — opens the circuit for `circuitDurationMs`,
 * logs a `circuit_opened` event and refreshes the on-disk snapshot.
 *
 * @param {string} cadence
 */
function recordSubsessionFailure(cadence) {
  const s = getCadenceState(cadence);
  // One clock read so the back-off and circuit deadlines share a baseline.
  const now = Date.now();
  s.failures += 1;
  // Schedule lookup, clamped to the last entry. An empty schedule yields
  // index -1 → undefined; treat that as "no back-off" instead of letting
  // the deadline become NaN.
  const idx = Math.min(s.failures, backoffSchedule.length - 1);
  s.nextAllowedAt = now + (backoffSchedule[idx] ?? 0);
  if (s.failures >= circuitThreshold) {
    s.openUntil = now + circuitDurationMs;
    log({ level: "error", stage: "circuit_opened", cadence, failures: s.failures, open_until: new Date(s.openUntil).toISOString() });
    writeCircuitFile();
  }
}
|
|
361
|
+
|
|
362
|
+
/**
 * Mirror the currently-open circuits to
 * `<agentRoot>/state/cadence-bus/circuit-open.json` so doctor and the
 * operator can see which cadences are held back without scraping logs.
 * The file is deleted when no circuit is open. All filesystem errors are
 * swallowed: the snapshot is best-effort.
 */
function writeCircuitFile() {
  const open = {};
  cadenceState.forEach((state, name) => {
    if (state.openUntil > Date.now()) {
      open[name] = {
        failures: state.failures,
        open_until: new Date(state.openUntil).toISOString(),
      };
    }
  });

  const path = join(agentRoot, "state/cadence-bus/circuit-open.json");
  try {
    if (Object.keys(open).length > 0) {
      const snapshot = { generated: new Date().toISOString(), open };
      writeFileSync(path, `${JSON.stringify(snapshot, null, 2)}\n`);
      return;
    }
    // Nothing is open any more — drop the stale snapshot if one exists.
    try { unlinkSync(path); } catch { /* */ }
  } catch { /* best-effort */ }
}
|
|
382
|
+
|
|
383
|
+
/**
 * Decide whether `cadence` may spawn a sub-session right now.
 *
 * Returns `{ allowed: false, reason, retry_at }` while the circuit is open
 * ("circuit-open") or a back-off window is pending ("backoff"); otherwise
 * `{ allowed: true }`. As a side effect, a circuit whose deadline has
 * passed is closed here: its counters are reset, a `circuit_closed` event
 * is logged and the on-disk snapshot is refreshed.
 *
 * @param {string} cadence
 * @returns {{ allowed: boolean, reason?: string, retry_at?: number }}
 */
function isCadenceAllowed(cadence) {
  const state = getCadenceState(cadence);
  const now = Date.now();
  const blocked = (reason, retry_at) => ({ allowed: false, reason, retry_at });

  if (state.openUntil > now) return blocked("circuit-open", state.openUntil);
  if (state.nextAllowedAt > now) return blocked("backoff", state.nextAllowedAt);

  // Circuit closes automatically when openUntil passes.
  const circuitExpired = Boolean(state.openUntil) && state.openUntil <= now;
  if (circuitExpired) {
    state.openUntil = 0;
    state.failures = 0;
    log({ level: "info", stage: "circuit_closed", cadence });
    writeCircuitFile();
  }
  return { allowed: true };
}
|
|
397
|
+
|
|
193
398
|
function log(entry) {
|
|
194
399
|
const enriched = { ts: new Date().toISOString(), ...entry };
|
|
195
400
|
logBusEvent(agentRoot, enriched);
|
|
@@ -209,6 +414,32 @@ export function startConsumer(opts = {}) {
|
|
|
209
414
|
}
|
|
210
415
|
|
|
211
416
|
async function escalate(event) {
|
|
417
|
+
// Circuit-breaker / back-off gate. If this cadence is currently held
|
|
418
|
+
// back, requeue without spawning. The event keeps its attempt count
|
|
419
|
+
// because the failure was upstream (not a per-event problem).
|
|
420
|
+
const gate = isCadenceAllowed(event.cadence);
|
|
421
|
+
if (!gate.allowed) {
|
|
422
|
+
log({
|
|
423
|
+
level: "warn",
|
|
424
|
+
stage: gate.reason === "circuit-open" ? "skipped_circuit_open" : "skipped_backoff",
|
|
425
|
+
id: event.id,
|
|
426
|
+
cadence: event.cadence,
|
|
427
|
+
retry_at: new Date(gate.retry_at).toISOString(),
|
|
428
|
+
});
|
|
429
|
+
if (gate.reason === "circuit-open") stats.skipped_circuit_open += 1;
|
|
430
|
+
else stats.skipped_backoff += 1;
|
|
431
|
+
// Put the event back in inbox WITHOUT bumping attempts so it doesn't
|
|
432
|
+
// burn its retry budget while the circuit is open.
|
|
433
|
+
const paths2 = getBusPaths(agentRoot);
|
|
434
|
+
try {
|
|
435
|
+
const event2 = { ...event, attempts: Math.max(0, (event.attempts || 1) - 1) };
|
|
436
|
+
writeFileSync(join(paths2.inbox, `${event.id}.json`), JSON.stringify(event2, null, 2) + "\n");
|
|
437
|
+
|
|
438
|
+
try { unlinkSync(join(paths2.claimed, `${event.id}.json`)); } catch { /* */ }
|
|
439
|
+
} catch { /* best-effort */ }
|
|
440
|
+
return { ok: false, decision: gate.reason };
|
|
441
|
+
}
|
|
442
|
+
|
|
212
443
|
if (activeSubSessions >= MAX_CONCURRENT_SUB_SESSIONS) {
|
|
213
444
|
// Re-queue and try again next tick. Single-owner cadence consumer
|
|
214
445
|
// means this can only happen when a prior tick is still running —
|
|
@@ -220,7 +451,15 @@ export function startConsumer(opts = {}) {
|
|
|
220
451
|
cadence: event.cadence,
|
|
221
452
|
active_subsessions: activeSubSessions,
|
|
222
453
|
});
|
|
223
|
-
|
|
454
|
+
// Re-queue without burning the retry budget — concurrent-spawn isn't
|
|
455
|
+
// a per-event failure.
|
|
456
|
+
const paths2 = getBusPaths(agentRoot);
|
|
457
|
+
try {
|
|
458
|
+
const event2 = { ...event, attempts: Math.max(0, (event.attempts || 1) - 1) };
|
|
459
|
+
writeFileSync(join(paths2.inbox, `${event.id}.json`), JSON.stringify(event2, null, 2) + "\n");
|
|
460
|
+
|
|
461
|
+
try { unlinkSync(join(paths2.claimed, `${event.id}.json`)); } catch { /* */ }
|
|
462
|
+
} catch { /* best-effort */ }
|
|
224
463
|
stats.retries += 1;
|
|
225
464
|
return { ok: false, decision: "deferred" };
|
|
226
465
|
}
|
|
@@ -263,14 +502,31 @@ export function startConsumer(opts = {}) {
|
|
|
263
502
|
prompt: promptPath,
|
|
264
503
|
exit_code: result.exit_code,
|
|
265
504
|
duration_ms: result.duration_ms,
|
|
505
|
+
stdout_path: result.stdout_path || null,
|
|
506
|
+
stderr_path: result.stderr_path || null,
|
|
266
507
|
});
|
|
508
|
+
recordSubsessionSuccess(event.cadence);
|
|
267
509
|
stats.escalated += 1;
|
|
268
510
|
stats.last_decision = "escalated";
|
|
269
511
|
return { ok: true, decision: "escalated", exit_code: result.exit_code };
|
|
270
512
|
}
|
|
271
|
-
|
|
513
|
+
// Failure path: log + cap retries low. The exact stderr tail comes
|
|
514
|
+
// from the spawn helper so we never DLQ "blind" again.
|
|
515
|
+
const stderrTail = (result.stderr_tail || "").trim().split("\n").slice(-3).join(" | ");
|
|
516
|
+
log({
|
|
517
|
+
level: "error",
|
|
518
|
+
stage: "subsession_failed",
|
|
519
|
+
id: event.id,
|
|
520
|
+
cadence: event.cadence,
|
|
521
|
+
exit_code: result.exit_code,
|
|
522
|
+
duration_ms: result.duration_ms,
|
|
523
|
+
error: result.error || stderrTail || `exit ${result.exit_code}`,
|
|
524
|
+
stderr_path: result.stderr_path || null,
|
|
525
|
+
});
|
|
272
526
|
stats.spawn_failures += 1;
|
|
273
|
-
|
|
527
|
+
recordSubsessionFailure(event.cadence);
|
|
528
|
+
const reason = result.error || (stderrTail ? `exit ${result.exit_code}: ${stderrTail}` : `exit ${result.exit_code}`);
|
|
529
|
+
const outcome = failTick(agentRoot, event.id, reason, { maxAttempts });
|
|
274
530
|
if (outcome?.destination === "dlq") stats.dlq += 1;
|
|
275
531
|
else stats.retries += 1;
|
|
276
532
|
return { ok: false, decision: outcome?.destination || "failed" };
|
|
@@ -357,19 +613,38 @@ export function startConsumer(opts = {}) {
|
|
|
357
613
|
recoverStaleClaims(agentRoot);
|
|
358
614
|
|
|
359
615
|
let processed = 0;
|
|
360
|
-
|
|
361
|
-
//
|
|
616
|
+
let escalatedThisTick = 0;
|
|
617
|
+
// Drain inline events as much as the consumer can in one tick; cap
|
|
618
|
+
// sub-session escalations at 1 per tick so a fast-failing cadence
|
|
619
|
+
// can't burn a whole minute's worth of retries inside a single poll.
|
|
620
|
+
// The next poll (DEFAULT_POLL_MS later) will pick up where we left off.
|
|
362
621
|
while (!stopping) {
|
|
363
622
|
const claim = claimNextTick(agentRoot);
|
|
364
623
|
if (!claim) break;
|
|
365
624
|
const event = claim.event;
|
|
366
625
|
activeTick = event.id;
|
|
626
|
+
let didEscalate = false;
|
|
367
627
|
try {
|
|
628
|
+
const def = getCadenceDef(event.cadence);
|
|
629
|
+
const willEscalate = !def || (def.mode !== "inline" && (def.mode !== "guarded" || true));
|
|
630
|
+
// Roughly: if it's not a registry-inline cadence, we MAY escalate.
|
|
631
|
+
// We don't yet know if the guard will say inline; processEvent
|
|
632
|
+
// will tell us via stats. Use the escalated stats delta as the
|
|
633
|
+
// signal that an actual sub-session ran this iteration.
|
|
634
|
+
const before = stats.escalated + stats.spawn_failures + stats.skipped_circuit_open + stats.skipped_backoff;
|
|
368
635
|
await processEvent(event);
|
|
636
|
+
const after = stats.escalated + stats.spawn_failures + stats.skipped_circuit_open + stats.skipped_backoff;
|
|
637
|
+
if (after > before) didEscalate = true;
|
|
638
|
+
// Silence unused var warning.
|
|
639
|
+
void willEscalate;
|
|
369
640
|
} finally {
|
|
370
641
|
activeTick = null;
|
|
371
642
|
}
|
|
372
643
|
processed += 1;
|
|
644
|
+
if (didEscalate) escalatedThisTick += 1;
|
|
645
|
+
// Hard cap: at most ONE sub-session spawn per tick. Inline ticks
|
|
646
|
+
// keep draining freely (they're cheap).
|
|
647
|
+
if (escalatedThisTick >= 1) break;
|
|
373
648
|
if (processed >= 16) break; // soft batch cap
|
|
374
649
|
}
|
|
375
650
|
return { processed };
|
|
@@ -210,9 +210,16 @@ test("unknown cadence with no prompt file DLQ's immediately", async () => {
|
|
|
210
210
|
test("spawn failure retries within the budget, then DLQs", async () => {
|
|
211
211
|
const root = await makeAgentRoot();
|
|
212
212
|
plantPrompt(root, "weekly-strategic-memo");
|
|
213
|
+
// Disable back-off + raise circuit threshold so the test exercises the
|
|
214
|
+
// retry-then-DLQ path without waiting for back-off windows. The
|
|
215
|
+
// real defaults (30s/2m back-off, 3-failure circuit) are exercised by
|
|
216
|
+
// dedicated tests below.
|
|
213
217
|
const consumer = startConsumer({
|
|
214
218
|
agentRoot: root,
|
|
215
219
|
pollMs: 25,
|
|
220
|
+
backoffSchedule: [0, 0, 0],
|
|
221
|
+
circuitThreshold: 999,
|
|
222
|
+
maxAttempts: 2,
|
|
216
223
|
spawnSession: async () => ({ ok: false, exit_code: 1, error: "always-fail", duration_ms: 1 }),
|
|
217
224
|
});
|
|
218
225
|
try {
|
|
@@ -226,6 +233,68 @@ test("spawn failure retries within the budget, then DLQs", async () => {
|
|
|
226
233
|
}
|
|
227
234
|
});
|
|
228
235
|
|
|
236
|
+
test("circuit breaker opens after consecutive failures and blocks further spawns", async () => {
  const agentDir = await makeAgentRoot();
  plantPrompt(agentDir, "weekly-strategic-memo");

  // Spawn hook that always fails and counts how often it was invoked.
  let spawnCount = 0;
  const alwaysFail = async () => {
    spawnCount += 1;
    return { ok: false, exit_code: 1, error: "fail", duration_ms: 1 };
  };

  const consumer = startConsumer({
    agentRoot: agentDir,
    pollMs: 20,
    backoffSchedule: [0, 0, 0],
    circuitThreshold: 2,
    circuitDurationMs: 60_000, // 1 min — long enough for the assertion window
    maxAttempts: 1, // each event DLQs on first failure so we don't conflate retry-counts
    spawnSession: alwaysFail,
  });

  try {
    // Enqueue 5 events; circuit should open after 2 failures, blocking the rest.
    Array.from({ length: 5 }).forEach(() => {
      enqueueTick({ cadence: "weekly-strategic-memo", source: "launchd", agentRoot: agentDir });
    });

    const circuitTripped = await waitFor(
      () => consumer.getStats().skipped_circuit_open >= 1,
      { timeoutMs: 10_000 },
    );
    assert.ok(circuitTripped, `circuit should open; stats=${JSON.stringify(consumer.getStats())}`);

    // Spawn count must NOT keep climbing once the circuit is open.
    const baseline = spawnCount;
    await new Promise((done) => setTimeout(done, 500));
    assert.equal(spawnCount, baseline, `spawns must stop once circuit opens (was ${baseline}, now ${spawnCount})`);
  } finally {
    await consumer.stop();
    await rmRoot(agentDir);
  }
});
|
|
265
|
+
|
|
266
|
+
test("back-off skips re-spawning until the cooldown elapses", async () => {
  const root = await makeAgentRoot();
  plantPrompt(root, "weekly-strategic-memo");
  let spawnCount = 0;
  const consumer = startConsumer({
    agentRoot: root,
    pollMs: 20,
    backoffSchedule: [0, 300, 300], // 300ms cooldown after each failure
    circuitThreshold: 999, // keep the circuit breaker out of this test
    maxAttempts: 1,
    spawnSession: async () => { spawnCount++; return { ok: false, exit_code: 1, error: "fail", duration_ms: 1 }; },
  });
  try {
    // Enqueue 2 events back-to-back. The 1st triggers a spawn (fails). The
    // 2nd should be held back by the 300ms back-off window.
    enqueueTick({ cadence: "weekly-strategic-memo", source: "launchd", agentRoot: root });
    enqueueTick({ cadence: "weekly-strategic-memo", source: "launchd", agentRoot: root });
    await waitFor(() => spawnCount >= 1, { timeoutMs: 5_000 });
    // Fail loudly if the first spawn never happened; otherwise the
    // "no new spawn" check below would pass vacuously at zero.
    assert.ok(spawnCount >= 1, "first event should reach the spawn hook");
    const spawnsBeforeWait = spawnCount;
    // During the back-off window no new spawn should fire.
    await new Promise((r) => setTimeout(r, 150));
    assert.equal(spawnCount, spawnsBeforeWait, `spawns must wait for back-off (was ${spawnsBeforeWait}, now ${spawnCount})`);
    assert.ok(consumer.getStats().skipped_backoff >= 1, "skipped_backoff should be recorded");
    // After the window passes, the next event should be processed.
    await waitFor(() => spawnCount > spawnsBeforeWait, { timeoutMs: 5_000 });
    assert.ok(spawnCount > spawnsBeforeWait, "spawning resumes after back-off");
  } finally {
    await consumer.stop();
    await rmRoot(root);
  }
});
|
|
297
|
+
|
|
229
298
|
// ---------------------------------------------------------------------------
|
|
230
299
|
// Emergency stop
|
|
231
300
|
// ---------------------------------------------------------------------------
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* scripts/decisions/capture-decision.mjs — Record a decision into the
|
|
4
|
+
* agent's decision log.
|
|
5
|
+
*
|
|
6
|
+
* Decisions land in knowledge/decisions/DEC-YYYY-MM-DD-NNN.yaml and the
|
|
7
|
+
 * index in knowledge/decisions/index.yaml is updated. Each run allocates
|
|
8
|
+
 * the next free NNN for today, so re-running records a new decision.
|
|
9
|
+
*
|
|
10
|
+
* Usage:
|
|
11
|
+
* node scripts/decisions/capture-decision.mjs \
|
|
12
|
+
* --title "Adopt cadence bus" \
|
|
13
|
+
* --domain "infrastructure" \
|
|
14
|
+
* --decision "Use a local file-backed event bus consumed by a single persistent main session." \
|
|
15
|
+
* --rationale "Reduces Claude Code spawn cost; enables centralised throttling." \
|
|
16
|
+
* --context "Inbox/cadence ticks were spawning fresh sessions per launchd interval." \
|
|
17
|
+
* [--alternatives "Per-tick spawn (status quo); CronCreate-based scheduler"]
|
|
18
|
+
* [--stakeholders "ravi, mehran"]
|
|
19
|
+
* [--status active]
|
|
20
|
+
* [--decision-maker "Ravi Patel"]
|
|
21
|
+
*
|
|
22
|
+
* Reads agent.json for the default decision-maker.
|
|
23
|
+
*/
|
|
24
|
+
|
|
25
|
+
import { existsSync, mkdirSync, readFileSync, readdirSync, writeFileSync } from "node:fs";
|
|
26
|
+
import { join, resolve, dirname } from "node:path";
|
|
27
|
+
import { fileURLToPath } from "node:url";
|
|
28
|
+
|
|
29
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
// Agent root: explicit env override first, otherwise two levels above this script.
const AGENT_DIR = process.env.AGENT_ROOT || process.env.AGENT_DIR || resolve(__dirname, "..", "..");

/** Write a tagged message to stderr and terminate with exit status 1. */
function fail(msg) { process.stderr.write(`[capture-decision] ${msg}\n`); process.exit(1); }

// Minimal `--key value` parser. A flag that is followed by another flag (or
// by nothing) is treated as a boolean and stored as the string "true".
// Bare positional arguments are ignored.
const args = process.argv.slice(2);
const flags = {};
for (let i = 0; i < args.length; i++) {
  const a = args[i];
  if (!a.startsWith("--")) continue;
  const key = a.slice(2);
  const val = args[i + 1] && !args[i + 1].startsWith("--") ? args[++i] : "true";
  flags[key] = val;
}

if (!flags.title || !flags.decision) fail("--title and --decision are required (see --help in source).");

// Explicit --decision-maker wins; otherwise fall back to the agent's name
// from config/agent.json, and finally to a generic label when that file is
// missing or unparsable.
const decisionMaker = flags["decision-maker"] || (() => {
  try {
    const a = JSON.parse(readFileSync(join(AGENT_DIR, "config/agent.json"), "utf-8"));
    return a.fullName || a.firstName || "the agent";
  } catch { return "the agent"; }
})();

const date = new Date().toISOString().slice(0, 10);
const decDir = join(AGENT_DIR, "knowledge/decisions");
mkdirSync(decDir, { recursive: true });

// Find next NNN for today: one past the highest sequence number already
// present among today's DEC-YYYY-MM-DD-NNN files.
const todayPrefix = `DEC-${date}-`;
let n = 1;
for (const name of readdirSync(decDir)) {
  if (!name.startsWith(todayPrefix)) continue;
  const tail = name.replace(/^DEC-\d{4}-\d{2}-\d{2}-/, "").replace(/\.ya?ml$/, "");
  const m = Number.parseInt(tail, 10);
  if (!Number.isNaN(m) && m >= n) n = m + 1;
}
const id = `${todayPrefix}${String(n).padStart(3, "0")}`;
const filename = `${id}.yaml`;

// Escape a value for use INSIDE a YAML double-quoted scalar. Backslash must
// be escaped as well as the quote (it is the escape character there), and
// raw line breaks / tabs are written as escapes so a multi-line argument
// cannot break out of the scalar.
const YAML_ESCAPES = { "\\": "\\\\", '"': '\\"', "\n": "\\n", "\r": "\\r", "\t": "\\t" };
const yamlEsc = (s) => String(s ?? "").replace(/[\\"\n\r\t]/g, (ch) => YAML_ESCAPES[ch]);

// Emit a scalar unquoted when it is an unambiguous plain token (so common
// values such as `active` or `2026-01-31` keep their historical form), and
// fall back to a double-quoted scalar when it contains YAML-significant
// characters.
const yamlScalar = (s) => {
  const text = String(s ?? "");
  const plainSafe = /^[A-Za-z0-9_][^\r\n\t"'\\#\[\]{},]*$/.test(text) && !/:(\s|$)/.test(text);
  return plainSafe ? text : `"${yamlEsc(text)}"`;
};

const status = yamlScalar(flags.status || "active");

const lines = [
  `id: ${id}`,
  `date: ${date}`,
  `title: "${yamlEsc(flags.title)}"`,
  `domain: "${yamlEsc(flags.domain || "operational")}"`,
  `decision_maker: "${yamlEsc(decisionMaker)}"`,
  `decision_text: "${yamlEsc(flags.decision)}"`,
  `context: "${yamlEsc(flags.context || "")}"`,
  `rationale: "${yamlEsc(flags.rationale || "")}"`,
  `status: ${status}`,
];
if (flags.alternatives) {
  // Alternatives are ';'-separated because the items themselves often contain commas.
  lines.push("alternatives:");
  for (const alt of String(flags.alternatives).split(";").map((part) => part.trim()).filter(Boolean)) {
    lines.push(`  - "${yamlEsc(alt)}"`);
  }
}
if (flags.stakeholders) {
  lines.push("stakeholders:");
  for (const who of String(flags.stakeholders).split(",").map((part) => part.trim()).filter(Boolean)) {
    lines.push(`  - ${yamlScalar(who)}`);
  }
}
if (flags["expires-at"]) {
  lines.push(`expires_at: ${yamlScalar(flags["expires-at"])}`);
}
lines.push("");

writeFileSync(join(decDir, filename), lines.join("\n"));

// Update index. The entry is inserted directly under the `decisions:` key
// when that key can be located (so the newest entry comes first), otherwise
// appended at the end of the file; readers can re-sort.
const indexPath = join(decDir, "index.yaml");
let indexBody = existsSync(indexPath) ? readFileSync(indexPath, "utf-8") : "# Decision Index — auto-maintained\ndecisions: []\n";
if (!indexBody.includes(id)) {
  const indexEntry = [
    `  - id: ${id}`,
    `    date: ${date}`,
    `    title: "${yamlEsc(flags.title)}"`,
    `    status: ${status}`,
  ].join("\n");
  // Replacer FUNCTIONS are used so that `$&`, `$1`, `$'` … inside a
  // user-supplied title are inserted literally instead of being expanded
  // as replacement patterns.
  if (/decisions:\s*\[\s*\]\s*$/.test(indexBody.trim())) {
    indexBody = indexBody.replace(/decisions:\s*\[\s*\]/, () => `decisions:\n${indexEntry}`);
  } else if (/decisions:\s*$/m.test(indexBody)) {
    indexBody = indexBody.replace(/decisions:\s*$/m, () => `decisions:\n${indexEntry}`);
  } else {
    indexBody = `${indexBody.trimEnd()}\n${indexEntry}\n`;
  }
  writeFileSync(indexPath, indexBody);
}

process.stdout.write(JSON.stringify({ ok: true, id, file: join(decDir, filename) }, null, 2) + "\n");
|