@adaptic/maestro 1.9.0 → 1.9.1

This diff shows the contents of publicly released package versions from a supported registry. It is provided for informational purposes only and reflects the changes between versions exactly as they appear in the public registry.
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@adaptic/maestro",
- "version": "1.9.0",
+ "version": "1.9.1",
  "description": "Maestro — Autonomous AI agent operating system. Deploy AI employees on dedicated Mac minis.",
  "type": "module",
  "bin": {
@@ -43,7 +43,7 @@
  * logger optional fn({ ts, level, …rest }) → void for tests.
  */

- import { existsSync, readFileSync, writeFileSync } from "node:fs";
+ import { existsSync, readFileSync, writeFileSync, mkdirSync, appendFileSync, openSync, closeSync, statSync, unlinkSync } from "node:fs";
  import { join } from "node:path";
  import { spawn } from "node:child_process";
  import { homedir } from "node:os";
@@ -75,6 +75,17 @@ const DEFAULT_SPAWN_TIMEOUT_MS = 30 * 60_000;
  // preferable to thrashing Claude / hitting usage limits.
  const MAX_CONCURRENT_SUB_SESSIONS = 1;

+ // Retry policy. Most cadence failures are systemic (broken prompt, bad
+ // auth, transient API errors) — 5 retries doesn't help, it just amplifies
+ // the burn. 2 retries with exponential back-off is the right balance.
+ const DEFAULT_MAX_ATTEMPTS = 2;
+ const BACKOFF_SCHEDULE_MS = [0, 30_000, 120_000]; // 1st retry +30s, 2nd retry +2m
+
+ // Circuit breaker — when 3 same-cadence failures land in a row, stop
+ // spawning that cadence for 30 minutes. Prevents launchd-rate runaway.
+ const CIRCUIT_OPEN_THRESHOLD = 3;
+ const CIRCUIT_OPEN_DURATION_MS = 30 * 60_000;
+
  // ---------------------------------------------------------------------------
  // Helpers
  // ---------------------------------------------------------------------------
@@ -129,8 +140,13 @@ function resolveClaudeBin() {

  /**
  * Spawn a sub-session running the cadence's trigger prompt and resolve
- * with { exit_code, durationMs }. Reads the prompt at call time so the
- * latest version (possibly upgraded between ticks) is always used.
+ * with { exit_code, durationMs, stderr_tail }. Reads the prompt at call
+ * time so the latest version (possibly upgraded between ticks) is always
+ * used.
+ *
+ * Robustness: stdout + stderr are tee'd to logs/cadence-bus/subsessions/
+ * so non-zero exits remain diagnosable after the fact. The last ~4 KB of
+ * stderr is also captured in-memory and surfaced on the failure event.
  */
  function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {
  return new Promise((resolveOut) => {
@@ -169,12 +185,27 @@ function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {
  };
  const started = Date.now();

- log({ level: "info", stage: "subsession_spawn", cadence, bin });
+ // Per-run log file. Pattern is short enough to be tail-friendly.
+ const logsDir = join(agentRoot, "logs", "cadence-bus", "subsessions");
+ mkdirSync(logsDir, { recursive: true });
+ const date = new Date().toISOString().slice(0, 10);
+ const stamp = new Date().toISOString().replace(/[:.]/g, "-");
+ const stdoutPath = join(logsDir, `${date}-${cadence}-${stamp}.stdout.log`);
+ const stderrPath = join(logsDir, `${date}-${cadence}-${stamp}.stderr.log`);
+ const stdoutFd = openSync(stdoutPath, "a");
+ const stderrFd = openSync(stderrPath, "a");
+
+ log({ level: "info", stage: "subsession_spawn", cadence, bin, stdout: stdoutPath, stderr: stderrPath });

  let child;
  try {
- child = spawn(bin, args, { cwd: agentRoot, env, stdio: "ignore" });
+ // stdio:
+ // 0 ignore (claude --print reads prompt from argv, not stdin)
+ // 1 → file (capture stdout for later inspection)
+ // 2 → file (capture stderr — critical for diagnosing exit-1)
+ child = spawn(bin, args, { cwd: agentRoot, env, stdio: ["ignore", stdoutFd, stderrFd] });
  } catch (err) {
+ try { closeSync(stdoutFd); closeSync(stderrFd); } catch { /* */ }
  resolveOut({ ok: false, exit_code: -4, error: `spawn failed: ${err.message}` });
  return;
  }
@@ -188,8 +219,22 @@ function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {

  child.on("exit", (code, signal) => {
  clearTimeout(timer);
+ try { closeSync(stdoutFd); closeSync(stderrFd); } catch { /* */ }
  const durationMs = Date.now() - started;
  const exit_code = typeof code === "number" ? code : (signal ? -1 : -5);
+
+ // Pull tail of stderr (and stdout if stderr empty) for the failure
+ // surface. Best-effort; we never block on file size.
+ let stderrTail = "";
+ try {
+ const body = readFileSync(stderrPath, "utf-8");
+ stderrTail = body.slice(-4096);
+ if (!stderrTail.trim()) {
+ const so = readFileSync(stdoutPath, "utf-8");
+ stderrTail = so.slice(-4096);
+ }
+ } catch { /* file may not exist if spawn ENOENT before fd-redirect */ }
+
  // Record cost-ledger row. Token counts are 0 until we parse the
  // session's JSON output; for now exit-code + duration are enough
  // to spot pathological retry loops.
@@ -208,16 +253,29 @@ function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {
  ], { stdio: "ignore", env: { ...env, AGENT_ROOT: agentRoot } }).unref();
  }
  } catch { /* cost tracking is best-effort */ }
+
+ // Clean up empty log files so the directory doesn't accumulate
+ // hundreds of zero-byte successes.
+ try {
+
+ if (statSync(stdoutPath).size === 0) unlinkSync(stdoutPath);
+ if (statSync(stderrPath).size === 0) unlinkSync(stderrPath);
+ } catch { /* */ }
+
  resolveOut({
  ok: exit_code === 0,
  exit_code,
  signal: signal || null,
  duration_ms: durationMs,
+ stderr_tail: stderrTail || null,
+ stdout_path: stdoutPath,
+ stderr_path: stderrPath,
  });
  });

  child.on("error", (err) => {
  clearTimeout(timer);
+ try { closeSync(stdoutFd); closeSync(stderrFd); } catch { /* */ }
  const durationMs = Date.now() - started;
  resolveOut({ ok: false, exit_code: -6, error: err.message, duration_ms: durationMs });
  });
@@ -242,6 +300,11 @@ export function startConsumer(opts = {}) {
  const maxSpawnMs = opts.maxSpawnMs ?? DEFAULT_SPAWN_TIMEOUT_MS;
  const spawnSession = opts.spawnSession || realSpawnSession;
  const userLogger = opts.logger;
+ // Test / tuning hooks for the reliability layer.
+ const backoffSchedule = opts.backoffSchedule || BACKOFF_SCHEDULE_MS;
+ const circuitThreshold = opts.circuitThreshold ?? CIRCUIT_OPEN_THRESHOLD;
+ const circuitDurationMs = opts.circuitDurationMs ?? CIRCUIT_OPEN_DURATION_MS;
+ const maxAttempts = opts.maxAttempts ?? DEFAULT_MAX_ATTEMPTS;

  const stats = {
  started_at: new Date().toISOString(),
@@ -249,6 +312,8 @@ export function startConsumer(opts = {}) {
  inline: 0,
  escalated: 0,
  skipped_emergency_stop: 0,
+ skipped_circuit_open: 0,
+ skipped_backoff: 0,
  dlq: 0,
  retries: 0,
  spawn_failures: 0,
@@ -261,6 +326,75 @@ export function startConsumer(opts = {}) {
  let timers = [];
  let activeSubSessions = 0;

+ // Per-cadence reliability state. Tracks consecutive failure count and
+ // the earliest moment we'll allow another spawn for that cadence.
+ // Persists nothing — circuit state is in-memory only. On daemon restart
+ // we get a fresh slate; that's intentional (operators expect a restart
+ // to mean "try again now").
+ const cadenceState = new Map(); // cadence → { failures, openUntil, nextAllowedAt }
+
+ function getCadenceState(cadence) {
+ let s = cadenceState.get(cadence);
+ if (!s) { s = { failures: 0, openUntil: 0, nextAllowedAt: 0 }; cadenceState.set(cadence, s); }
+ return s;
+ }
+
+ function recordSubsessionSuccess(cadence) {
+ const s = getCadenceState(cadence);
+ s.failures = 0;
+ s.openUntil = 0;
+ s.nextAllowedAt = 0;
+ }
+
+ function recordSubsessionFailure(cadence) {
+ const s = getCadenceState(cadence);
+ s.failures += 1;
+ // Exponential back-off honouring the (test-overridable) schedule.
+ const idx = Math.min(s.failures, backoffSchedule.length - 1);
+ s.nextAllowedAt = Date.now() + backoffSchedule[idx];
+ if (s.failures >= circuitThreshold) {
+ s.openUntil = Date.now() + circuitDurationMs;
+ log({ level: "error", stage: "circuit_opened", cadence, failures: s.failures, open_until: new Date(s.openUntil).toISOString() });
+ writeCircuitFile();
+ }
+ }
+
+ function writeCircuitFile() {
+ // Persist the open-circuit snapshot so doctor + the operator can see
+ // which cadences are currently held back without scraping logs.
+ const open = {};
+ for (const [cad, s] of cadenceState.entries()) {
+ if (s.openUntil > Date.now()) {
+ open[cad] = { failures: s.failures, open_until: new Date(s.openUntil).toISOString() };
+ }
+ }
+ const path = join(agentRoot, "state/cadence-bus/circuit-open.json");
+ try {
+ if (Object.keys(open).length === 0) {
+ // Remove the file when nothing is open.
+
+ try { unlinkSync(path); } catch { /* */ }
+ } else {
+ writeFileSync(path, JSON.stringify({ generated: new Date().toISOString(), open }, null, 2) + "\n");
+ }
+ } catch { /* best-effort */ }
+ }
+
+ function isCadenceAllowed(cadence) {
+ const s = getCadenceState(cadence);
+ const now = Date.now();
+ if (s.openUntil > now) return { allowed: false, reason: "circuit-open", retry_at: s.openUntil };
+ if (s.nextAllowedAt > now) return { allowed: false, reason: "backoff", retry_at: s.nextAllowedAt };
+ // Circuit closes automatically when openUntil passes.
+ if (s.openUntil && s.openUntil <= now) {
+ s.openUntil = 0;
+ s.failures = 0;
+ log({ level: "info", stage: "circuit_closed", cadence });
+ writeCircuitFile();
+ }
+ return { allowed: true };
+ }
+
  function log(entry) {
  const enriched = { ts: new Date().toISOString(), ...entry };
  logBusEvent(agentRoot, enriched);
@@ -280,6 +414,32 @@ export function startConsumer(opts = {}) {
  }

  async function escalate(event) {
+ // Circuit-breaker / back-off gate. If this cadence is currently held
+ // back, requeue without spawning. The event keeps its attempt count
+ // because the failure was upstream (not a per-event problem).
+ const gate = isCadenceAllowed(event.cadence);
+ if (!gate.allowed) {
+ log({
+ level: "warn",
+ stage: gate.reason === "circuit-open" ? "skipped_circuit_open" : "skipped_backoff",
+ id: event.id,
+ cadence: event.cadence,
+ retry_at: new Date(gate.retry_at).toISOString(),
+ });
+ if (gate.reason === "circuit-open") stats.skipped_circuit_open += 1;
+ else stats.skipped_backoff += 1;
+ // Put the event back in inbox WITHOUT bumping attempts so it doesn't
+ // burn its retry budget while the circuit is open.
+ const paths2 = getBusPaths(agentRoot);
+ try {
+ const event2 = { ...event, attempts: Math.max(0, (event.attempts || 1) - 1) };
+ writeFileSync(join(paths2.inbox, `${event.id}.json`), JSON.stringify(event2, null, 2) + "\n");
+
+ try { unlinkSync(join(paths2.claimed, `${event.id}.json`)); } catch { /* */ }
+ } catch { /* best-effort */ }
+ return { ok: false, decision: gate.reason };
+ }
+
  if (activeSubSessions >= MAX_CONCURRENT_SUB_SESSIONS) {
  // Re-queue and try again next tick. Single-owner cadence consumer
  // means this can only happen when a prior tick is still running —
@@ -291,7 +451,15 @@ export function startConsumer(opts = {}) {
  cadence: event.cadence,
  active_subsessions: activeSubSessions,
  });
- failTick(agentRoot, event.id, "deferred:concurrent-spawn", { maxAttempts: 10 });
+ // Re-queue without burning the retry budget — concurrent-spawn isn't
+ // a per-event failure.
+ const paths2 = getBusPaths(agentRoot);
+ try {
+ const event2 = { ...event, attempts: Math.max(0, (event.attempts || 1) - 1) };
+ writeFileSync(join(paths2.inbox, `${event.id}.json`), JSON.stringify(event2, null, 2) + "\n");
+
+ try { unlinkSync(join(paths2.claimed, `${event.id}.json`)); } catch { /* */ }
+ } catch { /* best-effort */ }
  stats.retries += 1;
  return { ok: false, decision: "deferred" };
  }
@@ -334,14 +502,31 @@ export function startConsumer(opts = {}) {
  prompt: promptPath,
  exit_code: result.exit_code,
  duration_ms: result.duration_ms,
+ stdout_path: result.stdout_path || null,
+ stderr_path: result.stderr_path || null,
  });
+ recordSubsessionSuccess(event.cadence);
  stats.escalated += 1;
  stats.last_decision = "escalated";
  return { ok: true, decision: "escalated", exit_code: result.exit_code };
  }
- log({ level: "error", stage: "subsession_failed", id: event.id, cadence: event.cadence, exit_code: result.exit_code, error: result.error || null });
+ // Failure path: log + cap retries low. The exact stderr tail comes
+ // from the spawn helper so we never DLQ "blind" again.
+ const stderrTail = (result.stderr_tail || "").trim().split("\n").slice(-3).join(" | ");
+ log({
+ level: "error",
+ stage: "subsession_failed",
+ id: event.id,
+ cadence: event.cadence,
+ exit_code: result.exit_code,
+ duration_ms: result.duration_ms,
+ error: result.error || stderrTail || `exit ${result.exit_code}`,
+ stderr_path: result.stderr_path || null,
+ });
  stats.spawn_failures += 1;
- const outcome = failTick(agentRoot, event.id, result.error || `exit ${result.exit_code}`);
+ recordSubsessionFailure(event.cadence);
+ const reason = result.error || (stderrTail ? `exit ${result.exit_code}: ${stderrTail}` : `exit ${result.exit_code}`);
+ const outcome = failTick(agentRoot, event.id, reason, { maxAttempts });
  if (outcome?.destination === "dlq") stats.dlq += 1;
  else stats.retries += 1;
  return { ok: false, decision: outcome?.destination || "failed" };
@@ -428,19 +613,38 @@ export function startConsumer(opts = {}) {
  recoverStaleClaims(agentRoot);

  let processed = 0;
- // Drain as much as the consumer can in one tick, but yield to the
- // event loop between events so heartbeats and stop signals fire.
+ let escalatedThisTick = 0;
+ // Drain inline events as much as the consumer can in one tick; cap
+ // sub-session escalations at 1 per tick so a fast-failing cadence
+ // can't burn a whole minute's worth of retries inside a single poll.
+ // The next poll (DEFAULT_POLL_MS later) will pick up where we left off.
  while (!stopping) {
  const claim = claimNextTick(agentRoot);
  if (!claim) break;
  const event = claim.event;
  activeTick = event.id;
+ let didEscalate = false;
  try {
+ const def = getCadenceDef(event.cadence);
+ const willEscalate = !def || (def.mode !== "inline" && (def.mode !== "guarded" || true));
+ // Roughly: if it's not a registry-inline cadence, we MAY escalate.
+ // We don't yet know if the guard will say inline; processEvent
+ // will tell us via stats. Use the escalated stats delta as the
+ // signal that an actual sub-session ran this iteration.
+ const before = stats.escalated + stats.spawn_failures + stats.skipped_circuit_open + stats.skipped_backoff;
  await processEvent(event);
+ const after = stats.escalated + stats.spawn_failures + stats.skipped_circuit_open + stats.skipped_backoff;
+ if (after > before) didEscalate = true;
+ // Silence unused var warning.
+ void willEscalate;
  } finally {
  activeTick = null;
  }
  processed += 1;
+ if (didEscalate) escalatedThisTick += 1;
+ // Hard cap: at most ONE sub-session spawn per tick. Inline ticks
+ // keep draining freely (they're cheap).
+ if (escalatedThisTick >= 1) break;
  if (processed >= 16) break; // soft batch cap
  }
  return { processed };
@@ -210,9 +210,16 @@ test("unknown cadence with no prompt file DLQ's immediately", async () => {
  test("spawn failure retries within the budget, then DLQs", async () => {
  const root = await makeAgentRoot();
  plantPrompt(root, "weekly-strategic-memo");
+ // Disable back-off + raise circuit threshold so the test exercises the
+ // retry-then-DLQ path without waiting for back-off windows. The
+ // real defaults (30s/2m back-off, 3-failure circuit) are exercised by
+ // dedicated tests below.
  const consumer = startConsumer({
  agentRoot: root,
  pollMs: 25,
+ backoffSchedule: [0, 0, 0],
+ circuitThreshold: 999,
+ maxAttempts: 2,
  spawnSession: async () => ({ ok: false, exit_code: 1, error: "always-fail", duration_ms: 1 }),
  });
  try {
@@ -226,6 +233,68 @@ test("spawn failure retries within the budget, then DLQs", async () => {
  }
  });

+ test("circuit breaker opens after consecutive failures and blocks further spawns", async () => {
+ const root = await makeAgentRoot();
+ plantPrompt(root, "weekly-strategic-memo");
+ let spawnCount = 0;
+ const consumer = startConsumer({
+ agentRoot: root,
+ pollMs: 20,
+ backoffSchedule: [0, 0, 0],
+ circuitThreshold: 2,
+ circuitDurationMs: 60_000, // 1 min — long enough for the assertion window
+ maxAttempts: 1, // each event DLQs on first failure so we don't conflate retry-counts
+ spawnSession: async () => { spawnCount++; return { ok: false, exit_code: 1, error: "fail", duration_ms: 1 }; },
+ });
+ try {
+ // Enqueue 5 events; circuit should open after 2 failures, blocking the rest.
+ for (let i = 0; i < 5; i++) {
+ enqueueTick({ cadence: "weekly-strategic-memo", source: "launchd", agentRoot: root });
+ }
+ const opened = await waitFor(() => consumer.getStats().skipped_circuit_open >= 1, { timeoutMs: 10_000 });
+ assert.ok(opened, `circuit should open; stats=${JSON.stringify(consumer.getStats())}`);
+ // Spawn count must NOT keep climbing once the circuit is open.
+ const spawnsAtOpen = spawnCount;
+ await new Promise((r) => setTimeout(r, 500));
+ assert.equal(spawnCount, spawnsAtOpen, `spawns must stop once circuit opens (was ${spawnsAtOpen}, now ${spawnCount})`);
+ } finally {
+ await consumer.stop();
+ await rmRoot(root);
+ }
+ });
+
+ test("back-off skips re-spawning until the cooldown elapses", async () => {
+ const root = await makeAgentRoot();
+ plantPrompt(root, "weekly-strategic-memo");
+ let spawnCount = 0;
+ const consumer = startConsumer({
+ agentRoot: root,
+ pollMs: 20,
+ backoffSchedule: [0, 300, 300], // 300ms cooldown after each failure
+ circuitThreshold: 999,
+ maxAttempts: 1,
+ spawnSession: async () => { spawnCount++; return { ok: false, exit_code: 1, error: "fail", duration_ms: 1 }; },
+ });
+ try {
+ // Enqueue 2 events back-to-back. The 1st triggers a spawn (fails). The
+ // 2nd should be held back by the 300ms back-off window.
+ enqueueTick({ cadence: "weekly-strategic-memo", source: "launchd", agentRoot: root });
+ enqueueTick({ cadence: "weekly-strategic-memo", source: "launchd", agentRoot: root });
+ await waitFor(() => spawnCount >= 1, { timeoutMs: 5_000 });
+ const spawnsBeforeWait = spawnCount;
+ // During the back-off window no new spawn should fire.
+ await new Promise((r) => setTimeout(r, 150));
+ assert.ok(spawnCount === spawnsBeforeWait, `spawns must wait for back-off (was ${spawnsBeforeWait}, now ${spawnCount})`);
+ assert.ok(consumer.getStats().skipped_backoff >= 1, "skipped_backoff should be recorded");
+ // After the window passes, the next event should be processed.
+ await waitFor(() => spawnCount > spawnsBeforeWait, { timeoutMs: 5_000 });
+ assert.ok(spawnCount > spawnsBeforeWait, "spawning resumes after back-off");
+ } finally {
+ await consumer.stop();
+ await rmRoot(root);
+ }
+ });
+
  // ---------------------------------------------------------------------------
  // Emergency stop
  // ---------------------------------------------------------------------------
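
For reference, a minimal usage sketch of the reliability knobs this release adds to startConsumer. The import specifier, the agentRoot value, and the stub spawnSession are illustrative assumptions; the option names, their defaults, and the getStats()/stop() calls come from the diff above.

// Hypothetical usage sketch: the module path below is an assumption; the
// options mirror what startConsumer reads in 1.9.1 (see diff above).
import { startConsumer } from "@adaptic/maestro/cadence-consumer"; // assumed path

const consumer = startConsumer({
  agentRoot: "/opt/agents/demo",         // assumed root, for illustration only
  pollMs: 1_000,
  backoffSchedule: [0, 30_000, 120_000], // matches the BACKOFF_SCHEDULE_MS default
  circuitThreshold: 3,                   // open the circuit after 3 straight failures
  circuitDurationMs: 30 * 60_000,        // hold the cadence back for 30 minutes
  maxAttempts: 2,                        // matches DEFAULT_MAX_ATTEMPTS
  // Tests inject a stub spawnSession; omit it in production to use the
  // real Claude sub-session spawner.
  spawnSession: async () => ({ ok: false, exit_code: 1, error: "stub", duration_ms: 1 }),
});

// skipped_circuit_open and skipped_backoff are the stats counters added in 1.9.1.
console.log(consumer.getStats());
await consumer.stop();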