@adaptic/maestro 1.7.3 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. package/.claude/commands/init-maestro.md +15 -2
  2. package/.gitignore +7 -0
  3. package/README.md +62 -11
  4. package/bin/maestro.mjs +338 -2
  5. package/bin/maestro.test.mjs +299 -0
  6. package/docs/guides/poller-daemon-setup.md +21 -8
  7. package/docs/runbooks/perpetual-operations.md +19 -15
  8. package/docs/runbooks/recovery-and-failover.md +42 -0
  9. package/lib/cadence-bus.mjs +625 -0
  10. package/lib/cadence-bus.test.mjs +354 -0
  11. package/package.json +6 -1
  12. package/scaffold/CLAUDE.md +11 -7
  13. package/scripts/cadence/cadence-status.mjs +36 -0
  14. package/scripts/cadence/enqueue-cadence-tick.mjs +158 -0
  15. package/scripts/cadence/enqueue-cadence-tick.test.mjs +154 -0
  16. package/scripts/cadence/launchd-cadence-wrapper.sh +85 -0
  17. package/scripts/daemon/cadence-consumer.mjs +439 -0
  18. package/scripts/daemon/cadence-consumer.test.mjs +397 -0
  19. package/scripts/daemon/cadence-handlers.mjs +263 -0
  20. package/scripts/daemon/maestro-daemon.mjs +20 -0
  21. package/scripts/local-triggers/generate-plists.sh +33 -12
  22. package/scripts/local-triggers/generate-plists.test.mjs +185 -0
  23. package/scripts/local-triggers/plists/.gitkeep +0 -0
  24. package/scripts/local-triggers/run-trigger.sh +22 -3
  25. package/scripts/local-triggers/plists/ai.adaptic.sophie-backlog-executor.plist +0 -21
  26. package/scripts/local-triggers/plists/ai.adaptic.sophie-daemon.plist +0 -32
  27. package/scripts/local-triggers/plists/ai.adaptic.sophie-inbox-processor.plist +0 -21
  28. package/scripts/local-triggers/plists/ai.adaptic.sophie-meeting-action-capture.plist +0 -21
  29. package/scripts/local-triggers/plists/ai.adaptic.sophie-meeting-prep.plist +0 -21
  30. package/scripts/local-triggers/plists/ai.adaptic.sophie-midday-sweep.plist +0 -26
  31. package/scripts/local-triggers/plists/ai.adaptic.sophie-quarterly-self-assessment.plist +0 -62
  32. package/scripts/local-triggers/plists/ai.adaptic.sophie-weekly-engineering-health.plist +0 -28
  33. package/scripts/local-triggers/plists/ai.adaptic.sophie-weekly-execution.plist +0 -28
  34. package/scripts/local-triggers/plists/ai.adaptic.sophie-weekly-hiring.plist +0 -28
  35. package/scripts/local-triggers/plists/ai.adaptic.sophie-weekly-priorities.plist +0 -28
  36. package/scripts/local-triggers/plists/ai.adaptic.sophie-weekly-strategic-memo.plist +0 -28
@@ -0,0 +1,625 @@
1
+ /**
2
+ * Maestro — Cadence Bus
3
+ *
4
+ * Local, file-backed event queue that decouples scheduled cadence ticks
5
+ * (launchd, manual, daemon, init-maestro, upgrade) from the persistent main
6
+ * Maestro session that actually services them.
7
+ *
8
+ * Why this exists
9
+ * ---------------
10
+ * Before the bus, every scheduled cadence tick (inbox-processor every 5m,
11
+ * backlog-executor every 10m, the daily/weekly/quarterly ones, …) was wired
12
+ * to launch a fresh `claude --print` session via run-trigger.sh. That meant
13
+ * dozens of full Claude Code spawns per day, each paying full auth/context/
14
+ * token overhead even when the tick had nothing to do. It also made
15
+ * emergency-stop, deduplication, rate-limiting, and quota gating ad-hoc.
16
+ *
17
+ * After the bus, launchd just enqueues a tiny JSON event onto a directory.
18
+ * The persistent daemon (maestro-daemon.mjs) consumes events in-process,
19
+ * handles lightweight scan/classify/route work inline, and only spawns a
20
+ * sub-session when the work genuinely warrants it.
21
+ *
22
+ * Storage layout (all paths relative to AGENT_ROOT)
23
+ * --------------------------------------------------
24
+ * state/cadence-bus/
25
+ * inbox/ — newly enqueued events (one JSON per event)
26
+ * claimed/ — events currently being processed
27
+ * processed/<YYYY-MM-DD>/ — successful completions, archived by date
28
+ * failed/ — last-known-bad event payloads (transient errors)
29
+ * dlq/ — events that exceeded retry budget (terminal)
30
+ * queue.jsonl — append-only fallback log (every enqueue lands
31
+ * here too, so no event is lost even if inbox/
32
+ * is unavailable on a given enqueue)
33
+ * health.json — heartbeat written by the consumer
34
+ *
35
+ * Concurrency model
36
+ * -----------------
37
+ * Atomic enqueue: write to a temp file in the same directory, then
38
+ * rename(tmp, final). rename(2) is atomic on macOS APFS
39
+ * when source and destination live on the same volume.
40
+ *
41
+ * Atomic claim: rename(inbox/<id>.json → claimed/<id>.json). If two
42
+ * consumers race for the same file, exactly one rename
43
+ * succeeds; the loser gets ENOENT and skips the event.
44
+ * No locks, no stale-lock pathology.
45
+ *
46
+ * Stale recovery: on consumer startup (and periodically while running),
47
+ * any file in claimed/ older than STALE_CLAIM_MS is
48
+ * either returned to inbox/ (under the retry budget) or
49
+ * moved to dlq/. This handles crashes mid-processing.
50
+ *
51
+ * Event payload schema
52
+ * --------------------
53
+ * {
54
+ * "id": "evt-2026-05-12T15:30:00.123Z-<random>",
55
+ * "type": "cadence_tick" | "manual_tick" | "internal",
56
+ * "source": "launchd" | "manual" | "daemon" | "init-maestro" | "upgrade",
57
+ * "ts": "2026-05-12T15:30:00.123Z",
58
+ * "cadence": "inbox-processor" | "backlog-executor" | "weekly-strategic-memo" | …,
59
+ * "workflow": "continuous/inbox-processor" (optional),
60
+ * "correlation_id": "<external id, optional>",
61
+ * "priority": "high" | "normal" | "low" (default "normal"),
62
+ * "metadata": { … free-form, kept for auditing },
63
+ * "attempts": 0 (auto-managed by consumer)
64
+ * }
65
+ *
66
+ * The bus does not interpret `cadence` itself — it only routes by it.
67
+ * Handler selection and execution live in scripts/daemon/cadence-handlers.mjs
68
+ * and scripts/daemon/cadence-consumer.mjs.
69
+ */
70
+
71
+ import {
72
+ appendFileSync,
73
+ closeSync,
74
+ existsSync,
75
+ mkdirSync,
76
+ openSync,
77
+ readFileSync,
78
+ readdirSync,
79
+ renameSync,
80
+ statSync,
81
+ unlinkSync,
82
+ writeFileSync,
83
+ } from "node:fs";
84
+ import { join, resolve, dirname } from "node:path";
85
+ import { randomBytes } from "node:crypto";
86
+
87
// ---------------------------------------------------------------------------
// Constants (override-friendly via env for tests)
// ---------------------------------------------------------------------------

// Bus storage root and structured-log directory, both relative to AGENT_ROOT.
export const BUS_RELATIVE = "state/cadence-bus";
export const LOGS_RELATIVE = "logs/cadence-bus";

// Bus schema version. Bump this (and write a migration) on any backwards-
// incompatible change to the on-disk layout. Doctor reads this.
export const BUS_VERSION = "1";

// Stale claim threshold — claims older than this are recovered to inbox or
// the dlq. Assumes the longest live cadence handler finishes well under
// 30 minutes; tune via env for long-running migrations.
const DEFAULT_STALE_CLAIM_MS = 30 * 60 * 1000;

// Retry budget before an event is parked in dlq. Cadence ticks are mostly
// non-essential; a single stuck tick must not clog the bus.
const DEFAULT_MAX_ATTEMPTS = 5;
106
+
107
+ // ---------------------------------------------------------------------------
108
+ // Path resolution
109
+ // ---------------------------------------------------------------------------
110
+
111
/**
 * Resolve the agent root directory as an absolute path.
 *
 * Precedence: explicit argument, then AGENT_ROOT env, then AGENT_DIR env,
 * then process.cwd(). Empty strings are treated as unset.
 *
 * @param {string} [agentRoot] Explicit override (wins when truthy).
 * @returns {string} Absolute path to the agent root.
 */
export function resolveAgentRoot(agentRoot) {
  const candidates = [agentRoot, process.env.AGENT_ROOT, process.env.AGENT_DIR];
  const chosen = candidates.find((value) => Boolean(value)) ?? process.cwd();
  return resolve(chosen);
}
123
+
124
/**
 * Derive every cadence-bus path from an agent root.
 *
 * @param {string} [agentRoot] Optional root override (see resolveAgentRoot).
 * @returns {object} Absolute paths for each bus directory/file, plus the
 *   resolved agentRoot itself and the repo-level emergency-stop marker.
 */
export function getBusPaths(agentRoot) {
  const root = resolveAgentRoot(agentRoot);
  const base = join(root, BUS_RELATIVE);
  // All queue stages live directly under the bus base directory.
  const under = (name) => join(base, name);
  return {
    agentRoot: root,
    base,
    inbox: under("inbox"),
    claimed: under("claimed"),
    processed: under("processed"),
    failed: under("failed"),
    dlq: under("dlq"),
    queueJsonl: under("queue.jsonl"),
    health: under("health.json"),
    version: under("VERSION"),
    logsDir: join(root, LOGS_RELATIVE),
    emergencyStop: join(root, ".emergency-stop"),
  };
}
145
+
146
/**
 * Idempotently create the cadence-bus directory tree.
 *
 * Safe to call on every enqueue/consume; a no-op once the tree exists. Also
 * drops a VERSION marker on first creation so doctor/upgrade can detect a
 * freshly-created bus and reason about schema migrations later.
 *
 * @param {string} [agentRoot]
 * @returns {object} The bus paths (same shape as getBusPaths).
 */
export function ensureBusDirs(agentRoot) {
  const paths = getBusPaths(agentRoot);
  const required = [paths.inbox, paths.claimed, paths.processed, paths.failed, paths.dlq, paths.logsDir];
  required.forEach((dir) => mkdirSync(dir, { recursive: true }));
  if (!existsSync(paths.version)) {
    writeFileSync(paths.version, `${BUS_VERSION}\n`);
  }
  return paths;
}
162
+
163
+ // ---------------------------------------------------------------------------
164
+ // Event id
165
+ // ---------------------------------------------------------------------------
166
+
167
/**
 * Generate a unique, sortable event id:
 *   evt-2026-05-12T15:30:00.123Z-<8 hex chars>
 *
 * The ISO-8601 prefix makes inbox/ listings naturally time-ordered; the
 * random suffix avoids collisions for events enqueued in the same ms.
 *
 * @param {Date} [now] Timestamp to embed (defaults to the current time).
 * @returns {string} The event id.
 */
export function nextEventId(now = new Date()) {
  const suffix = randomBytes(4).toString("hex");
  return ["evt", now.toISOString(), suffix].join("-");
}
178
+
179
+ // ---------------------------------------------------------------------------
180
+ // Logging
181
+ // ---------------------------------------------------------------------------
182
+
183
// Current UTC calendar date as "YYYY-MM-DD" (used for log and archive names).
function todayUtc(now = new Date()) {
  return now.toISOString().split("T")[0];
}
186
+
187
/**
 * Append a structured log line to logs/cadence-bus/<date>.jsonl.
 *
 * Best-effort: the caller never blocks on log failures.
 *
 * @param {string} [agentRoot]
 * @param {object} entry Free-form fields; a `ts` is stamped automatically.
 * @returns {boolean} true on success, false on a caught I/O error.
 */
export function logBusEvent(agentRoot, entry) {
  try {
    const paths = getBusPaths(agentRoot);
    mkdirSync(paths.logsDir, { recursive: true });
    const logFile = join(paths.logsDir, `${todayUtc()}.jsonl`);
    const record = { ts: new Date().toISOString(), ...entry };
    appendFileSync(logFile, `${JSON.stringify(record)}\n`);
    return true;
  } catch {
    return false;
  }
}
204
+
205
+ // ---------------------------------------------------------------------------
206
+ // Atomic write
207
+ // ---------------------------------------------------------------------------
208
+
209
/**
 * Atomically write JSON to `targetPath`.
 *
 * Writes a uniquely-named sibling `.tmp.*` file, then renames it over the
 * target. rename(2) on the same volume is atomic, so readers never observe a
 * partial file. If the rename fails, the orphan tmp file is removed and the
 * original error is rethrown.
 *
 * @param {string} targetPath Destination file; parent dirs are created.
 * @param {object} obj Value to serialize (pretty-printed, trailing newline).
 */
function writeJsonAtomic(targetPath, obj) {
  mkdirSync(dirname(targetPath), { recursive: true });
  // pid + time + random makes the tmp name unique even across processes.
  const unique = [process.pid, Date.now(), randomBytes(2).toString("hex")].join(".");
  const tmp = `${targetPath}.tmp.${unique}`;
  writeFileSync(tmp, `${JSON.stringify(obj, null, 2)}\n`);
  try {
    renameSync(tmp, targetPath);
  } catch (err) {
    try {
      unlinkSync(tmp);
    } catch {
      /* ignore */
    }
    throw err;
  }
}
226
+
227
+ // ---------------------------------------------------------------------------
228
+ // Enqueue
229
+ // ---------------------------------------------------------------------------
230
+
231
/**
 * Append a one-line JSON record to the fallback queue.jsonl.
 *
 * Serves both as an audit log of every enqueue and as a last-resort fallback
 * when writing to inbox/ fails (disk full, permissions, …). Best-effort.
 *
 * @param {object} paths Bus paths (needs `queueJsonl`).
 * @param {object} event Event payload to record.
 * @returns {boolean} true on success, false on a caught I/O error.
 */
function appendFallbackQueue(paths, event) {
  try {
    mkdirSync(dirname(paths.queueJsonl), { recursive: true });
    appendFileSync(paths.queueJsonl, `${JSON.stringify(event)}\n`);
    return true;
  } catch {
    return false;
  }
}
245
+
246
/**
 * Enqueue a cadence event onto the bus.
 *
 * The audit row goes to queue.jsonl first, then the event file is written
 * atomically into inbox/. If the inbox write fails, the event still survives
 * in queue.jsonl and the result reports `fallbackOnly: true`. When the
 * emergency-stop marker exists, nothing lands in inbox/ — the suppressed
 * event is only recorded in queue.jsonl for auditing.
 *
 * @param {object} input
 * @param {string} input.cadence Cadence name (e.g. "inbox-processor"). Required.
 * @param {string} [input.type] Event type, default "cadence_tick".
 * @param {string} [input.source] Originator label, default "manual".
 * @param {string} [input.workflow]
 * @param {string} [input.correlation_id]
 * @param {"high"|"normal"|"low"} [input.priority]
 * @param {object} [input.metadata]
 * @param {string} [input.agentRoot] Override AGENT_ROOT for tests.
 * @returns {{ id: string, path: string|null, fallbackOnly: boolean, skipped?: string }}
 * @throws {TypeError} When input is not an object.
 * @throws {Error} When cadence is missing/blank.
 */
export function enqueueTick(input = {}) {
  if (!input || typeof input !== "object") {
    throw new TypeError("enqueueTick: input must be an object");
  }
  const cadence = String(input.cadence || "").trim();
  if (!cadence) {
    throw new Error("enqueueTick: cadence is required");
  }

  const paths = ensureBusDirs(input.agentRoot);
  const event = buildEvent(input, cadence);

  // Emergency-stop short-circuit. The event is still logged so we can see
  // what would have run, but no file lands in inbox/.
  if (existsSync(paths.emergencyStop)) {
    appendFallbackQueue(paths, { ...event, _suppressed: "emergency-stop" });
    logBusEvent(paths.agentRoot, {
      level: "info",
      stage: "enqueue_suppressed",
      cadence,
      reason: "emergency-stop",
      id: event.id,
    });
    return { id: event.id, path: null, fallbackOnly: true, skipped: "emergency-stop" };
  }

  // Audit row first — if the inbox write fails the event is still
  // recoverable from queue.jsonl.
  appendFallbackQueue(paths, event);

  const target = join(paths.inbox, `${event.id}.json`);
  let landedInInbox = true;
  try {
    writeJsonAtomic(target, event);
  } catch (err) {
    landedInInbox = false;
    logBusEvent(paths.agentRoot, {
      level: "error",
      stage: "enqueue_inbox_failed",
      cadence,
      id: event.id,
      error: err.message,
    });
  }

  logBusEvent(paths.agentRoot, {
    level: "info",
    stage: landedInInbox ? "enqueued" : "enqueued_fallback",
    cadence,
    source: event.source,
    priority: event.priority,
    id: event.id,
  });

  return { id: event.id, path: landedInInbox ? target : null, fallbackOnly: !landedInInbox };
}
318
+
319
/**
 * Normalize raw enqueue input into a full event payload.
 *
 * Fills defaults for every optional field, clamps `priority` to the allowed
 * set, and ensures `metadata` is a plain object and `attempts` a number.
 *
 * @param {object} input Raw caller input (see enqueueTick).
 * @param {string} cadence Validated cadence name.
 * @returns {object} A complete event matching the bus payload schema.
 */
function buildEvent(input, cadence) {
  const priority = ["high", "normal", "low"].includes(input.priority)
    ? input.priority
    : "normal";
  const metadata = input.metadata && typeof input.metadata === "object" ? input.metadata : {};
  const attempts = typeof input.attempts === "number" ? input.attempts : 0;
  return {
    id: input.id || nextEventId(),
    type: input.type || "cadence_tick",
    source: input.source || "manual",
    ts: input.ts || new Date().toISOString(),
    cadence,
    workflow: input.workflow || null,
    correlation_id: input.correlation_id || null,
    priority,
    metadata,
    attempts,
  };
}
333
+
334
+ // ---------------------------------------------------------------------------
335
+ // Consume
336
+ // ---------------------------------------------------------------------------
337
+
338
/**
 * List inbox event ids in chronological order (ids sort by their ISO prefix).
 *
 * @param {string} [agentRoot]
 * @returns {string[]} Event ids without the `.json` extension.
 */
export function listInbox(agentRoot) {
  const { inbox } = getBusPaths(agentRoot);
  if (!existsSync(inbox)) return [];
  const names = readdirSync(inbox).filter((n) => n.endsWith(".json") && !n.endsWith(".tmp"));
  names.sort();
  return names.map((n) => n.slice(0, -".json".length));
}
349
+
350
/**
 * List currently-claimed event ids, oldest first. Used by stale-claim recovery.
 *
 * @param {string} [agentRoot]
 * @returns {string[]} Event ids without the `.json` extension.
 */
export function listClaimed(agentRoot) {
  const { claimed } = getBusPaths(agentRoot);
  if (!existsSync(claimed)) return [];
  const names = readdirSync(claimed).filter((n) => n.endsWith(".json") && !n.endsWith(".tmp"));
  names.sort();
  return names.map((n) => n.slice(0, -".json".length));
}
361
+
362
/**
 * Atomically claim the oldest inbox event for processing.
 *
 * The claim is a single rename(inbox/<id>.json → claimed/<id>.json): when two
 * consumers race, exactly one rename succeeds and the loser sees ENOENT and
 * moves on to the next candidate. The attempt counter is bumped and persisted
 * immediately so a crash mid-handler still reflects that we tried once.
 *
 * @param {string} [agentRoot]
 * @returns {{ event: object, claimedPath: string }|null} The parsed event and
 *   its claimed path, or null when the inbox is empty / all candidates lost
 *   the race.
 */
export function claimNextTick(agentRoot) {
  const paths = ensureBusDirs(agentRoot);
  for (const id of listInbox(paths.agentRoot)) {
    const inboxPath = join(paths.inbox, `${id}.json`);
    const claimedPath = join(paths.claimed, `${id}.json`);

    // The rename IS the claim; ENOENT means another consumer won the race.
    try {
      renameSync(inboxPath, claimedPath);
    } catch (err) {
      if (err.code === "ENOENT") continue;
      throw err;
    }

    // A malformed payload is failed (routed via failTick) and skipped.
    let event;
    try {
      event = JSON.parse(readFileSync(claimedPath, "utf-8"));
    } catch (err) {
      logBusEvent(paths.agentRoot, {
        level: "error",
        stage: "claim_parse_failed",
        id,
        error: err.message,
      });
      failTick(paths.agentRoot, id, `parse-failed: ${err.message}`);
      continue;
    }

    event.attempts = (typeof event.attempts === "number" ? event.attempts : 0) + 1;
    try {
      writeJsonAtomic(claimedPath, event);
    } catch {
      /* best-effort */
    }

    logBusEvent(paths.agentRoot, {
      level: "info",
      stage: "claimed",
      id,
      cadence: event.cadence,
      attempts: event.attempts,
    });
    return { event, claimedPath };
  }
  return null;
}
416
+
417
+ // ---------------------------------------------------------------------------
418
+ // Lifecycle transitions
419
+ // ---------------------------------------------------------------------------
420
+
421
/**
 * Move a claimed event file into `destDir` (created if needed).
 *
 * @param {object} paths Bus paths (needs `claimed`).
 * @param {string} id Event id (filename stem).
 * @param {string} destDir Destination directory.
 * @param {string} [suffix] Destination filename suffix, default ".json".
 * @returns {string|null} The destination path, or null when no claim exists.
 */
function moveClaimed(paths, id, destDir, suffix = ".json") {
  const source = join(paths.claimed, `${id}.json`);
  mkdirSync(destDir, { recursive: true });
  if (!existsSync(source)) return null;
  const destination = join(destDir, `${id}${suffix}`);
  renameSync(source, destination);
  return destination;
}
429
+
430
/**
 * Mark a claimed event as successfully processed.
 *
 * Moves it to processed/<YYYY-MM-DD>/<id>.json and (best-effort) annotates
 * the archived payload with the handler result and a completion timestamp.
 *
 * @param {string} [agentRoot]
 * @param {string} id Event id.
 * @param {object} [result] Handler result, stored on the archived event.
 * @returns {string|null} Archived path, or null when no claim was found.
 */
export function completeTick(agentRoot, id, result = {}) {
  const paths = ensureBusDirs(agentRoot);
  const archiveDir = join(paths.processed, todayUtc());
  const archived = moveClaimed(paths, id, archiveDir);
  if (archived === null) {
    logBusEvent(paths.agentRoot, { level: "warn", stage: "complete_missing", id });
    return null;
  }

  // Annotate the archived event with the result (best-effort).
  try {
    const event = JSON.parse(readFileSync(archived, "utf-8"));
    event.result = result;
    event.completed_at = new Date().toISOString();
    writeJsonAtomic(archived, event);
  } catch {
    /* best-effort */
  }

  logBusEvent(paths.agentRoot, {
    level: "info",
    stage: "processed",
    id,
    cadence: result.cadence || null,
    decision: result.decision || null,
    duration_ms: result.duration_ms ?? null,
  });
  return archived;
}
461
+
462
/**
 * Mark a claimed event as failed.
 *
 * Under the retry budget the event is re-written to inbox/ for another
 * attempt; at/over the budget (or when `opts.terminal` is true) it moves to
 * dlq/. A claimed payload that exists on disk but cannot be parsed as JSON is
 * routed straight to dlq/ — previously such a file was silently left in
 * claimed/ forever, and recoverStaleClaims would re-scan it on every sweep
 * without ever being able to move it (failTick returned null on parse
 * failure).
 *
 * @param {string} [agentRoot]
 * @param {string} id Event id (filename stem in claimed/).
 * @param {string|Error} errorOrReason Failure description or thrown error.
 * @param {{ maxAttempts?: number, terminal?: boolean }} [opts]
 * @returns {{ destination: "inbox"|"dlq" }|null} Where the event went, or
 *   null when no claimed file exists for `id`.
 */
export function failTick(agentRoot, id, errorOrReason, opts = {}) {
  const paths = ensureBusDirs(agentRoot);
  const maxAttempts = opts.maxAttempts ?? DEFAULT_MAX_ATTEMPTS;
  const srcClaimed = join(paths.claimed, `${id}.json`);
  const reason = typeof errorOrReason === "string"
    ? errorOrReason
    : (errorOrReason?.message || String(errorOrReason));

  if (!existsSync(srcClaimed)) {
    // Caller already moved or never had a claim; nothing to do.
    logBusEvent(paths.agentRoot, { level: "warn", stage: "fail_missing", id });
    return null;
  }

  let event = null;
  try {
    event = JSON.parse(readFileSync(srcClaimed, "utf-8"));
  } catch {
    /* handled below: an unparseable payload goes terminal */
  }

  if (!event) {
    // FIX: an unparseable claimed payload has no usable attempts counter and
    // can never take the retry path, so park it in dlq/ instead of leaving
    // it stuck in claimed/ indefinitely.
    moveClaimed(paths, id, paths.dlq);
    logBusEvent(paths.agentRoot, {
      level: "error",
      stage: "dlq",
      id,
      cadence: null,
      attempts: null,
      reason: `unparseable-payload: ${reason}`,
    });
    return { destination: "dlq" };
  }

  event.last_error = reason;
  event.failed_at = new Date().toISOString();

  const attempts = typeof event.attempts === "number" ? event.attempts : 0;
  if (attempts >= maxAttempts || opts.terminal === true) {
    // Retry budget exhausted (or terminal requested): move to dlq/.
    moveClaimed(paths, id, paths.dlq);
    logBusEvent(paths.agentRoot, {
      level: "error",
      stage: "dlq",
      id,
      cadence: event.cadence,
      attempts,
      reason: event.last_error,
    });
    return { destination: "dlq" };
  }

  // Re-enqueue: write fresh in inbox/, then drop the claim.
  writeJsonAtomic(join(paths.inbox, `${id}.json`), event);
  try { unlinkSync(srcClaimed); } catch { /* ignore */ }
  logBusEvent(paths.agentRoot, {
    level: "warn",
    stage: "retry_requeued",
    id,
    cadence: event.cadence,
    attempts,
    reason: event.last_error,
  });
  return { destination: "inbox" };
}
513
+
514
+ // ---------------------------------------------------------------------------
515
+ // Stale claim recovery
516
+ // ---------------------------------------------------------------------------
517
+
518
/**
 * Sweep claimed/ for entries older than `maxAgeMs`.
 *
 * Each stale claim is handed to failTick, which either returns it to inbox/
 * (under the retry budget) or moves it to dlq/. Safe to run on consumer
 * startup and periodically while running, because the consumer is the only
 * writer to claimed/.
 *
 * @param {string} [agentRoot]
 * @param {number} [maxAgeMs] Staleness threshold, default DEFAULT_STALE_CLAIM_MS.
 * @param {number} [now] Clock override for tests (epoch ms).
 * @returns {{ recovered: number, dlq: number, scanned: number }}
 */
export function recoverStaleClaims(agentRoot, maxAgeMs = DEFAULT_STALE_CLAIM_MS, now = Date.now()) {
  const paths = ensureBusDirs(agentRoot);
  const stats = { recovered: 0, dlq: 0, scanned: 0 };
  if (!existsSync(paths.claimed)) return stats;

  const candidates = readdirSync(paths.claimed).filter((name) => name.endsWith(".json"));
  for (const name of candidates) {
    stats.scanned += 1;
    const claimedPath = join(paths.claimed, name);
    let mtimeMs;
    try {
      mtimeMs = statSync(claimedPath).mtimeMs;
    } catch {
      continue; // vanished between readdir and stat
    }
    if (now - mtimeMs < maxAgeMs) continue;

    const id = name.replace(/\.json$/, "");
    const outcome = failTick(paths.agentRoot, id, "stale-claim-recovery", {
      maxAttempts: DEFAULT_MAX_ATTEMPTS,
    });
    if (outcome?.destination === "dlq") stats.dlq += 1;
    else if (outcome?.destination === "inbox") stats.recovered += 1;
  }

  if (stats.scanned > 0) {
    logBusEvent(paths.agentRoot, { level: "info", stage: "stale_recovery_complete", ...stats });
  }
  return stats;
}
554
+
555
+ // ---------------------------------------------------------------------------
556
+ // Heartbeat
557
+ // ---------------------------------------------------------------------------
558
+
559
/**
 * Write the consumer's heartbeat state (best-effort, atomic).
 *
 * Doctor/healthcheck read this to confirm the persistent main session is
 * alive. Caller-supplied `state` fields override the stamped defaults.
 *
 * @param {string} [agentRoot]
 * @param {object} [state] Extra fields merged over version/ts/pid.
 * @returns {object} The payload that was (attempted to be) written.
 */
export function writeHealth(agentRoot, state = {}) {
  const paths = ensureBusDirs(agentRoot);
  const payload = Object.assign(
    { version: BUS_VERSION, ts: new Date().toISOString(), pid: process.pid },
    state
  );
  try {
    writeJsonAtomic(paths.health, payload);
  } catch {
    /* best-effort */
  }
  return payload;
}
578
+
579
/**
 * Read the consumer's heartbeat state.
 *
 * @param {string} [agentRoot]
 * @returns {object|null} Parsed health payload, or null if missing/unreadable.
 */
export function readHealth(agentRoot) {
  const { health } = getBusPaths(agentRoot);
  if (!existsSync(health)) return null;
  try {
    return JSON.parse(readFileSync(health, "utf-8"));
  } catch {
    return null;
  }
}
591
+
592
/**
 * Inspect bus depth — counts of `.json` events at each queue stage.
 *
 * Used by doctor and tests; cheap enough to call frequently.
 *
 * @param {string} [agentRoot]
 * @returns {{ inbox: number, claimed: number, dlq: number, failed: number }}
 */
export function busDepth(agentRoot) {
  const paths = getBusPaths(agentRoot);
  const countJson = (dir) => {
    if (!existsSync(dir)) return 0;
    return readdirSync(dir).filter((name) => name.endsWith(".json")).length;
  };
  return {
    inbox: countJson(paths.inbox),
    claimed: countJson(paths.claimed),
    dlq: countJson(paths.dlq),
    failed: countJson(paths.failed),
  };
}
606
+
607
+ // ---------------------------------------------------------------------------
608
+ // CLI usability
609
+ // ---------------------------------------------------------------------------
610
+
611
/**
 * Ensure a fresh checkout/install has the expected bus shape on disk.
 *
 * Creates the directory tree (via ensureBusDirs) and drops an empty
 * `.gitkeep` in each bus directory so empty-bus repos still track structure.
 * Marker creation is best-effort.
 *
 * @param {string} [agentRoot]
 * @returns {object} The bus paths (same shape as getBusPaths).
 */
export function bootstrapBus(agentRoot) {
  const paths = ensureBusDirs(agentRoot);
  const dirs = [paths.inbox, paths.claimed, paths.processed, paths.failed, paths.dlq, paths.logsDir];
  for (const dir of dirs) {
    const marker = join(dir, ".gitkeep");
    if (existsSync(marker)) continue;
    try {
      closeSync(openSync(marker, "a"));
    } catch {
      /* ignore */
    }
  }
  return paths;
}