@adaptic/maestro 1.7.3 → 1.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/commands/init-maestro.md +15 -2
- package/.gitignore +7 -0
- package/README.md +62 -11
- package/bin/maestro.mjs +338 -2
- package/bin/maestro.test.mjs +299 -0
- package/docs/guides/poller-daemon-setup.md +21 -8
- package/docs/runbooks/perpetual-operations.md +19 -15
- package/docs/runbooks/recovery-and-failover.md +42 -0
- package/lib/cadence-bus.mjs +625 -0
- package/lib/cadence-bus.test.mjs +354 -0
- package/package.json +6 -1
- package/scaffold/CLAUDE.md +11 -7
- package/scripts/cadence/cadence-status.mjs +36 -0
- package/scripts/cadence/enqueue-cadence-tick.mjs +158 -0
- package/scripts/cadence/enqueue-cadence-tick.test.mjs +154 -0
- package/scripts/cadence/launchd-cadence-wrapper.sh +85 -0
- package/scripts/daemon/cadence-consumer.mjs +439 -0
- package/scripts/daemon/cadence-consumer.test.mjs +397 -0
- package/scripts/daemon/cadence-handlers.mjs +263 -0
- package/scripts/daemon/maestro-daemon.mjs +20 -0
- package/scripts/local-triggers/generate-plists.sh +62 -17
- package/scripts/local-triggers/generate-plists.test.mjs +254 -0
- package/scripts/local-triggers/plists/.gitkeep +0 -0
- package/scripts/local-triggers/run-trigger.sh +22 -3
- package/scripts/local-triggers/plists/ai.adaptic.sophie-backlog-executor.plist +0 -21
- package/scripts/local-triggers/plists/ai.adaptic.sophie-daemon.plist +0 -32
- package/scripts/local-triggers/plists/ai.adaptic.sophie-inbox-processor.plist +0 -21
- package/scripts/local-triggers/plists/ai.adaptic.sophie-meeting-action-capture.plist +0 -21
- package/scripts/local-triggers/plists/ai.adaptic.sophie-meeting-prep.plist +0 -21
- package/scripts/local-triggers/plists/ai.adaptic.sophie-midday-sweep.plist +0 -26
- package/scripts/local-triggers/plists/ai.adaptic.sophie-quarterly-self-assessment.plist +0 -62
- package/scripts/local-triggers/plists/ai.adaptic.sophie-weekly-engineering-health.plist +0 -28
- package/scripts/local-triggers/plists/ai.adaptic.sophie-weekly-execution.plist +0 -28
- package/scripts/local-triggers/plists/ai.adaptic.sophie-weekly-hiring.plist +0 -28
- package/scripts/local-triggers/plists/ai.adaptic.sophie-weekly-priorities.plist +0 -28
- package/scripts/local-triggers/plists/ai.adaptic.sophie-weekly-strategic-memo.plist +0 -28
|
@@ -0,0 +1,625 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Maestro — Cadence Bus
|
|
3
|
+
*
|
|
4
|
+
* Local, file-backed event queue that decouples scheduled cadence ticks
|
|
5
|
+
* (launchd, manual, daemon, init-maestro, upgrade) from the persistent main
|
|
6
|
+
* Maestro session that actually services them.
|
|
7
|
+
*
|
|
8
|
+
* Why this exists
|
|
9
|
+
* ---------------
|
|
10
|
+
* Before the bus, every scheduled cadence tick (inbox-processor every 5m,
|
|
11
|
+
* backlog-executor every 10m, the daily/weekly/quarterly ones, …) was wired
|
|
12
|
+
* to launch a fresh `claude --print` session via run-trigger.sh. That meant
|
|
13
|
+
* dozens of full Claude Code spawns per day, each paying full auth/context/
|
|
14
|
+
* token overhead even when the tick had nothing to do. It also made
|
|
15
|
+
* emergency-stop, deduplication, rate-limiting, and quota gating ad-hoc.
|
|
16
|
+
*
|
|
17
|
+
* After the bus, launchd just enqueues a tiny JSON event onto a directory.
|
|
18
|
+
* The persistent daemon (maestro-daemon.mjs) consumes events in-process,
|
|
19
|
+
* handles lightweight scan/classify/route work inline, and only spawns a
|
|
20
|
+
* sub-session when the work genuinely warrants it.
|
|
21
|
+
*
|
|
22
|
+
* Storage layout (all paths relative to AGENT_ROOT)
|
|
23
|
+
* --------------------------------------------------
|
|
24
|
+
* state/cadence-bus/
|
|
25
|
+
* inbox/ — newly enqueued events (one JSON per event)
|
|
26
|
+
* claimed/ — events currently being processed
|
|
27
|
+
* processed/<YYYY-MM-DD>/ — successful completions, archived by date
|
|
28
|
+
* failed/ — last-known-bad event payloads (transient errors)
|
|
29
|
+
* dlq/ — events that exceeded retry budget (terminal)
|
|
30
|
+
* queue.jsonl — append-only fallback log (every enqueue lands
|
|
31
|
+
* here too, so no event is lost even if inbox/
|
|
32
|
+
* is unavailable on a given enqueue)
|
|
33
|
+
* health.json — heartbeat written by the consumer
|
|
34
|
+
*
|
|
35
|
+
* Concurrency model
|
|
36
|
+
* -----------------
|
|
37
|
+
* Atomic enqueue: write to a temp file in the same directory, then
|
|
38
|
+
* rename(tmp, final). rename(2) is atomic on macOS APFS
|
|
39
|
+
* when source and destination live on the same volume.
|
|
40
|
+
*
|
|
41
|
+
* Atomic claim: rename(inbox/<id>.json → claimed/<id>.json). If two
|
|
42
|
+
* consumers race for the same file, exactly one rename
|
|
43
|
+
* succeeds; the loser gets ENOENT and skips the event.
|
|
44
|
+
* No locks, no stale-lock pathology.
|
|
45
|
+
*
|
|
46
|
+
* Stale recovery: on consumer startup (and periodically while running),
|
|
47
|
+
* any file in claimed/ older than STALE_CLAIM_MS is
|
|
48
|
+
* either returned to inbox/ (under the retry budget) or
|
|
49
|
+
* moved to dlq/. This handles crashes mid-processing.
|
|
50
|
+
*
|
|
51
|
+
* Event payload schema
|
|
52
|
+
* --------------------
|
|
53
|
+
* {
|
|
54
|
+
* "id": "evt-2026-05-12T15:30:00.123Z-<random>",
|
|
55
|
+
* "type": "cadence_tick" | "manual_tick" | "internal",
|
|
56
|
+
* "source": "launchd" | "manual" | "daemon" | "init-maestro" | "upgrade",
|
|
57
|
+
* "ts": "2026-05-12T15:30:00.123Z",
|
|
58
|
+
* "cadence": "inbox-processor" | "backlog-executor" | "weekly-strategic-memo" | …,
|
|
59
|
+
* "workflow": "continuous/inbox-processor" (optional),
|
|
60
|
+
* "correlation_id": "<external id, optional>",
|
|
61
|
+
* "priority": "high" | "normal" | "low" (default "normal"),
|
|
62
|
+
* "metadata": { … free-form, kept for auditing },
|
|
63
|
+
* "attempts": 0 (auto-managed by consumer)
|
|
64
|
+
* }
|
|
65
|
+
*
|
|
66
|
+
* The bus does not interpret `cadence` itself — it only routes by it.
|
|
67
|
+
* Handler selection and execution live in scripts/daemon/cadence-handlers.mjs
|
|
68
|
+
* and scripts/daemon/cadence-consumer.mjs.
|
|
69
|
+
*/
|
|
70
|
+
|
|
71
|
+
import {
|
|
72
|
+
appendFileSync,
|
|
73
|
+
closeSync,
|
|
74
|
+
existsSync,
|
|
75
|
+
mkdirSync,
|
|
76
|
+
openSync,
|
|
77
|
+
readFileSync,
|
|
78
|
+
readdirSync,
|
|
79
|
+
renameSync,
|
|
80
|
+
statSync,
|
|
81
|
+
unlinkSync,
|
|
82
|
+
writeFileSync,
|
|
83
|
+
} from "node:fs";
|
|
84
|
+
import { join, resolve, dirname } from "node:path";
|
|
85
|
+
import { randomBytes } from "node:crypto";
|
|
86
|
+
|
|
87
|
+
// ---------------------------------------------------------------------------
|
|
88
|
+
// Constants (override-friendly via env for tests)
|
|
89
|
+
// ---------------------------------------------------------------------------
|
|
90
|
+
|
|
91
|
+
// Directory (relative to AGENT_ROOT) holding the bus's on-disk queue state.
export const BUS_RELATIVE = "state/cadence-bus";
// Directory (relative to AGENT_ROOT) receiving the bus's daily JSONL logs.
export const LOGS_RELATIVE = "logs/cadence-bus";

// Bus schema version. If we change the on-disk layout in a backwards-
// incompatible way, bump this and write a migration. Doctor reads this.
export const BUS_VERSION = "1";

// Stale claim threshold — claims older than this are recovered to inbox or
// the dlq. Defaults assume the longest live cadence handler runs well under
// 30 minutes; tune via env for long-running migrations.
// NOTE(review): the section banner says "override-friendly via env", but no
// env variable is actually read for these two defaults — callers can only
// override via function arguments (see recoverStaleClaims / failTick).
// Confirm whether an env override was intended.
const DEFAULT_STALE_CLAIM_MS = 30 * 60 * 1000;

// Maximum attempts before an event goes to dlq. Cadence ticks are mostly
// non-essential; we don't want a single stuck tick to clog the bus.
const DEFAULT_MAX_ATTEMPTS = 5;
|
|
106
|
+
|
|
107
|
+
// ---------------------------------------------------------------------------
|
|
108
|
+
// Path resolution
|
|
109
|
+
// ---------------------------------------------------------------------------
|
|
110
|
+
|
|
111
|
+
/**
 * Resolve the agent root directory as an absolute path.
 *
 * Precedence: explicit `agentRoot` argument, then the AGENT_ROOT env var,
 * then AGENT_DIR, finally process.cwd(). Falsy candidates (undefined,
 * empty string) are skipped.
 *
 * @param {string} [agentRoot] Optional explicit root override.
 * @returns {string} Absolute path to the agent root.
 */
export function resolveAgentRoot(agentRoot) {
  const candidates = [agentRoot, process.env.AGENT_ROOT, process.env.AGENT_DIR];
  const chosen = candidates.find(Boolean) ?? process.cwd();
  return resolve(chosen);
}
|
|
123
|
+
|
|
124
|
+
/**
 * Derive every filesystem path the bus uses from an agent root.
 *
 * @param {string} [agentRoot] Optional root override (see resolveAgentRoot).
 * @returns {object} Absolute paths: agentRoot, base, the five lifecycle
 *   directories (inbox/claimed/processed/failed/dlq), queueJsonl, health,
 *   version marker, logsDir, and the emergency-stop sentinel.
 */
export function getBusPaths(agentRoot) {
  const root = resolveAgentRoot(agentRoot);
  const base = join(root, BUS_RELATIVE);
  // Helper for the members that live directly under the bus base dir.
  const under = (name) => join(base, name);
  return {
    agentRoot: root,
    base,
    inbox: under("inbox"),
    claimed: under("claimed"),
    processed: under("processed"),
    failed: under("failed"),
    dlq: under("dlq"),
    queueJsonl: under("queue.jsonl"),
    health: under("health.json"),
    version: under("VERSION"),
    logsDir: join(root, LOGS_RELATIVE),
    emergencyStop: join(root, ".emergency-stop"),
  };
}
|
|
145
|
+
|
|
146
|
+
/**
 * Idempotently create the cadence-bus directory tree. Safe to call on every
 * enqueue/consume; no-op once present.
 *
 * @param {string} [agentRoot] Optional root override.
 * @returns {object} The resolved bus paths (same shape as getBusPaths).
 */
export function ensureBusDirs(agentRoot) {
  const paths = getBusPaths(agentRoot);
  const required = [
    paths.inbox,
    paths.claimed,
    paths.processed,
    paths.failed,
    paths.dlq,
    paths.logsDir,
  ];
  required.forEach((dir) => mkdirSync(dir, { recursive: true }));
  // Touch a version marker so doctor / upgrade can detect a freshly-created
  // bus and reason about schema migrations later.
  if (!existsSync(paths.version)) {
    writeFileSync(paths.version, `${BUS_VERSION}\n`);
  }
  return paths;
}
|
|
162
|
+
|
|
163
|
+
// ---------------------------------------------------------------------------
|
|
164
|
+
// Event id
|
|
165
|
+
// ---------------------------------------------------------------------------
|
|
166
|
+
|
|
167
|
+
/**
 * Generate a unique, sortable event id of the form:
 *   evt-2026-05-12T15:30:00.123Z-<8 hex>
 *
 * The ISO timestamp prefix keeps directory listings naturally ordered by
 * time; the 4-byte random suffix avoids collisions for events created in
 * the same millisecond.
 *
 * @param {Date} [now] Injectable clock for tests; defaults to the real time.
 * @returns {string} The event id.
 */
export function nextEventId(now = new Date()) {
  const suffix = randomBytes(4).toString("hex");
  return `evt-${now.toISOString()}-${suffix}`;
}
|
|
178
|
+
|
|
179
|
+
// ---------------------------------------------------------------------------
|
|
180
|
+
// Logging
|
|
181
|
+
// ---------------------------------------------------------------------------
|
|
182
|
+
|
|
183
|
+
/**
 * Current UTC calendar date as "YYYY-MM-DD" (the date portion of ISO-8601).
 *
 * @param {Date} [now] Injectable clock for tests.
 * @returns {string} e.g. "2026-05-12".
 */
function todayUtc(now = new Date()) {
  const [datePart] = now.toISOString().split("T");
  return datePart;
}
|
|
186
|
+
|
|
187
|
+
/**
 * Append a structured log line to logs/cadence-bus/<date>.jsonl. Best-effort;
 * callers never block on log failures.
 *
 * @param {string} [agentRoot] Optional root override.
 * @param {object} entry Arbitrary fields; a `ts` timestamp is prepended
 *   (entry's own `ts`, if present, wins via spread order).
 * @returns {boolean} true on success, false if any I/O step threw.
 */
export function logBusEvent(agentRoot, entry) {
  try {
    const { logsDir } = getBusPaths(agentRoot);
    mkdirSync(logsDir, { recursive: true });
    const record = { ts: new Date().toISOString(), ...entry };
    const logFile = join(logsDir, `${todayUtc()}.jsonl`);
    appendFileSync(logFile, `${JSON.stringify(record)}\n`);
    return true;
  } catch {
    // Logging must never take down the bus; swallow and report failure.
    return false;
  }
}
|
|
204
|
+
|
|
205
|
+
// ---------------------------------------------------------------------------
|
|
206
|
+
// Atomic write
|
|
207
|
+
// ---------------------------------------------------------------------------
|
|
208
|
+
|
|
209
|
+
/**
 * Atomically write pretty-printed JSON to `targetPath`.
 *
 * Writes a uniquely-named sibling `.tmp` file, then rename(2)s it over the
 * target. Because the temp file lives in the target's own directory, source
 * and destination share a volume and the rename is atomic (true on macOS
 * APFS, the primary deployment target). No explicit fsync is performed —
 * durability relies on the filesystem's rename semantics.
 *
 * @param {string} targetPath Destination file; parent dirs are created.
 * @param {*} obj JSON-serializable payload.
 * @throws Propagates rename failures after cleaning up the temp file.
 */
function writeJsonAtomic(targetPath, obj) {
  mkdirSync(dirname(targetPath), { recursive: true });
  // pid + timestamp + random nonce makes collisions across concurrent
  // writers effectively impossible.
  const nonce = randomBytes(2).toString("hex");
  const tmpPath = `${targetPath}.tmp.${process.pid}.${Date.now()}.${nonce}`;
  writeFileSync(tmpPath, `${JSON.stringify(obj, null, 2)}\n`);
  try {
    renameSync(tmpPath, targetPath);
  } catch (err) {
    // Don't leave an orphaned temp file behind on rename failure.
    try { unlinkSync(tmpPath); } catch { /* ignore */ }
    throw err;
  }
}
|
|
226
|
+
|
|
227
|
+
// ---------------------------------------------------------------------------
|
|
228
|
+
// Enqueue
|
|
229
|
+
// ---------------------------------------------------------------------------
|
|
230
|
+
|
|
231
|
+
/**
 * Append a one-line JSON record to the fallback queue.jsonl. Serves both as
 * an append-only audit log of every enqueue and as a last-resort fallback
 * when writing to inbox/ fails (e.g. disk full, permissions).
 *
 * @param {object} paths Bus paths object (needs `queueJsonl`).
 * @param {object} event Event payload to record.
 * @returns {boolean} true if the line was appended, false on any I/O error.
 */
function appendFallbackQueue(paths, event) {
  try {
    mkdirSync(dirname(paths.queueJsonl), { recursive: true });
    appendFileSync(paths.queueJsonl, `${JSON.stringify(event)}\n`);
    return true;
  } catch {
    // The fallback log is itself best-effort; never throw from here.
    return false;
  }
}
|
|
245
|
+
|
|
246
|
+
/**
 * Enqueue a cadence event onto the bus.
 *
 * @param {object} input
 * @param {string} input.cadence Cadence name (e.g. "inbox-processor"). Required.
 * @param {string} [input.type] Event type, default "cadence_tick".
 * @param {string} [input.source] Originator label, default "manual".
 * @param {string} [input.workflow]
 * @param {string} [input.correlation_id]
 * @param {"high"|"normal"|"low"} [input.priority]
 * @param {object} [input.metadata]
 * @param {string} [input.agentRoot] Override AGENT_ROOT for tests.
 * @returns {{ id: string, path: string, fallbackOnly: boolean, skipped?: string }}
 * @throws {TypeError} When input is not an object.
 * @throws {Error} When cadence is missing/blank.
 */
export function enqueueTick(input = {}) {
  if (!input || typeof input !== "object") {
    throw new TypeError("enqueueTick: input must be an object");
  }
  const cadence = String(input.cadence || "").trim();
  if (!cadence) {
    throw new Error("enqueueTick: cadence is required");
  }

  const paths = ensureBusDirs(input.agentRoot);
  const event = buildEvent(input, cadence);

  // Emergency-stop short-circuit. The bus still logs the event so we can
  // see what would have run, but no file lands in inbox/.
  if (existsSync(paths.emergencyStop)) {
    appendFallbackQueue(paths, { ...event, _suppressed: "emergency-stop" });
    logBusEvent(paths.agentRoot, {
      level: "info",
      stage: "enqueue_suppressed",
      cadence,
      reason: "emergency-stop",
      id: event.id,
    });
    return { id: event.id, path: null, fallbackOnly: true, skipped: "emergency-stop" };
  }

  // Always write the audit row first. If the inbox write fails below, the
  // event is still recoverable from queue.jsonl.
  appendFallbackQueue(paths, event);

  const inboxTarget = join(paths.inbox, `${event.id}.json`);
  let inboxOk = true;
  try {
    writeJsonAtomic(inboxTarget, event);
  } catch (err) {
    inboxOk = false;
    logBusEvent(paths.agentRoot, {
      level: "error",
      stage: "enqueue_inbox_failed",
      cadence,
      id: event.id,
      error: err.message,
    });
  }

  logBusEvent(paths.agentRoot, {
    level: "info",
    stage: inboxOk ? "enqueued" : "enqueued_fallback",
    cadence,
    source: event.source,
    priority: event.priority,
    id: event.id,
  });

  return { id: event.id, path: inboxOk ? inboxTarget : null, fallbackOnly: !inboxOk };
}
|
|
318
|
+
|
|
319
|
+
/**
 * Normalize raw enqueue input into a fully-populated event payload.
 *
 * Missing fields get defaults; an unrecognized priority collapses to
 * "normal"; non-object metadata collapses to {}.
 *
 * @param {object} input Raw enqueue input (see enqueueTick).
 * @param {string} cadence Validated, trimmed cadence name.
 * @returns {object} Complete event payload ready to persist.
 */
function buildEvent(input, cadence) {
  const priority = ["high", "normal", "low"].includes(input.priority)
    ? input.priority
    : "normal";
  const metadata =
    input.metadata && typeof input.metadata === "object" ? input.metadata : {};
  const attempts = typeof input.attempts === "number" ? input.attempts : 0;
  return {
    id: input.id || nextEventId(),
    type: input.type || "cadence_tick",
    source: input.source || "manual",
    ts: input.ts || new Date().toISOString(),
    cadence,
    workflow: input.workflow || null,
    correlation_id: input.correlation_id || null,
    priority,
    metadata,
    attempts,
  };
}
|
|
333
|
+
|
|
334
|
+
// ---------------------------------------------------------------------------
|
|
335
|
+
// Consume
|
|
336
|
+
// ---------------------------------------------------------------------------
|
|
337
|
+
|
|
338
|
+
/**
 * List inbox event ids in chronological order. Returns ids without `.json`.
 *
 * Only the `.json` extension check is needed: writeJsonAtomic's temp files
 * are named `<id>.json.tmp.<pid>.<ts>.<hex>` and never end in ".json".
 * (The previous `!n.endsWith(".tmp")` guard was dead code — a name ending
 * in ".json" can never simultaneously end in ".tmp".)
 *
 * @param {string} [agentRoot] Optional root override.
 * @returns {string[]} Sorted event ids (ISO-prefixed, so oldest first).
 */
export function listInbox(agentRoot) {
  const paths = getBusPaths(agentRoot);
  if (!existsSync(paths.inbox)) return [];
  return readdirSync(paths.inbox)
    .filter((n) => n.endsWith(".json"))
    .sort()
    .map((n) => n.slice(0, -5));
}
|
|
349
|
+
|
|
350
|
+
/**
 * List currently-claimed event ids. Used by stale-claim recovery.
 *
 * Only the `.json` extension check is needed: writeJsonAtomic's temp files
 * are named `<id>.json.tmp.<pid>.<ts>.<hex>` and never end in ".json".
 * (The previous `!n.endsWith(".tmp")` guard was dead code — a name ending
 * in ".json" can never simultaneously end in ".tmp".)
 *
 * @param {string} [agentRoot] Optional root override.
 * @returns {string[]} Sorted claimed event ids.
 */
export function listClaimed(agentRoot) {
  const paths = getBusPaths(agentRoot);
  if (!existsSync(paths.claimed)) return [];
  return readdirSync(paths.claimed)
    .filter((n) => n.endsWith(".json"))
    .sort()
    .map((n) => n.slice(0, -5));
}
|
|
361
|
+
|
|
362
|
+
/**
 * Atomically claim the oldest inbox event for processing.
 *
 * Claim-by-rename needs no locks: if two consumers race for the same file,
 * exactly one rename(inbox → claimed) succeeds; the loser sees ENOENT and
 * moves on to the next candidate.
 *
 * @param {string} [agentRoot] Optional root override.
 * @returns {{ event: object, claimedPath: string } | null} The parsed event
 *   plus its absolute claimed-file path, or null when the inbox is empty /
 *   every candidate was lost to a racing consumer or was malformed.
 */
export function claimNextTick(agentRoot) {
  const paths = ensureBusDirs(agentRoot);
  // Ids embed an ISO timestamp prefix, so the sorted listing is oldest-first.
  const ids = listInbox(paths.agentRoot);
  for (const id of ids) {
    const src = join(paths.inbox, `${id}.json`);
    const dst = join(paths.claimed, `${id}.json`);
    try {
      renameSync(src, dst);
    } catch (err) {
      if (err.code === "ENOENT") continue; // someone else claimed it
      throw err;
    }
    // Load the event from its claimed path. If it's malformed, delegate
    // disposition of the bad payload to failTick and try the next one.
    let event;
    try {
      event = JSON.parse(readFileSync(dst, "utf-8"));
    } catch (err) {
      logBusEvent(paths.agentRoot, {
        level: "error",
        stage: "claim_parse_failed",
        id,
        error: err.message,
      });
      failTick(paths.agentRoot, id, `parse-failed: ${err.message}`);
      continue;
    }
    event.attempts = (typeof event.attempts === "number" ? event.attempts : 0) + 1;
    // Persist the bumped attempt count so a crash mid-handler still reflects
    // that we tried once.
    try {
      writeJsonAtomic(dst, event);
    } catch {
      /* best-effort */
    }
    logBusEvent(paths.agentRoot, {
      level: "info",
      stage: "claimed",
      id,
      cadence: event.cadence,
      attempts: event.attempts,
    });
    return { event, claimedPath: dst };
  }
  return null;
}
|
|
416
|
+
|
|
417
|
+
// ---------------------------------------------------------------------------
|
|
418
|
+
// Lifecycle transitions
|
|
419
|
+
// ---------------------------------------------------------------------------
|
|
420
|
+
|
|
421
|
+
/**
 * Move a claimed event file into `destDir`, creating the directory first.
 *
 * @param {object} paths Bus paths object (needs `claimed`).
 * @param {string} id Event id (filename without extension).
 * @param {string} destDir Absolute destination directory.
 * @param {string} [suffix] Destination filename suffix, default ".json".
 * @returns {string|null} Destination path, or null when the claim no
 *   longer exists (e.g. already moved by another lifecycle transition).
 */
function moveClaimed(paths, id, destDir, suffix = ".json") {
  const claimPath = join(paths.claimed, `${id}.json`);
  mkdirSync(destDir, { recursive: true });
  if (!existsSync(claimPath)) return null;
  const destination = join(destDir, `${id}${suffix}`);
  renameSync(claimPath, destination);
  return destination;
}
|
|
429
|
+
|
|
430
|
+
/**
 * Mark a claimed event as successfully processed. Moves it to
 * processed/<YYYY-MM-DD>/<id>.json and annotates it with the handler result.
 *
 * @param {string} [agentRoot] Optional root override.
 * @param {string} id Event id.
 * @param {object} [result] Handler result; `cadence`, `decision`, and
 *   `duration_ms` (when present) are surfaced into the bus log.
 * @returns {string|null} Archived file path, or null if no claim existed.
 */
export function completeTick(agentRoot, id, result = {}) {
  const paths = ensureBusDirs(agentRoot);
  const archiveDir = join(paths.processed, todayUtc());
  const archived = moveClaimed(paths, id, archiveDir);
  if (!archived) {
    logBusEvent(paths.agentRoot, { level: "warn", stage: "complete_missing", id });
    return null;
  }
  // Best-effort: annotate the archived payload with the result and a
  // completion timestamp. Archival already happened; never fail here.
  try {
    const event = JSON.parse(readFileSync(archived, "utf-8"));
    event.result = result;
    event.completed_at = new Date().toISOString();
    writeJsonAtomic(archived, event);
  } catch {
    /* best-effort */
  }
  logBusEvent(paths.agentRoot, {
    level: "info",
    stage: "processed",
    id,
    cadence: result.cadence || null,
    decision: result.decision || null,
    duration_ms: result.duration_ms ?? null,
  });
  return archived;
}
|
|
461
|
+
|
|
462
|
+
/**
 * Mark a claimed event as failed. If under the retry budget, returns it to
 * inbox/ for another attempt; otherwise routes to dlq/.
 *
 * Fix: previously, a claimed file whose JSON failed to parse was logged as
 * "fail_missing" and left stranded in claimed/ forever — claimNextTick's
 * parse-failure path and stale-claim recovery both call failTick, which
 * re-parsed the same corrupt bytes and gave up each time. An unparsable
 * claim can never succeed on retry, so it now goes straight to dlq/.
 *
 * @param {string} [agentRoot] Optional root override.
 * @param {string} id Event id.
 * @param {string|Error} errorOrReason Failure description.
 * @param {object} [opts]
 * @param {number} [opts.maxAttempts] Retry budget, default DEFAULT_MAX_ATTEMPTS.
 * @param {boolean} [opts.terminal] Force dlq regardless of attempts.
 * @returns {{ destination: "dlq"|"inbox" } | null} Where the event went, or
 *   null when no claim existed on disk.
 */
export function failTick(agentRoot, id, errorOrReason, opts = {}) {
  const paths = ensureBusDirs(agentRoot);
  const maxAttempts = opts.maxAttempts ?? DEFAULT_MAX_ATTEMPTS;
  const srcClaimed = join(paths.claimed, `${id}.json`);
  let event = null;
  if (existsSync(srcClaimed)) {
    try {
      event = JSON.parse(readFileSync(srcClaimed, "utf-8"));
    } catch {
      // Claim exists but is unparsable: retrying can never succeed, so move
      // the raw payload to dlq/ instead of stranding it in claimed/.
      moveClaimed(paths, id, paths.dlq);
      logBusEvent(paths.agentRoot, {
        level: "error",
        stage: "dlq",
        id,
        reason: "unparsable-claim",
      });
      return { destination: "dlq" };
    }
  }
  if (!event) {
    // Caller already moved or never had a claim; nothing to do.
    logBusEvent(paths.agentRoot, { level: "warn", stage: "fail_missing", id });
    return null;
  }
  event.last_error = typeof errorOrReason === "string"
    ? errorOrReason
    : (errorOrReason?.message || String(errorOrReason));
  event.failed_at = new Date().toISOString();

  const attempts = typeof event.attempts === "number" ? event.attempts : 0;
  if (attempts >= maxAttempts || opts.terminal === true) {
    // Retry budget exhausted (or explicitly terminal) — move to dlq/.
    moveClaimed(paths, id, paths.dlq);
    logBusEvent(paths.agentRoot, {
      level: "error",
      stage: "dlq",
      id,
      cadence: event.cadence,
      attempts,
      reason: event.last_error,
    });
    return { destination: "dlq" };
  }

  // Re-enqueue: write the annotated event fresh in inbox/, drop the claim.
  writeJsonAtomic(join(paths.inbox, `${id}.json`), event);
  try { unlinkSync(srcClaimed); } catch { /* ignore */ }
  logBusEvent(paths.agentRoot, {
    level: "warn",
    stage: "retry_requeued",
    id,
    cadence: event.cadence,
    attempts,
    reason: event.last_error,
  });
  return { destination: "inbox" };
}
|
|
513
|
+
|
|
514
|
+
// ---------------------------------------------------------------------------
|
|
515
|
+
// Stale claim recovery
|
|
516
|
+
// ---------------------------------------------------------------------------
|
|
517
|
+
|
|
518
|
+
/**
 * Sweep claimed/ for entries older than `maxAgeMs`. Each stale claim is
 * either returned to inbox/ (under the retry budget) or moved to dlq/ via
 * failTick. Safe to run on consumer startup *and* periodically while
 * running, because the consumer is the only writer to claimed/.
 *
 * @param {string} [agentRoot] Optional root override.
 * @param {number} [maxAgeMs] Staleness threshold, default DEFAULT_STALE_CLAIM_MS.
 * @param {number} [now] Injectable clock (epoch ms) for tests.
 * @returns {{ recovered: number, dlq: number, scanned: number }}
 */
export function recoverStaleClaims(agentRoot, maxAgeMs = DEFAULT_STALE_CLAIM_MS, now = Date.now()) {
  const paths = ensureBusDirs(agentRoot);
  const stats = { recovered: 0, dlq: 0, scanned: 0 };
  if (!existsSync(paths.claimed)) return stats;

  const claims = readdirSync(paths.claimed).filter((name) => name.endsWith(".json"));
  for (const name of claims) {
    stats.scanned += 1;
    const claimedPath = join(paths.claimed, name);
    let info;
    try {
      info = statSync(claimedPath);
    } catch {
      continue; // vanished between readdir and stat — nothing to recover
    }
    if (now - info.mtimeMs < maxAgeMs) continue; // still fresh

    const id = name.slice(0, -5);
    const outcome = failTick(paths.agentRoot, id, "stale-claim-recovery", {
      maxAttempts: DEFAULT_MAX_ATTEMPTS,
    });
    if (outcome?.destination === "dlq") stats.dlq += 1;
    else if (outcome?.destination === "inbox") stats.recovered += 1;
  }

  if (stats.scanned > 0) {
    logBusEvent(paths.agentRoot, {
      level: "info",
      stage: "stale_recovery_complete",
      ...stats,
    });
  }
  return stats;
}
|
|
554
|
+
|
|
555
|
+
// ---------------------------------------------------------------------------
|
|
556
|
+
// Heartbeat
|
|
557
|
+
// ---------------------------------------------------------------------------
|
|
558
|
+
|
|
559
|
+
/**
 * Write the consumer's heartbeat state to health.json. Doctor / healthcheck
 * reads this to confirm the persistent main session is alive.
 *
 * @param {string} [agentRoot] Optional root override.
 * @param {object} [state] Extra fields; may override version/ts/pid.
 * @returns {object} The heartbeat payload (returned even if the write failed).
 */
export function writeHealth(agentRoot, state = {}) {
  const paths = ensureBusDirs(agentRoot);
  const heartbeat = Object.assign(
    { version: BUS_VERSION, ts: new Date().toISOString(), pid: process.pid },
    state,
  );
  try {
    writeJsonAtomic(paths.health, heartbeat);
  } catch {
    /* heartbeat is advisory — never propagate I/O errors */
  }
  return heartbeat;
}
|
|
578
|
+
|
|
579
|
+
/**
 * Read the consumer's heartbeat state from health.json.
 *
 * @param {string} [agentRoot] Optional root override.
 * @returns {object|null} Parsed heartbeat, or null if missing/unreadable.
 */
export function readHealth(agentRoot) {
  const { health } = getBusPaths(agentRoot);
  if (!existsSync(health)) return null;
  try {
    return JSON.parse(readFileSync(health, "utf-8"));
  } catch {
    // Corrupt/partial heartbeat reads the same as "no heartbeat".
    return null;
  }
}
|
|
591
|
+
|
|
592
|
+
/**
 * Inspect bus depth — counts of `.json` events at each live stage (the
 * processed/ archive is intentionally excluded). Used by doctor and tests;
 * cheap enough to call frequently.
 *
 * @param {string} [agentRoot] Optional root override.
 * @returns {{ inbox: number, claimed: number, dlq: number, failed: number }}
 */
export function busDepth(agentRoot) {
  const paths = getBusPaths(agentRoot);
  const countJson = (dir) => {
    if (!existsSync(dir)) return 0;
    return readdirSync(dir).filter((name) => name.endsWith(".json")).length;
  };
  return {
    inbox: countJson(paths.inbox),
    claimed: countJson(paths.claimed),
    dlq: countJson(paths.dlq),
    failed: countJson(paths.failed),
  };
}
|
|
606
|
+
|
|
607
|
+
// ---------------------------------------------------------------------------
|
|
608
|
+
// CLI usability
|
|
609
|
+
// ---------------------------------------------------------------------------
|
|
610
|
+
|
|
611
|
+
/**
 * Touch each bus directory so a fresh checkout/install has the expected
 * shape on disk: ensureBusDirs plus a `.gitkeep` marker per directory so
 * empty-bus repos still track the structure.
 *
 * @param {string} [agentRoot] Optional root override.
 * @returns {object} The resolved bus paths.
 */
export function bootstrapBus(agentRoot) {
  const paths = ensureBusDirs(agentRoot);
  const trackedDirs = [
    paths.inbox,
    paths.claimed,
    paths.processed,
    paths.failed,
    paths.dlq,
    paths.logsDir,
  ];
  for (const dir of trackedDirs) {
    const marker = join(dir, ".gitkeep");
    if (existsSync(marker)) continue;
    try {
      // open(append) + close creates an empty file without clobbering.
      closeSync(openSync(marker, "a"));
    } catch {
      /* marker is cosmetic — ignore failures */
    }
  }
  return paths;
}
|