@adaptic/maestro 1.7.3 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/commands/init-maestro.md +15 -2
- package/.gitignore +7 -0
- package/README.md +62 -11
- package/bin/maestro.mjs +338 -2
- package/bin/maestro.test.mjs +299 -0
- package/docs/guides/poller-daemon-setup.md +21 -8
- package/docs/runbooks/perpetual-operations.md +19 -15
- package/docs/runbooks/recovery-and-failover.md +42 -0
- package/lib/cadence-bus.mjs +625 -0
- package/lib/cadence-bus.test.mjs +354 -0
- package/package.json +6 -1
- package/scaffold/CLAUDE.md +11 -7
- package/scripts/cadence/cadence-status.mjs +36 -0
- package/scripts/cadence/enqueue-cadence-tick.mjs +158 -0
- package/scripts/cadence/enqueue-cadence-tick.test.mjs +154 -0
- package/scripts/cadence/launchd-cadence-wrapper.sh +85 -0
- package/scripts/daemon/cadence-consumer.mjs +439 -0
- package/scripts/daemon/cadence-consumer.test.mjs +397 -0
- package/scripts/daemon/cadence-handlers.mjs +263 -0
- package/scripts/daemon/maestro-daemon.mjs +20 -0
- package/scripts/local-triggers/generate-plists.sh +33 -12
- package/scripts/local-triggers/generate-plists.test.mjs +185 -0
- package/scripts/local-triggers/plists/.gitkeep +0 -0
- package/scripts/local-triggers/run-trigger.sh +22 -3
- package/scripts/local-triggers/plists/ai.adaptic.sophie-backlog-executor.plist +0 -21
- package/scripts/local-triggers/plists/ai.adaptic.sophie-daemon.plist +0 -32
- package/scripts/local-triggers/plists/ai.adaptic.sophie-inbox-processor.plist +0 -21
- package/scripts/local-triggers/plists/ai.adaptic.sophie-meeting-action-capture.plist +0 -21
- package/scripts/local-triggers/plists/ai.adaptic.sophie-meeting-prep.plist +0 -21
- package/scripts/local-triggers/plists/ai.adaptic.sophie-midday-sweep.plist +0 -26
- package/scripts/local-triggers/plists/ai.adaptic.sophie-quarterly-self-assessment.plist +0 -62
- package/scripts/local-triggers/plists/ai.adaptic.sophie-weekly-engineering-health.plist +0 -28
- package/scripts/local-triggers/plists/ai.adaptic.sophie-weekly-execution.plist +0 -28
- package/scripts/local-triggers/plists/ai.adaptic.sophie-weekly-hiring.plist +0 -28
- package/scripts/local-triggers/plists/ai.adaptic.sophie-weekly-priorities.plist +0 -28
- package/scripts/local-triggers/plists/ai.adaptic.sophie-weekly-strategic-memo.plist +0 -28
package/scripts/cadence/enqueue-cadence-tick.test.mjs
@@ -0,0 +1,154 @@
+/**
+ * enqueue-cadence-tick.test.mjs — node:test coverage for the CLI enqueue
+ * script (the entry point used by launchd plists).
+ */
+
+import { test } from "node:test";
+import assert from "node:assert/strict";
+import { promises as fsp } from "node:fs";
+import { writeFileSync, readFileSync, existsSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join, resolve, dirname } from "node:path";
+import { fileURLToPath } from "node:url";
+import { spawnSync } from "node:child_process";
+
+const __dirname = dirname(fileURLToPath(import.meta.url));
+const ENQUEUE_SCRIPT = resolve(__dirname, "enqueue-cadence-tick.mjs");
+
+async function makeAgentRoot() {
+  const path = join(
+    tmpdir(),
+    `enqueue-cli-test-${process.pid}-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`
+  );
+  await fsp.mkdir(path, { recursive: true });
+  return path;
+}
+
+async function rmRoot(path) {
+  try { await fsp.rm(path, { recursive: true, force: true }); } catch { /* */ }
+}
+
+function run(cwd, args, env = {}) {
+  return spawnSync(process.execPath, [ENQUEUE_SCRIPT, ...args], {
+    cwd,
+    env: { ...process.env, AGENT_ROOT: cwd, ...env },
+    encoding: "utf-8",
+  });
+}
+
+// ---------------------------------------------------------------------------
+// Happy path
+// ---------------------------------------------------------------------------
+
+test("CLI enqueues a tick and writes inbox JSON", async () => {
+  const root = await makeAgentRoot();
+  try {
+    const r = run(root, ["inbox-processor", "--source=launchd"]);
+    assert.equal(r.status, 0, r.stderr);
+    const out = JSON.parse(r.stdout.trim());
+    assert.equal(out.ok, true);
+    assert.equal(out.cadence, "inbox-processor");
+    assert.equal(out.fallback_only, false);
+    assert.ok(out.path);
+    assert.ok(existsSync(out.path));
+    const event = JSON.parse(readFileSync(out.path, "utf-8"));
+    assert.equal(event.source, "launchd");
+    assert.equal(event.cadence, "inbox-processor");
+  } finally { await rmRoot(root); }
+});
+
+test("CLI accepts metadata pairs (repeated and comma-separated)", async () => {
+  const root = await makeAgentRoot();
+  try {
+    const r = run(root, [
+      "weekly-strategic-memo",
+      "--source=manual",
+      "--metadata=note=launch",
+      "--metadata=k1=v1,k2=v2",
+    ]);
+    assert.equal(r.status, 0);
+    const out = JSON.parse(r.stdout.trim());
+    const event = JSON.parse(readFileSync(out.path, "utf-8"));
+    assert.equal(event.metadata.note, "launch");
+    assert.equal(event.metadata.k1, "v1");
+    assert.equal(event.metadata.k2, "v2");
+  } finally { await rmRoot(root); }
+});
+
+test("CLI --quiet suppresses stdout JSON", async () => {
+  const root = await makeAgentRoot();
+  try {
+    const r = run(root, ["inbox-processor", "--source=launchd", "--quiet"]);
+    assert.equal(r.status, 0);
+    assert.equal(r.stdout.trim(), "");
+  } finally { await rmRoot(root); }
+});
+
+test("CLI --version prints bus schema version", async () => {
+  const r = spawnSync(process.execPath, [ENQUEUE_SCRIPT, "--version"], { encoding: "utf-8" });
+  assert.equal(r.status, 0);
+  assert.match(r.stdout, /^\d+$/m);
+});
+
+test("CLI --help prints usage and exits 0", async () => {
+  const r = spawnSync(process.execPath, [ENQUEUE_SCRIPT, "--help"], { encoding: "utf-8" });
+  assert.equal(r.status, 0);
+  assert.match(r.stdout, /enqueue-cadence-tick/);
+});
+
+// ---------------------------------------------------------------------------
+// Error paths
+// ---------------------------------------------------------------------------
+
+test("CLI exits 2 with no cadence", async () => {
+  const root = await makeAgentRoot();
+  try {
+    const r = run(root, []);
+    assert.equal(r.status, 2);
+    assert.match(r.stderr, /required/);
+  } finally { await rmRoot(root); }
+});
+
+test("CLI exits 2 on unknown flag", async () => {
+  const root = await makeAgentRoot();
+  try {
+    const r = run(root, ["x", "--what-is-this"]);
+    assert.equal(r.status, 2);
+    assert.match(r.stderr, /unknown flag/);
+  } finally { await rmRoot(root); }
+});
+
+// ---------------------------------------------------------------------------
+// Emergency stop
+// ---------------------------------------------------------------------------
+
+test("CLI returns 0 with skipped=emergency-stop when stop flag exists", async () => {
+  const root = await makeAgentRoot();
+  try {
+    writeFileSync(join(root, ".emergency-stop"), "now");
+    const r = run(root, ["weekly-strategic-memo", "--source=launchd"]);
+    assert.equal(r.status, 0);
+    const out = JSON.parse(r.stdout.trim());
+    assert.equal(out.skipped, "emergency-stop");
+    assert.equal(out.fallback_only, true);
+    assert.equal(out.path, null);
+    // No file landed in inbox/.
+    assert.ok(!existsSync(join(root, "state/cadence-bus/inbox", `${out.id}.json`)));
+  } finally { await rmRoot(root); }
+});
+
+// ---------------------------------------------------------------------------
+// Bus-handoff sanity — the CLI must finish in well under a second so launchd
+// never piles up; we don't pin a number, just a generous ceiling.
+// ---------------------------------------------------------------------------
+
+test("CLI completes in well under 2 seconds", async () => {
+  const root = await makeAgentRoot();
+  try {
+    const t0 = Date.now();
+    const r = run(root, ["inbox-processor", "--source=launchd", "--quiet"]);
+    const dt = Date.now() - t0;
+    assert.equal(r.status, 0);
+    assert.ok(dt < 2000, `enqueue took ${dt}ms`);
+  } finally { await rmRoot(root); }
+});
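Taken together, these assertions pin down the enqueue CLI's observable contract without needing the implementation in view. A minimal sketch of the payloads the suite implies; only the fields the tests actually read are guaranteed, so treat everything else (including the exact path shape) as an assumption:

```js
// Success payload on stdout, inferred from the happy-path assertions above.
// ok/cadence/fallback_only/path are asserted directly; `id` only appears in
// the emergency-stop test, so its presence here is an assumption.
const exampleStdout = {
  ok: true,
  cadence: "inbox-processor",
  fallback_only: false,
  id: "<event-id>",
  path: "<agent-root>/state/cadence-bus/inbox/<event-id>.json",
};

// Event file written at `path`. source, cadence, and metadata are the only
// fields the tests read back.
const exampleEvent = {
  source: "launchd",
  cadence: "inbox-processor",
  metadata: { note: "launch", k1: "v1", k2: "v2" },
};
```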
package/scripts/cadence/launchd-cadence-wrapper.sh
@@ -0,0 +1,85 @@
+#!/bin/bash
+# launchd-cadence-wrapper.sh — Bootstraps env and execs the Node cadence
+# enqueue script under launchd.
+#
+# Usage in a plist:
+#   <key>ProgramArguments</key>
+#   <array>
+#     <string>/path/to/scripts/cadence/launchd-cadence-wrapper.sh</string>
+#     <string>/path/to/scripts/cadence/enqueue-cadence-tick.mjs</string>
+#     <string>cadence-name</string>
+#     <string>--source=launchd</string>
+#   </array>
+#
+# What it does:
+#   1. Sets HOME, PATH, USER, AGENT_ROOT (launchd's bare env doesn't include them).
+#   2. Picks a node binary (nvm, homebrew, system fallback).
+#   3. Redirects stdout/stderr to logs/cadence-bus/launchd-YYYY-MM-DD.log.
+#   4. Exec's `node <enqueue-script> "$@"`.
+#
+# This is a *pure enqueue* wrapper — it must NOT spawn Claude Code. The
+# heavyweight cadence work happens inside the persistent daemon's cadence
+# consumer, not here.
+
+set -e
+
+if [ $# -lt 1 ]; then
+  echo "[cadence-wrapper] FATAL: enqueue script path required" >&2
+  exit 64
+fi
+
+ENQUEUE_SCRIPT="$1"
+shift
+
+# Walk up from the enqueue script to find the agent root. AGENT_ROOT is the
+# nearest ancestor with both package.json and config/.
+TARGET_DIR="$(cd "$(dirname "$ENQUEUE_SCRIPT")" && pwd -P)"
+CANDIDATE="$TARGET_DIR"
+AGENT_ROOT=""
+while [ "$CANDIDATE" != "/" ]; do
+  if [ -f "$CANDIDATE/package.json" ] && [ -d "$CANDIDATE/config" ]; then
+    AGENT_ROOT="$CANDIDATE"
+    break
+  fi
+  CANDIDATE="$(dirname "$CANDIDATE")"
+done
+AGENT_ROOT="${AGENT_ROOT:-$TARGET_DIR}"
+
+export AGENT_ROOT
+export AGENT_DIR="$AGENT_ROOT"
+export HOME="${HOME:-/Users/$(whoami)}"
+export USER="${USER:-$(whoami)}"
+export PATH="/opt/homebrew/bin:/opt/homebrew/sbin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:$PATH"
+
+cd "$AGENT_ROOT"
+
+# Resolve node binary — prefer nvm, fall back to homebrew, then system.
+NODE_BIN=""
+for candidate in \
+  "$HOME/.nvm/versions/node/v24.11.1/bin/node" \
+  "$HOME/.nvm/versions/node/v24/bin/node" \
+  "$HOME/.nvm/versions/node/v22/bin/node" \
+  "$HOME/.nvm/versions/node/v20/bin/node" \
+  /opt/homebrew/bin/node \
+  /usr/local/bin/node \
+  /usr/bin/node; do
+  if [ -x "$candidate" ]; then
+    NODE_BIN="$candidate"
+    break
+  fi
+done
+if [ -z "$NODE_BIN" ] && [ -d "$HOME/.nvm/versions/node" ]; then
+  NODE_BIN=$(ls -1d "$HOME/.nvm/versions/node"/v*/bin/node 2>/dev/null | sort -V | tail -1)
+fi
+if [ -z "$NODE_BIN" ] || [ ! -x "$NODE_BIN" ]; then
+  echo "[cadence-wrapper] FATAL: could not find node binary" >&2
+  exit 127
+fi
+
+LOG_DATE="$(date +%Y-%m-%d)"
+LOG_DIR="$AGENT_ROOT/logs/cadence-bus"
+mkdir -p "$LOG_DIR" 2>/dev/null || true
+LOG_FILE="$LOG_DIR/launchd-${LOG_DATE}.log"
+
+# Exec the enqueue script. stdin is closed; stdout/stderr go to the log.
+exec "$NODE_BIN" "$ENQUEUE_SCRIPT" "$@" >> "$LOG_FILE" 2>&1
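The wrapper's only contract with the Node side is the environment it exports; node resolution and log redirection are plumbing. A minimal sketch of how the enqueue script can consume that contract, assuming it resolves the agent root in the same order as `startConsumer()` in the consumer below:

```js
import { join } from "node:path";

// Assumption: mirrors the resolution order used by startConsumer() in
// cadence-consumer.mjs; the enqueue script's actual logic is not shown
// in this diff.
const agentRoot =
  process.env.AGENT_ROOT || process.env.AGENT_DIR || process.cwd();

// Inbox location as pinned by the emergency-stop test above.
const inboxDir = join(agentRoot, "state", "cadence-bus", "inbox");
```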
package/scripts/daemon/cadence-consumer.mjs
@@ -0,0 +1,439 @@
+/**
+ * Maestro — Cadence Consumer
+ *
+ * The persistent main session's drain loop for the cadence bus.
+ *
+ * Lifecycle: launched once per agent process (inside maestro-daemon.mjs).
+ * On every tick it:
+ *
+ *   1. Recovers any stale claims (events whose handler crashed mid-flight).
+ *   2. Drains inbox/, atomically claiming each event.
+ *   3. Routes the event through cadence-handlers.mjs:
+ *      - inline   → run handler in-process, complete.
+ *      - guarded  → run cheap pre-check; if "no work", complete inline,
+ *                   else escalate to a sub-session.
+ *      - escalate → spawn a sub-session running the cadence's trigger
+ *                   prompt under schedules/triggers/<name>.md.
+ *      Unknown cadences with a prompt on disk default to escalate; without
+ *      a prompt they go straight to dlq with a clear error.
+ *   4. Respects .emergency-stop: while present, the loop logs a heartbeat
+ *      but never spawns a sub-session and never processes events. Existing
+ *      claims remain on disk so they can be resumed once the stop is lifted.
+ *   5. Writes a heartbeat to state/cadence-bus/health.json on every cycle
+ *      so doctor / healthcheck can confirm liveness.
+ *
+ * The consumer never spawns more than ONE sub-session at a time on its own.
+ * That bound exists because the parent process is the single owner of
+ * cadence housekeeping; if your workflow needs higher parallelism, lean on
+ * the existing daemon dispatcher (which is purpose-built for inbox items),
+ * not on multiplying cadence consumers.
+ *
+ * Public API:
+ *   startConsumer(opts) → { stop(), getStats(), tickOnce() }
+ *
+ * Options:
+ *   agentRoot     override AGENT_ROOT (tests).
+ *   pollMs        drain interval (default 2_000).
+ *   heartbeatMs   health.json refresh interval (default 15_000).
+ *   recoveryMs    stale-claim sweep interval (default 5 * 60_000).
+ *   spawnSession  injected spawner for tests; defaults to a real
+ *                 `claude --print --dangerously-skip-permissions <prompt>`
+ *                 child_process.spawn.
+ *   maxSpawnMs    hard timeout per sub-session (default 30 * 60_000).
+ *   logger        optional fn({ ts, level, …rest }) → void for tests.
+ */
+
+import { existsSync, readFileSync, writeFileSync } from "node:fs";
+import { join } from "node:path";
+import { spawn } from "node:child_process";
+
+import {
+  ensureBusDirs,
+  claimNextTick,
+  completeTick,
+  failTick,
+  recoverStaleClaims,
+  writeHealth,
+  getBusPaths,
+  logBusEvent,
+  busDepth,
+} from "../../lib/cadence-bus.mjs";
+import { getCadenceDef } from "./cadence-handlers.mjs";
+
+// ---------------------------------------------------------------------------
+// Defaults
+// ---------------------------------------------------------------------------
+
+const DEFAULT_POLL_MS = 2_000;
+const DEFAULT_HEARTBEAT_MS = 15_000;
+const DEFAULT_RECOVERY_MS = 5 * 60_000;
+const DEFAULT_SPAWN_TIMEOUT_MS = 30 * 60_000;
+
+// Concurrency: at most one sub-session at a time per cadence consumer.
+// Cadence events are not realtime — queueing one tick behind another is
+// preferable to thrashing Claude / hitting usage limits.
+const MAX_CONCURRENT_SUB_SESSIONS = 1;
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+function isEmergencyStop(paths) {
+  return existsSync(paths.emergencyStop);
+}
+
+function defaultLogger(entry) {
+  // The default logger ALSO writes to logs/cadence-bus/<date>.jsonl via
+  // logBusEvent, but mirrors important events to stderr so launchd's
+  // StandardErrorPath captures them.
+  if (entry.level === "error" || entry.level === "warn") {
+    try { process.stderr.write(`[cadence-consumer] ${JSON.stringify(entry)}\n`); }
+    catch { /* ignore */ }
+  }
+}
+
+/**
+ * Spawn a sub-session running the cadence's trigger prompt and resolve
+ * with { ok, exit_code, duration_ms }. Reads the prompt at call time so the
+ * latest version (possibly upgraded between ticks) is always used.
+ */
+function realSpawnSession({ agentRoot, cadence, promptPath, timeoutMs, log }) {
+  return new Promise((resolveOut) => {
+    const fullPrompt = join(agentRoot, promptPath);
+    if (!existsSync(fullPrompt)) {
+      resolveOut({ ok: false, exit_code: -2, error: `prompt not found: ${promptPath}` });
+      return;
+    }
+    let body;
+    try { body = readFileSync(fullPrompt, "utf-8"); }
+    catch (err) {
+      resolveOut({ ok: false, exit_code: -3, error: `prompt read failed: ${err.message}` });
+      return;
+    }
+
+    const bin = process.env.CLAUDE_BIN || "claude";
+    const args = ["--print", "--dangerously-skip-permissions", body];
+    const env = { ...process.env, AGENT_ROOT: agentRoot, AGENT_DIR: agentRoot };
+    const started = Date.now();
+
+    log({ level: "info", stage: "subsession_spawn", cadence, bin });
+
+    let child;
+    try {
+      child = spawn(bin, args, { cwd: agentRoot, env, stdio: "ignore" });
+    } catch (err) {
+      resolveOut({ ok: false, exit_code: -4, error: `spawn failed: ${err.message}` });
+      return;
+    }
+
+    const timer = setTimeout(() => {
+      log({ level: "warn", stage: "subsession_timeout", cadence, timeout_ms: timeoutMs });
+      try { child.kill("SIGTERM"); } catch { /* ignore */ }
+      // Give it a beat to die before SIGKILL.
+      setTimeout(() => { try { child.kill("SIGKILL"); } catch { /* ignore */ } }, 5_000);
+    }, timeoutMs);
+
+    child.on("exit", (code, signal) => {
+      clearTimeout(timer);
+      const durationMs = Date.now() - started;
+      const exit_code = typeof code === "number" ? code : (signal ? -1 : -5);
+      resolveOut({
+        ok: exit_code === 0,
+        exit_code,
+        signal: signal || null,
+        duration_ms: durationMs,
+      });
+    });
+
+    child.on("error", (err) => {
+      clearTimeout(timer);
+      const durationMs = Date.now() - started;
+      resolveOut({ ok: false, exit_code: -6, error: err.message, duration_ms: durationMs });
+    });
+  });
+}
+
+// ---------------------------------------------------------------------------
+// Public API
+// ---------------------------------------------------------------------------
+
+/**
+ * Start the consumer. Returns control handles (stop, getStats, tickOnce).
+ * Caller must `await stop()` to flush state on shutdown.
+ */
+export function startConsumer(opts = {}) {
+  const agentRoot = opts.agentRoot || process.env.AGENT_ROOT || process.env.AGENT_DIR || process.cwd();
+  const paths = ensureBusDirs(agentRoot);
+
+  const pollMs = opts.pollMs ?? DEFAULT_POLL_MS;
+  const heartbeatMs = opts.heartbeatMs ?? DEFAULT_HEARTBEAT_MS;
+  const recoveryMs = opts.recoveryMs ?? DEFAULT_RECOVERY_MS;
+  const maxSpawnMs = opts.maxSpawnMs ?? DEFAULT_SPAWN_TIMEOUT_MS;
+  const spawnSession = opts.spawnSession || realSpawnSession;
+  const userLogger = opts.logger;
+
+  const stats = {
+    started_at: new Date().toISOString(),
+    received: 0,
+    inline: 0,
+    escalated: 0,
+    skipped_emergency_stop: 0,
+    dlq: 0,
+    retries: 0,
+    spawn_failures: 0,
+    last_event_id: null,
+    last_decision: null,
+  };
+
+  let stopping = false;
+  let activeTick = null;
+  let timers = [];
+  let activeSubSessions = 0;
+
+  function log(entry) {
+    const enriched = { ts: new Date().toISOString(), ...entry };
+    logBusEvent(agentRoot, enriched);
+    if (userLogger) {
+      try { userLogger(enriched); } catch { /* never crash on logging */ }
+    } else {
+      defaultLogger(enriched);
+    }
+  }
+
+  function heartbeat() {
+    writeHealth(agentRoot, {
+      stats: { ...stats, depth: busDepth(agentRoot) },
+      active_subsessions: activeSubSessions,
+      stopping,
+    });
+  }
+
+  async function escalate(event) {
+    if (activeSubSessions >= MAX_CONCURRENT_SUB_SESSIONS) {
+      // Re-queue and try again next tick. Single-owner cadence consumer
+      // means this can only happen when a prior tick is still running —
+      // queue depth is the right back-pressure signal.
+      log({
+        level: "info",
+        stage: "escalate_deferred",
+        id: event.id,
+        cadence: event.cadence,
+        active_subsessions: activeSubSessions,
+      });
+      failTick(agentRoot, event.id, "deferred:concurrent-spawn", { maxAttempts: 10 });
+      stats.retries += 1;
+      return { ok: false, decision: "deferred" };
+    }
+
+    const def = getCadenceDef(event.cadence);
+    let promptPath = def?.prompt;
+    if (!promptPath) {
+      // Unknown cadence — try the conventional location.
+      const conventional = `schedules/triggers/${event.cadence}.md`;
+      if (existsSync(join(agentRoot, conventional))) {
+        promptPath = conventional;
+        log({ level: "warn", stage: "escalate_unknown_cadence", id: event.id, cadence: event.cadence, prompt: conventional });
+      } else {
+        log({ level: "error", stage: "escalate_no_prompt", id: event.id, cadence: event.cadence });
+        failTick(agentRoot, event.id, `no handler and no prompt at ${conventional}`, { terminal: true });
+        stats.dlq += 1;
+        return { ok: false, decision: "dlq-no-prompt" };
+      }
+    }
+
+    activeSubSessions += 1;
+    let result;
+    try {
+      log({ level: "info", stage: "escalating", id: event.id, cadence: event.cadence, prompt: promptPath, reason: event.metadata?.reason || "registry policy" });
+      result = await spawnSession({
+        agentRoot,
+        cadence: event.cadence,
+        promptPath,
+        timeoutMs: maxSpawnMs,
+        log,
+      });
+    } finally {
+      activeSubSessions -= 1;
+    }
+
+    if (result.ok) {
+      completeTick(agentRoot, event.id, {
+        decision: "escalated",
+        cadence: event.cadence,
+        prompt: promptPath,
+        exit_code: result.exit_code,
+        duration_ms: result.duration_ms,
+      });
+      stats.escalated += 1;
+      stats.last_decision = "escalated";
+      return { ok: true, decision: "escalated", exit_code: result.exit_code };
+    }
+    log({ level: "error", stage: "subsession_failed", id: event.id, cadence: event.cadence, exit_code: result.exit_code, error: result.error || null });
+    stats.spawn_failures += 1;
+    const outcome = failTick(agentRoot, event.id, result.error || `exit ${result.exit_code}`);
+    if (outcome?.destination === "dlq") stats.dlq += 1;
+    else stats.retries += 1;
+    return { ok: false, decision: outcome?.destination || "failed" };
+  }
+
+  // NOTE: do not name this `process` — function declarations are hoisted
+  // and would shadow the global `process` object inside startConsumer.
+  async function processEvent(event) {
+    stats.received += 1;
+    stats.last_event_id = event.id;
+
+    const def = getCadenceDef(event.cadence);
+    if (def?.mode === "inline" && typeof def.handler === "function") {
+      try {
+        const out = await def.handler({ event, agentRoot, log });
+        completeTick(agentRoot, event.id, {
+          decision: "inline",
+          cadence: event.cadence,
+          handler_result: out,
+        });
+        stats.inline += 1;
+        stats.last_decision = "inline";
+        return { decision: "inline", result: out };
+      } catch (err) {
+        log({ level: "error", stage: "inline_handler_threw", id: event.id, cadence: event.cadence, error: err.message });
+        const outcome = failTick(agentRoot, event.id, `inline-handler-threw: ${err.message}`);
+        if (outcome?.destination === "dlq") stats.dlq += 1;
+        return { decision: "failed", error: err.message };
+      }
+    }
+
+    if (def?.mode === "guarded" && typeof def.guard === "function") {
+      let guardOut;
+      try {
+        guardOut = await def.guard({ event, agentRoot, log });
+      } catch (err) {
+        log({ level: "error", stage: "guard_threw", id: event.id, cadence: event.cadence, error: err.message });
+        failTick(agentRoot, event.id, `guard-threw: ${err.message}`);
+        stats.retries += 1;
+        return { decision: "failed", error: err.message };
+      }
+      if (guardOut?.decision === "inline") {
+        completeTick(agentRoot, event.id, {
+          decision: "inline_via_guard",
+          cadence: event.cadence,
+          guard_result: guardOut,
+        });
+        stats.inline += 1;
+        stats.last_decision = "inline_via_guard";
+        return { decision: "inline_via_guard", result: guardOut };
+      }
+      // Guard says escalate — record WHY so the archived event shows the
+      // substantive pre-check that justified spawning a sub-session. Persist
+      // the annotated metadata to claimed/<id>.json so completeTick picks it
+      // up when it archives to processed/.
+      event.metadata = {
+        ...(event.metadata || {}),
+        reason: guardOut?.reason || "guard:escalate",
+        guard: guardOut,
+      };
+      try {
+        const claimedPath = join(paths.claimed, `${event.id}.json`);
+        writeFileSync(claimedPath, JSON.stringify(event, null, 2) + "\n");
+      } catch (err) {
+        log({ level: "warn", stage: "persist_metadata_failed", id: event.id, error: err.message });
+      }
+      return escalate(event);
+    }
+
+    // No registry entry, or registry says escalate.
+    return escalate(event);
+  }
+
+  async function tickOnce() {
+    if (stopping) return { processed: 0 };
+    if (isEmergencyStop(paths)) {
+      stats.skipped_emergency_stop += 1;
+      heartbeat();
+      log({ level: "warn", stage: "emergency_stop_active" });
+      return { processed: 0, emergency_stop: true };
+    }
+    // Recover stale claims occasionally — done by the periodic timer too,
+    // but every tick is cheap because it short-circuits on empty claimed/.
+    recoverStaleClaims(agentRoot);
+
+    let processed = 0;
+    // Drain as much as the consumer can in one tick, but yield to the
+    // event loop between events so heartbeats and stop signals fire.
+    while (!stopping) {
+      const claim = claimNextTick(agentRoot);
+      if (!claim) break;
+      const event = claim.event;
+      activeTick = event.id;
+      try {
+        await processEvent(event);
+      } finally {
+        activeTick = null;
+      }
+      processed += 1;
+      if (processed >= 16) break; // soft batch cap
+    }
+    return { processed };
+  }
+
+  // Background loops
+  const pollTimer = setInterval(() => {
+    tickOnce().catch((err) => {
+      log({ level: "error", stage: "tick_threw", error: err?.message || String(err) });
+    });
+  }, pollMs);
+  pollTimer.unref?.();
+  timers.push(pollTimer);
+
+  const hbTimer = setInterval(() => heartbeat(), heartbeatMs);
+  hbTimer.unref?.();
+  timers.push(hbTimer);
+
+  const recoveryTimer = setInterval(() => {
+    try {
+      const out = recoverStaleClaims(agentRoot);
+      if (out.scanned > 0) {
+        log({ level: "info", stage: "periodic_recovery", ...out });
+      }
+    } catch (err) {
+      log({ level: "error", stage: "periodic_recovery_failed", error: err.message });
+    }
+  }, recoveryMs);
+  recoveryTimer.unref?.();
+  timers.push(recoveryTimer);
+
+  // Initial sweep + heartbeat
+  recoverStaleClaims(agentRoot);
+  heartbeat();
+  log({ level: "info", stage: "consumer_started", pollMs, heartbeatMs, recoveryMs });
+
+  async function stop() {
+    if (stopping) return;
+    stopping = true;
+    for (const t of timers) clearInterval(t);
+    timers = [];
+    log({ level: "info", stage: "consumer_stopping", active_tick: activeTick });
+    heartbeat();
+  }
+
+  function getStats() {
+    return { ...stats, depth: busDepth(agentRoot), agent_root: agentRoot };
+  }
+
+  return { stop, getStats, tickOnce, _paths: getBusPaths(agentRoot) };
+}
+
+// Allow `node scripts/daemon/cadence-consumer.mjs` to run the consumer
+// standalone (useful for development and for migrations that need to
+// drain the bus without starting the full daemon).
+if (import.meta.url === `file://${process.argv[1]}`) {
+  const consumer = startConsumer({});
+  const shutdown = async (sig) => {
+    process.stderr.write(`[cadence-consumer] caught ${sig}, stopping…\n`);
+    await consumer.stop();
+    process.exit(0);
+  };
+  process.on("SIGTERM", () => shutdown("SIGTERM"));
+  process.on("SIGINT", () => shutdown("SIGINT"));
+  // Keep the process alive — unref'd timers won't otherwise.
+  setInterval(() => {}, 60_000);
+}
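For orientation, a sketch of driving the consumer from a test harness, using only the documented options and returned handles. The fake `spawnSession` resolves the same shape `realSpawnSession` produces; the fixture path is hypothetical:

```js
import { startConsumer } from "./cadence-consumer.mjs";

// Hypothetical fixture root; ensureBusDirs() lays out the bus inside it.
const consumer = startConsumer({
  agentRoot: "/tmp/agent-fixture",
  pollMs: 100,
  // Injected spawner: resolves the shape realSpawnSession produces,
  // without ever invoking the real `claude` binary.
  spawnSession: async () => ({ ok: true, exit_code: 0, duration_ms: 5 }),
  logger: (entry) => console.error(entry),
});

await consumer.tickOnce();        // drain inbox/ once, deterministically
console.log(consumer.getStats()); // counters plus current bus depth
await consumer.stop();            // clears timers, writes a final heartbeat
```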