@adaptic/maestro 1.1.8 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/.claude/commands/init-maestro.md +304 -8
  2. package/README.md +28 -0
  3. package/bin/maestro.mjs +1 -1
  4. package/docs/guides/agents-observe-setup.md +64 -0
  5. package/docs/guides/ccxray-diagnostics.md +65 -0
  6. package/docs/guides/claude-mem-setup.md +79 -0
  7. package/docs/guides/claude-pace-setup.md +56 -0
  8. package/docs/guides/claudraband-sessions.md +98 -0
  9. package/docs/guides/clawteam-swarm.md +116 -0
  10. package/docs/guides/code-review-graph-setup.md +86 -0
  11. package/docs/guides/self-optimization-pattern.md +82 -0
  12. package/docs/guides/slack-setup.md +4 -2
  13. package/docs/guides/twilio-subaccounts-setup.md +223 -0
  14. package/docs/guides/webhook-relay-setup.md +349 -0
  15. package/package.json +2 -1
  16. package/plugins/maestro-skills/plugin.json +16 -0
  17. package/plugins/maestro-skills/skills/agents-observe.md +110 -0
  18. package/plugins/maestro-skills/skills/ccxray-diagnostics.md +91 -0
  19. package/plugins/maestro-skills/skills/claude-pace.md +61 -0
  20. package/plugins/maestro-skills/skills/code-review-graph.md +99 -0
  21. package/scaffold/CLAUDE.md +64 -0
  22. package/scaffold/config/agent.ts.example +2 -1
  23. package/scaffold/config/known-agents.json +35 -0
  24. package/scripts/daemon/classifier.mjs +264 -50
  25. package/scripts/daemon/dispatcher.mjs +109 -5
  26. package/scripts/daemon/launchd-wrapper-generic.sh +96 -0
  27. package/scripts/daemon/launchd-wrapper-slack-events.sh +37 -0
  28. package/scripts/daemon/launchd-wrapper.sh +91 -0
  29. package/scripts/daemon/lib/session-router.mjs +274 -0
  30. package/scripts/daemon/lib/session-router.test.mjs +295 -0
  31. package/scripts/daemon/prompt-builder.mjs +51 -11
  32. package/scripts/daemon/responder.mjs +234 -19
  33. package/scripts/daemon/session-lock.mjs +194 -0
  34. package/scripts/daemon/sophie-daemon.mjs +16 -2
  35. package/scripts/email-signature.html +20 -4
  36. package/scripts/local-triggers/generate-plists.sh +62 -10
  37. package/scripts/poller/imap-client.mjs +4 -2
  38. package/scripts/poller/slack-poller.mjs +104 -52
  39. package/scripts/setup/init-agent.sh +91 -1
  40. package/scripts/setup/install-dev-tools.sh +150 -0
  41. package/scripts/spawn-session.sh +21 -6
  42. package/workflows/continuous/backlog-executor.yaml +141 -0
  43. package/workflows/daily/evening-wrap.yaml +41 -1
  44. package/workflows/daily/morning-brief.yaml +17 -0
  45. package/workflows/event-driven/agent-failure-investigation.yaml +137 -0
  46. package/workflows/event-driven/pr-review.yaml +104 -0
  47. package/workflows/weekly/engineering-health.yaml +154 -0
@@ -5,7 +5,8 @@
5
5
  import { spawn } from "child_process";
6
6
  import { appendFileSync, mkdirSync, writeFileSync, readFileSync, renameSync } from "fs";
7
7
  import { join } from "path";
8
- import { releaseLock, releaseThreadLock, releaseRequestClaim } from "./session-lock.mjs";
8
+ import { releaseLock, releaseThreadLock, releaseRequestClaim, claimItem, releaseItemClaim } from "./session-lock.mjs";
9
+ import { recordSession } from "./health.mjs";
9
10
 
10
11
  const SOPHIE_AI_DIR = join(new URL(".", import.meta.url).pathname, "../..");
11
12
  const CLAUDE_BIN = process.env.CLAUDE_BIN || "/Users/sophie/.local/bin/claude";
@@ -27,6 +28,12 @@ const priorityQueue = []; // critical/high items
27
28
  const normalQueue = []; // normal items
28
29
  let sessionCounter = 0;
29
30
 
31
+ // Tracks sessions whose proc.on("error") handler has already fired.
32
+ // Prevents double-counting + double-cleanup when a spawn failure (ENOENT,
33
+ // EACCES, ETIMEDOUT) triggers both "error" and a trailing "close" event.
34
+ // See ib-20260416-daemon-etimedout-failed-event + cycle 135 memo.
35
+ const spawnErrorHandled = new Set();
36
+
30
37
  // Backlog dedup: track which items have active sessions to prevent retry storms
31
38
  const activeBacklogKeys = new Set(); // backlog item key -> true (while session is running)
32
39
  const backlogRetryCount = new Map(); // backlog item key -> number of times dispatched
@@ -196,6 +203,26 @@ export function dispatch(prompt, item, classResult, source = "inbox") {
196
203
  }
197
204
  }
198
205
 
206
+ // Item-claim acquisition: file-based claim visible across daemon restarts
207
+ // and concurrent launchd triggers. Complements the in-memory activeBacklogKeys.
208
+ // (ib-20260407-001b: concurrent session coordination)
209
+ if (source === "backlog" && item.id) {
210
+ const sessionId = `s-${Date.now()}-${sessionCounter + 1}`;
211
+ const claim = claimItem(item.id, {
212
+ session_id: sessionId,
213
+ agent_description: classResult.summary || item.title || "",
214
+ ttl_minutes: classResult.model === "opus" ? 120 : 30,
215
+ source: "backlog",
216
+ queue_file: item.source_file || "",
217
+ pid: process.pid, // daemon PID; child PID not yet known
218
+ });
219
+ if (!claim.claimed) {
220
+ console.log(`[dispatcher] Item claim denied for ${item.id}: ${claim.reason} (holder: ${claim.holder || "unknown"})`);
221
+ logSession({ event: "skipped", reason: `item_claim_denied: ${claim.reason}`, summary: classResult.summary, holder: claim.holder });
222
+ return;
223
+ }
224
+ }
225
+
199
226
  // Backlog items respect the reserved slot cap
200
227
  if (source === "backlog") {
201
228
  const backlogCount = countBySource("backlog");
@@ -250,9 +277,13 @@ function spawnSession(entry) {
250
277
  prompt,
251
278
  ];
252
279
 
280
+ // Strip Anthropic API credentials from spawn env so claude CLI falls through
281
+ // to the keychain OAuth (Max subscription) per CEO directive 2026-04-27.
282
+ // A stale ANTHROPIC_API_KEY in the daemon's inherited env will otherwise
283
+ // override the OAuth token and cause "Invalid API key" failures.
253
284
  const proc = spawn(CLAUDE_BIN, args, {
254
285
  cwd: SOPHIE_AI_DIR,
255
- env: { ...process.env },
286
+ env: { ...process.env, ANTHROPIC_API_KEY: "", ANTHROPIC_AUTH_TOKEN: "" },
256
287
  stdio: ["ignore", "pipe", "pipe"],
257
288
  });
258
289
 
@@ -287,8 +318,17 @@ function spawnSession(entry) {
287
318
 
288
319
  proc.on("close", (code) => {
289
320
  clearTimeout(timer);
321
+ // If proc.on("error") already fired for this session (spawn failure path
322
+ // — ENOENT, EACCES, ETIMEDOUT), cleanup + metric + lock release already
323
+ // happened. Skip to avoid double-count and double-release.
324
+ if (spawnErrorHandled.has(sessionId)) {
325
+ spawnErrorHandled.delete(sessionId);
326
+ drainQueue();
327
+ return;
328
+ }
290
329
  activeSessions.delete(sessionId);
291
330
  removeActiveSession(sessionId);
331
+ recordSession(true, code === 0);
292
332
  const duration = ((Date.now() - startTime) / 1000).toFixed(1);
293
333
 
294
334
  // Release item lock — MUST use same key order as acquireLock in daemon
@@ -312,12 +352,15 @@ function spawnSession(entry) {
312
352
  });
313
353
  }
314
354
 
315
- // Release backlog tracking
355
+ // Release backlog tracking + item claim
316
356
  if (source === "backlog") {
317
357
  const key = backlogKey(item);
318
358
  activeBacklogKeys.delete(key);
319
359
  const retries = backlogRetryCount.get(key) || 0;
320
360
 
361
+ // Release file-based item claim (ib-20260407-001b)
362
+ if (item.id) releaseItemClaim(item.id);
363
+
321
364
  // If session timed out (143=SIGTERM) and hit retry limit, log it
322
365
  if (code === 143 && retries >= MAX_BACKLOG_RETRIES) {
323
366
  console.warn(`[dispatcher] Backlog item "${classResult.summary}" exhausted ${MAX_BACKLOG_RETRIES} retries — will not retry`);
@@ -347,9 +390,70 @@ function spawnSession(entry) {
347
390
 
348
391
  proc.on("error", (err) => {
349
392
  clearTimeout(timer);
393
+ // Mark so the trailing proc.on("close") doesn't double-process.
394
+ spawnErrorHandled.add(sessionId);
350
395
  activeSessions.delete(sessionId);
351
- console.error(`[dispatcher] Session ${sessionId} error: ${err.message}`);
352
- logSession({ event: "error", sessionId, error: err.message });
396
+ removeActiveSession(sessionId);
397
+ recordSession(true, false);
398
+ const duration = ((Date.now() - startTime) / 1000).toFixed(1);
399
+ const errorCode = err.code || "unknown";
400
+
401
+ // Release item lock — mirror of close-handler logic.
402
+ const itemId = item.raw_ref || item.id || item.title;
403
+ if (itemId) releaseLock(itemId);
404
+
405
+ // Release thread lock so new messages in this thread can be processed.
406
+ if (item.thread_id) {
407
+ const channel = item.channel_id || (item.raw_ref ? (item.raw_ref.match(/slack:([^:]+):/) || [])[1] : null) || item.channel;
408
+ if (channel) releaseThreadLock(channel, item.thread_id);
409
+ }
410
+
411
+ // Release request claim + emit explicit claim_released event so
412
+ // reconciliation audits (cycle 124 Agent C pattern) can distinguish
413
+ // genuine in-flight sessions from silent-exit ETIMEDOUT failures
414
+ // without cross-referencing logs/daemon/responses.jsonl.
415
+ let claimReleased = false;
416
+ if (classResult && classResult.summary) {
417
+ releaseRequestClaim({
418
+ recipient: item.channel_id || item.channel || item.sender || "unknown",
419
+ subject: classResult.summary || item.subject || "",
420
+ action_type: classResult.action || "respond",
421
+ });
422
+ claimReleased = true;
423
+ }
424
+
425
+ // Release backlog tracking + item claim.
426
+ if (source === "backlog") {
427
+ const key = backlogKey(item);
428
+ activeBacklogKeys.delete(key);
429
+ // Release file-based item claim (ib-20260407-001b)
430
+ if (item.id) releaseItemClaim(item.id);
431
+ }
432
+
433
+ console.error(`[dispatcher] Session ${sessionId} failed: ${errorCode} (${err.message})`);
434
+
435
+ // Rich "failed" event — see ib-20260416-daemon-etimedout-failed-event.
436
+ logSession({
437
+ event: "failed",
438
+ sessionId,
439
+ error: errorCode,
440
+ error_message: err.message,
441
+ model,
442
+ source,
443
+ priority: classResult?.priority,
444
+ summary: classResult?.summary,
445
+ duration_s: parseFloat(duration),
446
+ active_count: activeSessions.size,
447
+ });
448
+
449
+ if (claimReleased) {
450
+ logSession({
451
+ event: "claim_released",
452
+ sessionId,
453
+ reason: `spawn_failed_${errorCode}`,
454
+ });
455
+ }
456
+
353
457
  drainQueue();
354
458
  });
355
459
  }
@@ -0,0 +1,96 @@
1
#!/bin/bash
# launchd-wrapper-generic.sh — Universal env bootstrap for ANY maestro
# script spawned under launchd.
#
# Usage in a plist:
#   <key>ProgramArguments</key>
#   <array>
#     <string>/path/to/scripts/daemon/launchd-wrapper-generic.sh</string>
#     <string>/path/to/script-to-run.sh</string>
#     <string>arg1</string>
#     <string>arg2</string>
#   </array>
#
# What it does:
#   1. Sets HOME, PATH, USER, AGENT_ROOT (launchd's bare env doesn't include them)
#   2. Detects an external SSD, creates the per-agent dirs on it, and exports
#      CLAUDE_CODE_TMPDIR + MAESTRO_SSD_AGENT_ROOT when the SSD is writable
#   3. Redirects stdout/stderr to a date-stamped log file — on the SSD when
#      writable, otherwise on the internal disk
#   4. Exec's the target script (via bash) with all remaining args
#
# NOTE(review): unlike launchd-wrapper.sh, this wrapper does NOT resolve a
# node binary — targets are run with bash and must locate node themselves.
# (An earlier version of this header claimed node selection; no such code
# exists below.)
#
# Why a wrapper instead of a plist EnvironmentVariables block?
# Putting these env vars directly in the plist has been observed to cause
# EX_CONFIG (78) failures on macOS Sequoia + when symlinks are involved.
# A wrapper script is more portable and easier to debug.

set -e

# launchd gives no argv sanity — fail loudly with EX_USAGE (64) if the plist
# forgot to pass the target script.
if [ $# -lt 1 ]; then
  echo "[wrapper] FATAL: no target script provided" >&2
  exit 64
fi

TARGET="$1"
shift

# Resolve AGENT_ROOT by walking UP from the target script's directory until a
# directory containing both package.json and config/ is found (the agent repo
# root). E.g. /Users/lucas/lucas-ai/scripts/local-triggers/run-trigger.sh
# → /Users/lucas/lucas-ai
TARGET_DIR="$(cd "$(dirname "$TARGET")" && pwd -P)"
CANDIDATE="$TARGET_DIR"
while [ "$CANDIDATE" != "/" ]; do
  if [ -f "$CANDIDATE/package.json" ] && [ -d "$CANDIDATE/config" ]; then
    AGENT_ROOT="$CANDIDATE"
    break
  fi
  CANDIDATE="$(dirname "$CANDIDATE")"
done
# No marker found anywhere up the tree → fall back to the script's own dir.
AGENT_ROOT="${AGENT_ROOT:-$TARGET_DIR}"

export AGENT_ROOT
export HOME="${HOME:-/Users/$(whoami)}"
export USER="${USER:-$(whoami)}"
export PATH="/opt/homebrew/bin:/opt/homebrew/sbin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:$PATH"

# ── SSD detection ───────────────────────────────────────────────────────────
# An explicit MAESTRO_SSD_VOLUME override wins; otherwise probe common
# external-volume names, excluding the internal "Macintosh HD" mount.
SSD_VOLUME="${MAESTRO_SSD_VOLUME:-}"
if [ -z "$SSD_VOLUME" ]; then
  for v in /Volumes/*-SSD /Volumes/*SSD* /Volumes/maestro-data; do
    if [ -d "$v" ] && [ "$v" != "/Volumes/Macintosh HD" ]; then
      SSD_VOLUME="$v"
      break
    fi
  done
fi

# Agent name = repo dir name minus the conventional "-ai" suffix.
AGENT_NAME="$(basename "$AGENT_ROOT" | sed 's/-ai$//')"
SSD_AGENT_ROOT=""
SSD_WRITABLE=0
if [ -n "$SSD_VOLUME" ] && [ -d "$SSD_VOLUME" ]; then
  SSD_AGENT_ROOT="$SSD_VOLUME/maestro/$AGENT_NAME"
  # Prove writability with a real touch — macOS can mount the volume yet
  # still deny launchd-spawned processes write access to it.
  if mkdir -p "$SSD_AGENT_ROOT/claude-tmp" "$SSD_AGENT_ROOT/logs/launchd" 2>/dev/null && \
    touch "$SSD_AGENT_ROOT/.write-test-$$" 2>/dev/null; then
    rm -f "$SSD_AGENT_ROOT/.write-test-$$"
    SSD_WRITABLE=1
    export CLAUDE_CODE_TMPDIR="$SSD_AGENT_ROOT/claude-tmp"
    export MAESTRO_SSD_AGENT_ROOT="$SSD_AGENT_ROOT"
  fi
fi

cd "$AGENT_ROOT"

# Log file name is derived from the target's basename sans extension.
SCRIPT_NAME="$(basename "$TARGET" | sed 's/\.[^.]*$//')"
LOG_DATE="$(date +%Y-%m-%d)"

if [ "$SSD_WRITABLE" = "1" ]; then
  LOG_FILE="$SSD_AGENT_ROOT/logs/launchd/${SCRIPT_NAME}-${LOG_DATE}.log"
  echo "[wrapper $(date -u +%H:%M:%SZ)] starting $SCRIPT_NAME (SSD log: $LOG_FILE)" >> "$LOG_FILE" 2>/dev/null || true
  exec bash "$TARGET" "$@" >> "$LOG_FILE" 2>&1
else
  # Fall back to internal disk log if SSD isn't writable (e.g. macOS denies launchd
  # write access to external volumes until the user grants it via System Settings).
  FALLBACK_LOG="$AGENT_ROOT/logs/launchd/${SCRIPT_NAME}-${LOG_DATE}.log"
  mkdir -p "$(dirname "$FALLBACK_LOG")" 2>/dev/null || true
  exec bash "$TARGET" "$@" >> "$FALLBACK_LOG" 2>&1
fi
@@ -0,0 +1,37 @@
1
#!/bin/bash
# launchd-wrapper-slack-events.sh — Bootstraps env for the slack-events server.
#
# launchd starts children with an almost-empty environment (no HOME, PATH, or
# USER), so we reconstruct it, optionally steer Claude Code's temp dir to the
# external SSD, resolve a node binary (nvm → homebrew → system), and exec the
# server.
set -e
AGENT_ROOT="$(cd "$(dirname "$0")/../.." && pwd -P)"
export AGENT_ROOT
export HOME="${HOME:-/Users/$(whoami)}"
export USER="${USER:-$(whoami)}"
export PATH="/opt/homebrew/bin:/opt/homebrew/sbin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:$PATH"
if [ -d "/Volumes/4TB-SSD" ]; then
  AGENT_NAME="$(basename "$AGENT_ROOT" | sed 's/-ai$//')"
  CLAUDE_TMP_DIR="/Volumes/4TB-SSD/maestro/$AGENT_NAME/claude-tmp"
  # BUGFIX: the mkdir must be guarded. Under `set -e`, an unguarded failure
  # (SSD mounted read-only, or macOS denying launchd write access to external
  # volumes) aborted the whole wrapper and the server never started. Mirror
  # the write-safe pattern used by launchd-wrapper.sh / launchd-wrapper-generic.sh:
  # only export CLAUDE_CODE_TMPDIR when the directory is actually usable.
  if mkdir -p "$CLAUDE_TMP_DIR" 2>/dev/null; then
    export CLAUDE_CODE_TMPDIR="$CLAUDE_TMP_DIR"
  fi
fi
cd "$AGENT_ROOT"
# Resolve node: pinned nvm versions first, then homebrew, then system.
NODE_BIN=""
for candidate in \
  "$HOME/.nvm/versions/node/v24.11.1/bin/node" \
  "$HOME/.nvm/versions/node/v24/bin/node" \
  "$HOME/.nvm/versions/node/v22/bin/node" \
  "$HOME/.nvm/versions/node/v20/bin/node" \
  /opt/homebrew/bin/node \
  /usr/local/bin/node \
  /usr/bin/node; do
  if [ -x "$candidate" ]; then
    NODE_BIN="$candidate"
    break
  fi
done
# Last resort: highest-versioned nvm node present on disk.
if [ -z "$NODE_BIN" ] && [ -d "$HOME/.nvm/versions/node" ]; then
  NODE_BIN=$(ls -1d "$HOME/.nvm/versions/node"/v*/bin/node 2>/dev/null | sort -V | tail -1)
fi
if [ -z "$NODE_BIN" ] || [ ! -x "$NODE_BIN" ]; then
  echo "[wrapper] FATAL: could not find node binary" >&2
  exit 127
fi
exec "$NODE_BIN" "$AGENT_ROOT/scripts/slack-events-server.mjs"
@@ -0,0 +1,91 @@
1
#!/bin/bash
# launchd-wrapper.sh — env bootstrap that launches the maestro daemon.
#
# launchd hands its children an almost-empty environment (no HOME, PATH, or
# AGENT_ROOT), so this script reconstructs it and then exec's the daemon.
# Keeping the setup here — rather than in an EnvironmentVariables block in
# the .plist — sidesteps the EX_CONFIG (78) failures observed on some macOS
# versions when symlinks are involved.
#
# Storage: when an external SSD is mounted under /Volumes, daemon runtime
# data is steered there. The plist's StandardOutPath/StandardErrorPath stay
# on the internal disk (launchd refuses external volumes for those) and only
# capture launchd-level startup errors; the daemon's own stdout/stderr is
# shell-redirected at the bottom of this script into
# /Volumes/{SSD_NAME}/maestro/{agent}/logs/daemon/.
#
# This wrapper is exec'd by ai.adaptic.{firstname}-daemon.plist.

set -e

AGENT_ROOT="$(cd "$(dirname "$0")/../.." && pwd -P)"
export AGENT_ROOT
export HOME="${HOME:-/Users/$(whoami)}"
export USER="${USER:-$(whoami)}"
export PATH="/opt/homebrew/bin:/opt/homebrew/sbin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:$PATH"

# ── SSD redirect ────────────────────────────────────────────────────────────
# Honour an explicit MAESTRO_SSD_VOLUME override; otherwise probe the usual
# external-volume names, skipping the internal "Macintosh HD" mount.
SSD_VOLUME="${MAESTRO_SSD_VOLUME:-}"
if [ -z "$SSD_VOLUME" ]; then
  for vol in /Volumes/*-SSD /Volumes/*SSD* /Volumes/maestro-data; do
    if [ -d "$vol" ] && [ "$vol" != "/Volumes/Macintosh HD" ]; then
      SSD_VOLUME="$vol"
      break
    fi
  done
fi

# Agent name = repo dir name minus the conventional "-ai" suffix.
AGENT_NAME="$(basename "$AGENT_ROOT" | sed 's/-ai$//')"

# Prove the SSD is actually writable (macOS may mount it yet still deny
# launchd-spawned processes write access) before pointing Claude Code's
# temp dir at it.
SSD_AGENT_ROOT=""
SSD_WRITABLE=0
if [ -n "$SSD_VOLUME" ] && [ -d "$SSD_VOLUME" ]; then
  SSD_AGENT_ROOT="$SSD_VOLUME/maestro/$AGENT_NAME"
  if mkdir -p "$SSD_AGENT_ROOT/claude-tmp" "$SSD_AGENT_ROOT/logs/daemon" 2>/dev/null && \
    touch "$SSD_AGENT_ROOT/.write-test-$$" 2>/dev/null; then
    rm -f "$SSD_AGENT_ROOT/.write-test-$$"
    SSD_WRITABLE=1
    export CLAUDE_CODE_TMPDIR="$SSD_AGENT_ROOT/claude-tmp"
  fi
fi

cd "$AGENT_ROOT"

# ── node resolution ─────────────────────────────────────────────────────────
# Pinned nvm installs first, then homebrew, then the system binary.
NODE_BIN=""
for node_candidate in \
  "$HOME/.nvm/versions/node/v24.11.1/bin/node" \
  "$HOME/.nvm/versions/node/v24/bin/node" \
  "$HOME/.nvm/versions/node/v22/bin/node" \
  "$HOME/.nvm/versions/node/v20/bin/node" \
  /opt/homebrew/bin/node \
  /usr/local/bin/node \
  /usr/bin/node; do
  if [ -x "$node_candidate" ]; then
    NODE_BIN="$node_candidate"
    break
  fi
done
# Last resort: highest-versioned nvm node present on disk.
if [ -z "$NODE_BIN" ] && [ -d "$HOME/.nvm/versions/node" ]; then
  NODE_BIN=$(ls -1d "$HOME/.nvm/versions/node"/v*/bin/node 2>/dev/null | sort -V | tail -1)
fi
if [ -z "$NODE_BIN" ] || [ ! -x "$NODE_BIN" ]; then
  echo "[wrapper] FATAL: could not find node binary" >&2
  exit 127
fi

# ── launch ──────────────────────────────────────────────────────────────────
# Pick the log destination once, then exec with a single redirect: SSD when
# writable, internal disk otherwise — so the daemon still starts when macOS
# blocks writes to /Volumes/{name}.
if [ "$SSD_WRITABLE" = "1" ]; then
  DAEMON_LOG="$SSD_AGENT_ROOT/logs/daemon/daemon-$(date +%Y-%m-%d).log"
else
  DAEMON_LOG="$AGENT_ROOT/logs/daemon/daemon-$(date +%Y-%m-%d).log"
  mkdir -p "$(dirname "$DAEMON_LOG")" 2>/dev/null || true
fi
exec "$NODE_BIN" "$AGENT_ROOT/scripts/daemon/maestro-daemon.mjs" >> "$DAEMON_LOG" 2>&1
@@ -0,0 +1,274 @@
1
/**
 * session-router.mjs — Routes daemon Claude CLI invocations to either an
 * existing live session (RESUME) or a fresh ephemeral spawn (EPHEMERAL).
 *
 * Per design memo `outputs/drafts/2026-04-27-claude-cli-session-router.md`
 * §4 (architecture) and §5 (migration plan, step 2). This module is
 * scaffold-only and is NOT yet wired into dispatcher.mjs / responder.mjs.
 * It is intended to ship behind a SESSION_ROUTER_ENABLED=1 env flag
 * (memo §8 step 4) so its mere existence cannot alter daemon behaviour.
 *
 * Public API:
 *   - routingKey(item)  — pure function, derives a stable conversation key
 *   - createRouter(opts) — async factory returning { route, touch,
 *     recordExit, evictExpired, _readForTests }
 *
 * Registry on disk: a single JSON file with shape
 *   { sessions: { <key>: { ... } }, lru: [ <key>, <key>, ... ] }
 * Concurrent writes use the temp-file + fs.rename atomic pattern. Memo §4.1
 * explicitly rejects flock — "no stale lock files if the daemon dies".
 */

import { promises as fsp } from "fs";
import { dirname } from "path";

// Optional Slack-channel canonicaliser. Some installs of
// scripts/daemon/session-lock.mjs export canonicalizeSlackChannel; in plain
// maestro it may be absent (sophie-ai-only helper). Start from the identity
// function and upgrade only when the real helper is importable, so neither a
// failing import nor a missing export can ever break the daemon. Resolution
// happens once, at module load — never per call.
let canonicalizeSlackChannel = (c) => c;
try {
  const lockMod = await import("../session-lock.mjs");
  if (typeof lockMod.canonicalizeSlackChannel === "function") {
    canonicalizeSlackChannel = lockMod.canonicalizeSlackChannel;
  }
} catch {
  // TODO(session-router): session-lock.mjs unavailable — identity stays in
  // place. Fix the import or introduce a shared canonicalisation helper.
}

const EMPTY_REGISTRY = () => ({ sessions: {}, lru: [] });

/**
 * Pure key-derivation function (memo §4.2 table).
 *
 * Key shapes by source:
 *   slack    → thread > DM channel > channel+ts single-message bucket
 *   gmail    → thread id
 *   calendar → event id
 *   internal → item id
 *   backlog  → topic slug, else item id (under the "internal:" namespace)
 *
 * @param {object} item Inbound queue/inbox item
 * @returns {string} Routing key, e.g. "slack:D099N1JGKRQ:1777283277.123456"
 * @throws {TypeError} When item is not an object.
 * @throws {Error} When the source is unknown or a required field is missing.
 */
export function routingKey(item) {
  if (!item || typeof item !== "object") {
    throw new TypeError("routingKey: item must be an object");
  }
  const source = item.source;

  switch (source) {
    case "slack": {
      const channel = canonicalizeSlackChannel(item.channel || item.channel_id || "");
      if (!channel) throw new Error("routingKey: slack item missing channel");
      // Threaded message → one conversation per thread.
      if (item.thread_ts) return `slack:${channel}:${item.thread_ts}`;
      // Un-threaded DM ("D…" channel) → one conversation per DM channel.
      if (typeof channel === "string" && channel.startsWith("D")) {
        return `slack:${channel}`;
      }
      // Un-threaded channel message → per-message bucket (effectively ephemeral).
      return `slack:${channel}:${item.ts || item.event_ts || ""}`;
    }
    case "gmail": {
      const threadId = item.thread_id || item.threadId;
      if (!threadId) throw new Error("routingKey: gmail item missing thread_id");
      return `gmail:${threadId}`;
    }
    case "calendar": {
      const eventId = item.event_id || item.eventId;
      if (!eventId) throw new Error("routingKey: calendar item missing event_id");
      return `calendar:${eventId}`;
    }
    case "internal": {
      if (!item.id) throw new Error("routingKey: internal item missing id");
      return `internal:${item.id}`;
    }
    case "backlog": {
      if (item.topic_slug) return `backlog:${item.topic_slug}`;
      // NOTE(review): id-only backlog items share the "internal:" namespace —
      // presumably so a backlog item resumes its originating internal
      // session; confirm against the memo's §4.2 table.
      if (item.id) return `internal:${item.id}`;
      throw new Error("routingKey: backlog item missing topic_slug and id");
    }
    default:
      throw new Error(`routingKey: unknown source "${source}"`);
  }
}
93
+
94
/**
 * Read the session registry from disk.
 *
 * Degrades to an empty registry shape on ANY read failure — missing file,
 * unreadable file, or corrupted JSON (memo §6: "Read failure on the
 * main file falls back to an empty registry"). The previous explicit ENOENT
 * branch was redundant: both arms of the catch returned the same empty
 * shape, so the error-code check was dead differentiation.
 *
 * @param {string} path Registry JSON file path.
 * @returns {Promise<{sessions: object, lru: Array}>} Parsed registry with
 *   `sessions`/`lru` coerced to the expected shapes.
 */
async function readRegistry(path) {
  try {
    const parsed = JSON.parse(await fsp.readFile(path, "utf-8"));
    // Defensive shape repair: tolerate partially-written or hand-edited files.
    if (!parsed || typeof parsed !== "object") return { sessions: {}, lru: [] };
    if (!parsed.sessions || typeof parsed.sessions !== "object") parsed.sessions = {};
    if (!Array.isArray(parsed.lru)) parsed.lru = [];
    return parsed;
  } catch {
    // ENOENT, EACCES, JSON SyntaxError — all degrade to the empty registry.
    return { sessions: {}, lru: [] };
  }
}
113
+
114
/**
 * Persist the registry with an atomic write.
 *
 * Serializes to a process/time-unique sibling temp file, then fs.rename()s
 * it over the target — readers observe either the old file or the new one,
 * never a partial write. Memo §4.1 specifies this exact pattern instead of
 * flock to avoid stale-lock pathologies. The parent directory is created on
 * demand.
 *
 * @param {string} path Destination registry file.
 * @param {object} registry Registry object to serialize (pretty-printed).
 */
async function writeRegistryAtomic(path, registry) {
  await fsp.mkdir(dirname(path), { recursive: true });
  const payload = JSON.stringify(registry, null, 2);
  const tmp = `${path}.tmp.${process.pid}.${Date.now()}`;
  await fsp.writeFile(tmp, payload, "utf-8");
  await fsp.rename(tmp, path);
}
125
+
126
/**
 * Promote `key` to most-recently-used.
 *
 * Mutates `lru` in place: removes the first existing occurrence of `key`
 * (if any), then appends it — index 0 stays the oldest entry and the last
 * index the newest.
 *
 * @param {Array} lru Key list ordered oldest → newest.
 * @param {string} key Key to promote.
 */
function bumpLru(lru, key) {
  const existingIdx = lru.indexOf(key);
  if (existingIdx !== -1) {
    lru.splice(existingIdx, 1);
  }
  lru.push(key);
}
134
+
135
/**
 * Create a session router bound to a registry path.
 *
 * The registry is loaded from disk ONCE here; route() then works against the
 * in-memory copy, while touch()/recordExit()/evictExpired() persist it back
 * via writeRegistryAtomic(). NOTE(review): nothing re-reads the file after
 * this initial load, so a concurrent writer in another process would be
 * clobbered on our next write — presumably acceptable for a single-daemon
 * deployment; confirm against the memo.
 *
 * @param {object} opts
 * @param {string} opts.registryPath Path to the JSON registry file.
 * @param {number} [opts.ttlSeconds=1800] Idle TTL (memo §4.3 default 30m).
 * @param {number} [opts.maxLiveSessions=8] LRU cap (memo §4.3, matches MAX_CLAUDE_PROCS).
 * @param {() => number} [opts.now] Injectable clock for tests.
 * @returns {Promise<object>} { route, touch, recordExit, evictExpired, _readForTests }
 * @throws {Error} If registryPath is omitted.
 */
export async function createRouter({
  registryPath,
  ttlSeconds = 1800,
  maxLiveSessions = 8,
  now = () => Date.now(),
} = {}) {
  if (!registryPath) {
    throw new Error("createRouter: registryPath is required");
  }

  // Eagerly load so first call doesn't race with concurrent writers.
  let registry = await readRegistry(registryPath);

  /**
   * Decide RESUME / EPHEMERAL / EPHEMERAL_REPLACE for a key (memo §4.4).
   *
   *   - no entry for key                         → EPHEMERAL (spawn fresh)
   *   - entry expired, not "live", or last exit
   *     was non-zero                             → EPHEMERAL_REPLACE
   *   - healthy entry                            → RESUME with its claude_session_id
   *
   * On RESUME, last_used_at and the LRU position are bumped IN MEMORY only;
   * they reach disk on the next touch()/recordExit() write.
   *
   * @param {string} key
   * @returns {{ decision: "EPHEMERAL"|"EPHEMERAL_REPLACE"|"RESUME", resumeId: string|null }}
   */
  function route(key) {
    const entry = registry.sessions[key];
    if (!entry) return { decision: "EPHEMERAL", resumeId: null };

    const ttlMs = ttlSeconds * 1000;
    // Idle past the TTL → stale; replace rather than resume.
    if (now() - entry.last_used_at > ttlMs) {
      return { decision: "EPHEMERAL_REPLACE", resumeId: null };
    }
    // recordExit() flips status to "killed" on a non-zero exit.
    if (entry.status !== "live") {
      return { decision: "EPHEMERAL_REPLACE", resumeId: null };
    }
    // Strict !== 0 also rejects null/undefined exit codes.
    if (entry.last_exit_code !== 0) {
      return { decision: "EPHEMERAL_REPLACE", resumeId: null };
    }

    // Touch in-memory; persisted on next touch()/recordExit() write.
    entry.last_used_at = now();
    bumpLru(registry.lru, key);
    return { decision: "RESUME", resumeId: entry.claude_session_id };
  }

  /**
   * Insert or refresh a session entry after a successful spawn, then persist.
   * Enforces the LRU cap by evicting the oldest entries when over capacity.
   *
   * NOTE(review): daemon_session_id/model fall back with `||` (an empty
   * string yields the previous value) while claude_session_id uses `??`
   * (an empty string is kept) — confirm the asymmetry is intentional.
   *
   * @param {string} key
   * @param {object} info
   * @param {string} info.claudeSessionId CLI-resolved session id from JSON stdout.
   * @param {string} [info.daemonSessionId] Daemon-local id (s-<epoch>-<n>).
   * @param {string} [info.model] "sonnet" | "opus" | "haiku" | etc.
   */
  async function touch(key, { claudeSessionId, daemonSessionId, model } = {}) {
    const ts = now();
    const isoNow = new Date(ts).toISOString();
    const existing = registry.sessions[key];

    // Rebuild the entry wholesale: created_at survives from any previous
    // entry; status and last_exit_code reset to healthy defaults.
    const entry = {
      daemon_session_id: daemonSessionId || existing?.daemon_session_id || null,
      claude_session_id: claudeSessionId ?? existing?.claude_session_id ?? null,
      key,
      model: model || existing?.model || null,
      created_at: existing?.created_at || isoNow,
      last_used_at: ts,
      ttl_seconds: ttlSeconds,
      status: "live",
      last_exit_code: 0,
    };
    registry.sessions[key] = entry;
    bumpLru(registry.lru, key);

    // Enforce LRU cap. The lru array stores keys ordered oldest → newest.
    // `evictKey !== key` avoids deleting the entry just written; bumpLru()
    // put `key` at the back, so it can only be shifted out in the degenerate
    // maxLiveSessions=0 case (where the sweep below drops it anyway).
    while (registry.lru.length > maxLiveSessions) {
      const evictKey = registry.lru.shift();
      if (evictKey && evictKey !== key) {
        delete registry.sessions[evictKey];
      }
    }
    // Defensive: drop sessions that fell out of the lru array entirely.
    for (const k of Object.keys(registry.sessions)) {
      if (!registry.lru.includes(k)) delete registry.sessions[k];
    }

    await writeRegistryAtomic(registryPath, registry);
  }

  /**
   * Record process exit for a key, then persist. A non-zero exit code flips
   * status to "killed" so the next route() returns EPHEMERAL_REPLACE; a
   * clean (0) exit leaves the entry resumable.
   *
   * @param {string} key
   * @param {number|null} exitCode Exit code as reported by the caller
   *   (anything !== 0, including null, marks the session killed).
   */
  async function recordExit(key, exitCode) {
    const entry = registry.sessions[key];
    if (!entry) return; // No-op — key was never touched.
    entry.last_exit_code = exitCode;
    if (exitCode !== 0) {
      entry.status = "killed";
    }
    await writeRegistryAtomic(registryPath, registry);
  }

  /**
   * Sweep entries idle past the TTL (memo §4.3 — runs at top of each
   * dispatcher tick). Writes to disk only if something was evicted.
   *
   * @returns {Promise<number>} Count of evicted entries.
   */
  async function evictExpired() {
    const ttlMs = ttlSeconds * 1000;
    const cutoff = now() - ttlMs;
    let evicted = 0;
    for (const [k, e] of Object.entries(registry.sessions)) {
      if (e.last_used_at < cutoff) {
        delete registry.sessions[k];
        // Keep the LRU array in lockstep with the sessions map.
        const idx = registry.lru.indexOf(k);
        if (idx !== -1) registry.lru.splice(idx, 1);
        evicted++;
      }
    }
    if (evicted > 0) {
      await writeRegistryAtomic(registryPath, registry);
    }
    return evicted;
  }

  /**
   * Test hook — returns a deep-cloned snapshot so tests cannot mutate
   * the live registry by accident. (JSON round-trip clone is safe here:
   * registry entries are plain JSON data.)
   */
  function _readForTests() {
    return JSON.parse(JSON.stringify(registry));
  }

  return { route, touch, recordExit, evictExpired, _readForTests };
}