npm - @pleri/olam-cli - Versions diffs - 0.1.168 → 0.1.170 - Mend

@pleri/olam-cli 0.1.168 → 0.1.170

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64) hide show

package/README.md +38 -0
package/dist/commands/auth-status.d.ts +1 -0
package/dist/commands/auth-status.d.ts.map +1 -1
package/dist/commands/auth-status.js +45 -4
package/dist/commands/auth-status.js.map +1 -1
package/dist/commands/create.d.ts.map +1 -1
package/dist/commands/create.js +26 -0
package/dist/commands/create.js.map +1 -1
package/dist/commands/enter.d.ts.map +1 -1
package/dist/commands/enter.js +5 -0
package/dist/commands/enter.js.map +1 -1
package/dist/commands/resume.d.ts +63 -0
package/dist/commands/resume.d.ts.map +1 -0
package/dist/commands/resume.js +174 -0
package/dist/commands/resume.js.map +1 -0
package/dist/commands/setup.d.ts +19 -0
package/dist/commands/setup.d.ts.map +1 -1
package/dist/commands/setup.js +157 -19
package/dist/commands/setup.js.map +1 -1
package/dist/image-digests.json +8 -8
package/dist/index.js +1021 -576
package/dist/index.js.map +1 -1
package/dist/lib/health-probes.d.ts +28 -0
package/dist/lib/health-probes.d.ts.map +1 -1
package/dist/lib/health-probes.js +75 -0
package/dist/lib/health-probes.js.map +1 -1
package/dist/lib/k8s-context-discovery.d.ts +80 -0
package/dist/lib/k8s-context-discovery.d.ts.map +1 -0
package/dist/lib/k8s-context-discovery.js +102 -0
package/dist/lib/k8s-context-discovery.js.map +1 -0
package/dist/mcp-server.js +1273 -771
package/dist/spawn/home-override.d.ts +82 -0
package/dist/spawn/home-override.d.ts.map +1 -0
package/dist/spawn/home-override.js +107 -0
package/dist/spawn/home-override.js.map +1 -0
package/hermes-bundle/version.json +1 -1
package/host-cp/k8s/manifests/30-configmap.yaml +5 -0
package/host-cp/k8s/manifests/50-deployment.yaml +9 -2
package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
package/host-cp/lifecycle/classify.mjs +110 -0
package/host-cp/lifecycle/emit.mjs +119 -0
package/host-cp/lifecycle/evidence.mjs +45 -0
package/host-cp/lifecycle/failure-kinds.mjs +56 -0
package/host-cp/lifecycle/index.mjs +22 -0
package/host-cp/lifecycle/phases.mjs +52 -0
package/host-cp/observability/grafana-port-forward.sh +1 -1
package/host-cp/observability/kyverno-cardinality-mutate.sh +2 -2
package/host-cp/observability/loki-ingest.sh +1 -1
package/host-cp/observability/ndjson-span-sink.mjs +131 -0
package/host-cp/observability/prom-no-double-grafana.sh +4 -4
package/host-cp/observability/redactor.mjs +72 -0
package/host-cp/recovery/engine.mjs +148 -0
package/host-cp/recovery/index.mjs +16 -0
package/host-cp/recovery/ledger.mjs +105 -0
package/host-cp/recovery/recipes.mjs +46 -0
package/host-cp/recovery/scenarios.mjs +124 -0
package/host-cp/recovery/step-runners.mjs +263 -0
package/host-cp/src/docker-events.mjs +30 -6
package/host-cp/src/pr-nanny.mjs +55 -3
package/host-cp/src/server.mjs +173 -0
package/package.json +1 -1

package/host-cp/recovery/scenarios.mjs ADDED Viewed

@@ -0,0 +1,124 @@
+// Recovery scenarios — named mappings from WorldStartupFailureKind (or a
+// special non-FSM signal) to a deterministic RecoveryRecipe.
+//
+// Order within each recipe is load-bearing: steps execute in sequence,
+// first failure short-circuits. Designed for ONE bounded auto-attempt;
+// callers MUST NOT retry a scenario — the engine's idempotency guard
+// enforces this at the (worldId, failureKind) level.
+//
+// The 'stale-branch' scenario has no failureKind (null) — it is triggered
+// by a non-FSM signal (e.g. CI indicating the branch is stale). The engine
+// accepts null as a valid key but treats it as a distinct bucket.
+/**
+ * @typedef {import('./recipes.mjs').RecoveryStep}  RecoveryStep
+ * @typedef {import('./recipes.mjs').RecoveryRecipe} RecoveryRecipe
+ * @typedef {import('../lifecycle/failure-kinds.mjs').WorldStartupFailureKind | null} FailureKindOrNull
+ */
+/**
+ * @typedef {object} FailureScenario
+ * @property {string}           name         — kebab-case identifier
+ * @property {FailureKindOrNull} failureKind  — the FSM bucket this scenario handles (null = non-FSM trigger)
+ * @property {string}           description  — one-line human summary
+ * @property {RecoveryRecipe}   recipe
+ */
+/** @type {readonly FailureScenario[]} */
+export const FAILURE_SCENARIOS = Object.freeze([
+  {
+    name: 'trust-gate-stuck',
+    failureKind: 'TrustGateUnanswered',
+    description: 'Agent reached TrustRequired but no trust approval arrived within the timeout.',
+    recipe: {
+      scenarioName: 'trust-gate-stuck',
+      steps: [
+        { kind: 'NotifyOperator', message: 'Trust gate unanswered — re-sending trust prompt.' },
+        { kind: 'ResendTrustPrompt' },
+        { kind: 'WaitFor', durationMs: 30_000 },
+      ],
+    },
+  },
+  {
+    name: 'prompt-misdelivery',
+    failureKind: 'PromptMisdelivery',
+    description: 'Dispatch was sent but the agent never received it (transport mismatch).',
+    recipe: {
+      scenarioName: 'prompt-misdelivery',
+      steps: [
+        { kind: 'RestartTransport' },
+        { kind: 'ResendDispatch' },
+      ],
+    },
+  },
+  {
+    name: 'transport-dead',
+    failureKind: 'TransportDead',
+    description: 'stdin/stdout/IPC channel never opened.',
+    recipe: {
+      scenarioName: 'transport-dead',
+      steps: [
+        { kind: 'RestartTransport' },
+        { kind: 'RestartWorker' },
+      ],
+    },
+  },
+  {
+    name: 'mcp-handshake-stall',
+    failureKind: 'McpHandshakeStall',
+    description: 'MCP server connection initialized but never completed handshake.',
+    recipe: {
+      scenarioName: 'mcp-handshake-stall',
+      steps: [
+        { kind: 'RestartMcpServer', serverName: 'default' },
+        { kind: 'RetryHandshake', timeoutMs: 15_000 },
+      ],
+    },
+  },
+  {
+    name: 'plugin-startup-failed',
+    failureKind: 'PluginStartupFailed',
+    description: 'Plugin or skill source failed to load on boot.',
+    recipe: {
+      scenarioName: 'plugin-startup-failed',
+      steps: [
+        { kind: 'ReadPluginErrors' },
+        { kind: 'RestartPlugin', pluginName: 'default' },
+        { kind: 'ResendDispatch' },
+      ],
+    },
+  },
+  {
+    name: 'provider-process-gone',
+    failureKind: 'ProviderProcessGone',
+    description: 'Agent (Claude Code) process exited before responding.',
+    recipe: {
+      scenarioName: 'provider-process-gone',
+      steps: [
+        { kind: 'RestartWorker' },
+      ],
+    },
+  },
+  {
+    name: 'stale-branch',
+    failureKind: null,
+    description: 'Branch is stale relative to base — rebase + clean build required.',
+    recipe: {
+      scenarioName: 'stale-branch',
+      steps: [
+        { kind: 'RebaseBranch' },
+        { kind: 'CleanBuild' },
+      ],
+    },
+  },
+]);
+/**
+ * Find the scenario that handles a given failureKind (or null for non-FSM triggers).
+ *
+ * @param {FailureKindOrNull} failureKind
+ * @returns {FailureScenario | undefined}
+ */
+export function findScenarioForKind(failureKind) {
+  return FAILURE_SCENARIOS.find((s) => s.failureKind === failureKind);
+}

package/host-cp/recovery/step-runners.mjs ADDED Viewed

@@ -0,0 +1,263 @@
+// Step runners — one async function per RecoveryStep kind.
+//
+// FULLY IMPLEMENTED (polling/timeout orchestration; the I/O seams at the bottom of this file default to logging stubs):
+//   RestartMcpServer  — signals the named MCP server to restart and waits for it
+//                       to come back by polling the handshake probe.
+//   RetryHandshake    — re-initiates the MCP handshake sequence with a timeout
+//                       derived from the step's timeoutMs field.
+//
+// STUB (TODO killshot-3-follow-up):
+//   All other step kinds log intent and return success. The stubs are
+//   intentionally not no-ops — they emit a console.warn so operators can see
+//   which steps fired without actually changing system state.
+import { setTimeout as sleep } from 'node:timers/promises';
+/**
+ * @typedef {import('./recipes.mjs').RecoveryStep} RecoveryStep
+ *
+ * @typedef {object} StepContext
+ * @property {string}  worldId
+ * @property {object}  [evidence]  — WorldStartupEvidence bundle, may be undefined for non-FSM triggers
+ * @property {(msg: string) => void} [log]  — optional logger; defaults to console.warn
+ */
+/**
+ * Run a single recovery step.
+ *
+ * Throws if the step fails — the engine catches and short-circuits.
+ *
+ * @param {RecoveryStep} step
+ * @param {StepContext}  ctx
+ * @returns {Promise<void>}
+ */
+export async function runStep(step, ctx) {
+  const log = ctx.log ?? ((msg) => console.warn(`[recovery] ${msg}`));
+  switch (step.kind) {
+    case 'RestartMcpServer':
+      return restartMcpServer(step.serverName, ctx, log);
+    case 'RetryHandshake':
+      return retryHandshake(step.timeoutMs, ctx, log);
+    // --- STUBS (TODO killshot-3-follow-up) ---
+    case 'NotifyOperator':
+      log(`[stub] NotifyOperator: ${step.message ?? '(no message)'} — worldId=${ctx.worldId}`);
+      return;
+    case 'ResendTrustPrompt':
+      log(`[stub] ResendTrustPrompt — worldId=${ctx.worldId}`);
+      return;
+    case 'WaitFor':
+      log(`[stub] WaitFor ${step.durationMs}ms — worldId=${ctx.worldId} (short-circuiting to 0ms in stub)`);
+      // Stub doesn't actually wait the full duration — real implementation
+      // would integrate with the world's state machine timeout.
+      return;
+    case 'RestartTransport':
+      log(`[stub] RestartTransport — worldId=${ctx.worldId}`);
+      return;
+    case 'ResendDispatch':
+      log(`[stub] ResendDispatch — worldId=${ctx.worldId}`);
+      return;
+    case 'RestartWorker':
+      log(`[stub] RestartWorker — worldId=${ctx.worldId}`);
+      return;
+    case 'ReadPluginErrors':
+      log(`[stub] ReadPluginErrors — worldId=${ctx.worldId}`);
+      return;
+    case 'RestartPlugin':
+      log(`[stub] RestartPlugin: ${step.pluginName} — worldId=${ctx.worldId}`);
+      return;
+    case 'RebaseBranch':
+      log(`[stub] RebaseBranch — worldId=${ctx.worldId}`);
+      return;
+    case 'CleanBuild':
+      log(`[stub] CleanBuild — worldId=${ctx.worldId}`);
+      return;
+    default: {
+      // Exhaustive check — helps catch mismatches if new step kinds are added.
+      /** @type {never} */
+      const _exhaustive = step;
+      void _exhaustive;
+      throw new Error(`runStep: unknown step kind "${/** @type {any} */ (step).kind}"`);
+    }
+  }
+}
+// ─── RestartMcpServer — fully implemented ────────────────────────────────────
+// Timing knobs (milliseconds) for restartMcpServer's post-restart probe loop.
+// Overridable via setStepRunnerSeams for testing.
+let _mcpRestartPollMs = 500; // delay between consecutive probes
+let _mcpRestartTimeoutMs = 10_000; // overall deadline before giving up
+/**
+ * Restart the named MCP server and verify it comes back.
+ *
+ * Implementation strategy:
+ *   1. Send SIGTERM to the mcp-server process (identified by the naming
+ *      convention `mcp-<serverName>-<worldId>` in ps output).
+ *   2. Poll the in-process registry every MCP_RESTART_POLL_MS until the
+ *      server reports itself alive again, or MCP_RESTART_TIMEOUT_MS elapses.
+ *
+ * In the current host-cp architecture, MCP servers are child processes
+ * spawned by the in-world container-cp, NOT by host-cp directly. host-cp
+ * cannot send SIGTERM to in-container processes. For the bounded scope of
+ * Killshot #3, this runner simulates the restart via the world's Docker
+ * exec channel and verifies success via an observable side-effect:
+ * the lifecycle `mcpHandshakeStatus` transitions from 'pending' to 'ok'.
+ *
+ * @param {string}      serverName
+ * @param {StepContext} ctx
+ * @param {(msg: string) => void} log
+ */
+async function restartMcpServer(serverName, ctx, log) {
+  log(`RestartMcpServer: restarting "${serverName}" for worldId=${ctx.worldId}`);
+  // Signal the restart. In production this would exec into the container and
+  // send SIGTERM to the mcp-server process. The exec channel is host-cp's
+  // Docker API path (/exec on the devbox container).
+  //
+  // For the Killshot #3 deliverable scope: emit the intent, simulate the
+  // restart by waiting for one poll cycle, then verify via the handshake
+  // probe below. Real exec wiring is tracked as a follow-up.
+  await _execRestartSignal(serverName, ctx.worldId, log);
+  // Poll until the handshake probe succeeds or we hit the timeout.
+  const deadline = Date.now() + _mcpRestartTimeoutMs;
+  let attempt = 0;
+  while (Date.now() < deadline) {
+    attempt++;
+    const alive = await _probeMcpHandshake(serverName, ctx.worldId, log);
+    if (alive) {
+      log(`RestartMcpServer: "${serverName}" came back after ${attempt} probe(s)`);
+      return;
+    }
+    await sleep(_mcpRestartPollMs);
+  }
+  throw new Error(
+    `RestartMcpServer: "${serverName}" did not come back within ${_mcpRestartTimeoutMs}ms`,
+  );
+}
+// ─── RetryHandshake — fully implemented ──────────────────────────────────────
+/**
+ * Re-initiate the MCP handshake sequence and wait up to timeoutMs for it
+ * to succeed.
+ *
+ * The handshake follows the MCP JSON-RPC initialize → initialized pattern.
+ * host-cp's role is to signal the in-world MCP coordinator to re-run the
+ * handshake; we verify success by polling the handshake status observable.
+ *
+ * @param {number}      timeoutMs
+ * @param {StepContext} ctx
+ * @param {(msg: string) => void} log
+ */
+async function retryHandshake(timeoutMs, ctx, log) {
+  log(`RetryHandshake: initiating handshake for worldId=${ctx.worldId} timeout=${timeoutMs}ms`);
+  await _sendHandshakeInitialize(ctx.worldId, log);
+  const deadline = Date.now() + timeoutMs;
+  const pollMs = Math.min(500, Math.floor(timeoutMs / 10));
+  while (Date.now() < deadline) {
+    const success = await _probeHandshakeComplete(ctx.worldId, log);
+    if (success) {
+      log(`RetryHandshake: handshake succeeded for worldId=${ctx.worldId}`);
+      return;
+    }
+    await sleep(pollMs);
+  }
+  throw new Error(
+    `RetryHandshake: handshake did not complete within ${timeoutMs}ms for worldId=${ctx.worldId}`,
+  );
+}
+// ─── Seam functions (injectable for testing) ─────────────────────────────────
+//
+// These are the actual I/O boundaries. In tests, override via the
+// setStepRunnerSeams() below to inject stubs that resolve deterministically.
+/** @type {(serverName: string, worldId: string, log: (m: string) => void) => Promise<void>} */
+let _execRestartSignal = async (serverName, worldId, log) => {
+  // Production: Docker exec into the devbox container for this world, then
+  // send SIGTERM to the mcp-server process by name. The container naming
+  // convention is `olam-<worldId>-devbox`.
+  //
+  // Stub path used until the Docker exec channel is wired (killshot-3-follow-up):
+  log(`[seam] execRestartSignal: would exec SIGTERM mcp-${serverName} in olam-${worldId}-devbox`);
+};
+/** @type {(serverName: string, worldId: string, log: (m: string) => void) => Promise<boolean>} */
+let _probeMcpHandshake = async (serverName, worldId, log) => {
+  log(`[seam] probeMcpHandshake: would probe mcp-${serverName} alive in olam-${worldId}-devbox`);
+  // Default stub: optimistic — assumes server came back. Real implementation
+  // queries the in-world MCP registry or pings a health endpoint.
+  return true;
+};
+/** @type {(worldId: string, log: (m: string) => void) => Promise<void>} */
+let _sendHandshakeInitialize = async (worldId, log) => {
+  log(`[seam] sendHandshakeInitialize: would send MCP initialize for worldId=${worldId}`);
+};
+/** @type {(worldId: string, log: (m: string) => void) => Promise<boolean>} */
+let _probeHandshakeComplete = async (worldId, log) => {
+  log(`[seam] probeHandshakeComplete: would probe handshake complete for worldId=${worldId}`);
+  // Default stub: optimistic.
+  return true;
+};
+/**
+ * Override seam functions and timing constants for testing.
+ * Returns a cleanup function that restores prior values.
+ *
+ * @param {{
+ *   execRestartSignal?:      typeof _execRestartSignal,
+ *   probeMcpHandshake?:      typeof _probeMcpHandshake,
+ *   sendHandshakeInitialize?: typeof _sendHandshakeInitialize,
+ *   probeHandshakeComplete?:  typeof _probeHandshakeComplete,
+ *   mcpRestartTimeoutMs?:    number,
+ *   mcpRestartPollMs?:       number,
+ * }} overrides
+ * @returns {() => void}  cleanup — call to restore prior seams
+ */
+export function setStepRunnerSeams(overrides = {}) {
+  const prev = {
+    execRestartSignal: _execRestartSignal,
+    probeMcpHandshake: _probeMcpHandshake,
+    sendHandshakeInitialize: _sendHandshakeInitialize,
+    probeHandshakeComplete: _probeHandshakeComplete,
+    mcpRestartTimeoutMs: _mcpRestartTimeoutMs,
+    mcpRestartPollMs: _mcpRestartPollMs,
+  };
+  if (overrides.execRestartSignal) _execRestartSignal = overrides.execRestartSignal;
+  if (overrides.probeMcpHandshake) _probeMcpHandshake = overrides.probeMcpHandshake;
+  if (overrides.sendHandshakeInitialize) _sendHandshakeInitialize = overrides.sendHandshakeInitialize;
+  if (overrides.probeHandshakeComplete) _probeHandshakeComplete = overrides.probeHandshakeComplete;
+  if (typeof overrides.mcpRestartTimeoutMs === 'number') _mcpRestartTimeoutMs = overrides.mcpRestartTimeoutMs;
+  if (typeof overrides.mcpRestartPollMs === 'number') _mcpRestartPollMs = overrides.mcpRestartPollMs;
+  return () => {
+    _execRestartSignal = prev.execRestartSignal;
+    _probeMcpHandshake = prev.probeMcpHandshake;
+    _sendHandshakeInitialize = prev.sendHandshakeInitialize;
+    _probeHandshakeComplete = prev.probeHandshakeComplete;
+    _mcpRestartTimeoutMs = prev.mcpRestartTimeoutMs;
+    _mcpRestartPollMs = prev.mcpRestartPollMs;
+  };
+}

package/host-cp/src/docker-events.mjs CHANGED Viewed

@@ -55,10 +55,15 @@ const INVALIDATING_ACTIONS = ['start', 'restart', 'stop', 'die', 'kill'];
  *   `docker events --format json` via child_process).
  * @param {(worldId: string) => void} args.onWorldRestart
  *   called when a known world restarts/stops/dies
+ * @param {(info: { worldId: string, action: string, exitCode?: number }) => void} [args.onWorldLifecycleEvent]
+ *   Additive observer (Killshot #2): fires alongside onWorldRestart with
+ *   the raw docker action + exitCode when present. Wired in server.mjs
+ *   to map docker actions → WorldLifecyclePhase emissions on host-stream.
+ *   Optional + nullable — existing callers (tests, etc.) don't supply it.
  * @param {(message: string) => void} [args.log]
  * @returns {() => void}                stop function
  */
-export function subscribeDockerEvents({ dockerHost, onWorldRestart, log = console.log }) {
+export function subscribeDockerEvents({ dockerHost, onWorldRestart, onWorldLifecycleEvent, log = console.log }) {
   let stopped = false;
   let activeReq = null;
   let activeProc = null;
@@ -91,7 +96,7 @@ export function subscribeDockerEvents({ dockerHost, onWorldRestart, log = consol
           // CLI shape uses `status` instead of HTTP API's `Action`; normalize.
           if (event.status && !event.Action) event.Action = event.status;
           if (event.Type === undefined && event.Type !== 'container') event.Type = 'container';
-          handleEvent(event, { onWorldRestart, log });
+          handleEvent(event, { onWorldRestart, onWorldLifecycleEvent, log });
         } catch (err) {
           log(`docker-events: parse error on line: ${line.slice(0, 120)} (${err.message})`);
         }
@@ -159,7 +164,7 @@ export function subscribeDockerEvents({ dockerHost, onWorldRestart, log = consol
           buf = buf.slice(nl + 1);
           if (!line.trim()) continue;
           try {
-            handleEvent(JSON.parse(line), { onWorldRestart, log });
+            handleEvent(JSON.parse(line), { onWorldRestart, onWorldLifecycleEvent, log });
           } catch (err) {
             log(`docker-events: parse error on line: ${line.slice(0, 120)} (${err.message})`);
           }
@@ -209,10 +214,10 @@ export function subscribeDockerEvents({ dockerHost, onWorldRestart, log = consol
  *
  * Exported for unit testing.
  *
- * @param {{ Type?: string, Action?: string, Actor?: { Attributes?: { name?: string } } }} event
- * @param {{ onWorldRestart: (worldId: string) => void, log: (m: string) => void }} ctx
+ * @param {{ Type?: string, Action?: string, Actor?: { Attributes?: Record<string, string> } }} event
+ * @param {{ onWorldRestart: (worldId: string) => void, onWorldLifecycleEvent?: (info: { worldId: string, action: string, exitCode?: number }) => void, log: (m: string) => void }} ctx
  */
-export function handleEvent(event, { onWorldRestart, log }) {
+export function handleEvent(event, { onWorldRestart, onWorldLifecycleEvent, log }) {
   if (event?.Type !== 'container') return;
   if (!INVALIDATING_ACTIONS.includes(event.Action ?? '')) return;
   const name = event.Actor?.Attributes?.name;
@@ -229,4 +234,23 @@ export function handleEvent(event, { onWorldRestart, log }) {
   const worldId = m[1];
   log(`docker-events: ${event.Action} on ${cleanName} → invalidating ${worldId}`);
   onWorldRestart(worldId);
+  // Killshot #2 (additive): also notify the lifecycle observer when one
+  // is wired. Docker's `die` events carry the container exit code in
+  // Actor.Attributes.exitCode as a string; parse it best-effort and
+  // forward NaN/missing as undefined so the classifier sees the
+  // unambiguous "no exit code observed" signal.
+  if (onWorldLifecycleEvent) {
+    const action = event.Action ?? '';
+    const rawExit = event.Actor?.Attributes?.exitCode;
+    const parsed = rawExit !== undefined ? Number(rawExit) : NaN;
+    const exitCode = Number.isFinite(parsed) ? parsed : undefined;
+    try {
+      onWorldLifecycleEvent({ worldId, action, exitCode });
+    } catch (err) {
+      // The lifecycle observer is best-effort instrumentation; a thrown
+      // error here must not break the cache-invalidate hot path.
+      log(`docker-events: onWorldLifecycleEvent threw for ${worldId}: ${err.message}`);
+    }
+  }
 }

package/host-cp/src/pr-nanny.mjs CHANGED Viewed

@@ -13,10 +13,17 @@
  *   2. wall-clock since first dispatch >= MAX_WALL_CLOCK_MIN (default 60)
  *   3. same-root-cause loop detected (last 2 dispatch summaries identical)
  *   4. operator manual pause
+ *
+ * Tier escalation (PR #938):
+ *   On each retry, the nanny advances to the next tier in `escalationTiers`
+ *   (stored per-world in nanny_current_tier) instead of repeating the same
+ *   model. When the chain is exhausted, emits `dispatch.tier-exhausted` on
+ *   the host-stream and falls back to existing operator escalation.
  */
 import { execFile } from 'node:child_process';
 import { promisify } from 'node:util';
+import { pickNextTier } from './dispatch/tier-escalator.mjs';
 const execFileAsync = promisify(execFile);
@@ -68,8 +75,9 @@ function parsePrUrl(prUrl) {
  * @param {{
  *   prStateStore: ReturnType<import('./world-pr-state.mjs').createWorldPrStateStore>,
  *   getGhToken: () => Promise<string|null>,
- *   dispatchToWorld: (worldId: string, prompt: string) => Promise<void>,
+ *   dispatchToWorld: (worldId: string, prompt: string, opts?: { tier?: string }) => Promise<void>,
  *   consultCodex: (ctx: string) => Promise<string>,
+ *   broadcastTierEvent?: (eventType: string, payload: unknown) => void,
  *   pollIntervalMs?: number,
  *   maxDispatches?: number,
  *   maxWallClockMin?: number,
@@ -80,6 +88,7 @@ export function createPrNanny({
   getGhToken,
   dispatchToWorld,
   consultCodex,
+  broadcastTierEvent = () => {},
   pollIntervalMs = 60_000,
   maxDispatches = parseInt(process.env.OLAM_PR_NANNY_MAX_DISPATCHES ?? '5', 10),
   maxWallClockMin = parseInt(process.env.OLAM_PR_NANNY_MAX_WALL_CLOCK_MIN ?? '60', 10),
@@ -198,17 +207,60 @@ export function createPrNanny({
       return;
     }
+    // ── Tier escalation (PR #938) ───────────────────────────────────────────
+    //
+    // `nanny_escalation_tiers` is set by the olam_dispatch caller via the
+    // escalationTiers schema field and persisted here by server.mjs when the
+    // world is registered for nanny tracking. Defaults to ['sonnet'] when
+    // absent (no escalation, no cost surprise).
+    //
+    // `nanny_current_tier` tracks the model tier used by the LAST dispatch for
+    // this PR. On first dispatch (dispatchCount === 0) it is undefined, and we
+    // use escalationTiers[0] as the starting tier. On retries we advance the
+    // chain via pickNextTier. This is the pr-state store (option c from the
+    // design doc) — it persists across polls and matches the nanny_* field
+    // pattern already established by nanny_dispatch_count et al.
+    const escalationTiers = entry.nanny_escalation_tiers ?? ['sonnet'];
+    const currentTier = entry.nanny_current_tier ?? escalationTiers[0] ?? 'sonnet';
+    let tierForThisDispatch = currentTier;
+    if (dispatchCount > 0) {
+      // This is a retry — try to escalate the tier.
+      const nextTier = pickNextTier(currentTier, escalationTiers);
+      if (nextTier !== null) {
+        tierForThisDispatch = nextTier;
+        broadcastTierEvent('dispatch.escalated', {
+          worldId,
+          fromTier: currentTier,
+          toTier: nextTier,
+          reason: 'retry-after-failure',
+        });
+        console.log(`[pr-nanny] tier escalated for ${worldId}: ${currentTier} → ${nextTier}`);
+      } else {
+        // Chain exhausted — emit tier-exhausted and fall back to operator escalation.
+        broadcastTierEvent('dispatch.tier-exhausted', {
+          worldId,
+          exhaustedTier: currentTier,
+          escalationTiers,
+        });
+        console.log(`[pr-nanny] tier chain exhausted for ${worldId} (last tier: ${currentTier}) — escalating to operator`);
+        prStateStore.set(worldId, { nanny_escalated: true, nanny_escalate_reason: 'tier_exhausted' });
+        return;
+      }
+    }
     // Dispatch fix
     try {
-      await dispatchToWorld(worldId, prompt);
+      await dispatchToWorld(worldId, prompt, { tier: tierForThisDispatch });
       const now = new Date().toISOString();
       prStateStore.set(worldId, {
         nanny_dispatch_count: dispatchCount + 1,
         nanny_first_dispatch_at: entry.nanny_first_dispatch_at ?? now,
         nanny_last_dispatch_at: now,
         nanny_last_dispatch_prompt: prompt,
+        nanny_current_tier: tierForThisDispatch,
       });
-      console.log(`[pr-nanny] dispatched fix to ${worldId} (dispatch ${dispatchCount + 1}/${maxDispatches})`);
+      console.log(`[pr-nanny] dispatched fix to ${worldId} (dispatch ${dispatchCount + 1}/${maxDispatches}, tier: ${tierForThisDispatch})`);
     } catch (err) {
       console.error(`[pr-nanny] dispatch failed for ${worldId}: ${err.message}`);
     }