npm - @pleri/olam-cli - Versions diffs - 0.1.168 → 0.1.170 - Mend

@pleri/olam-cli 0.1.168 → 0.1.170

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64) hide show

package/README.md +38 -0
package/dist/commands/auth-status.d.ts +1 -0
package/dist/commands/auth-status.d.ts.map +1 -1
package/dist/commands/auth-status.js +45 -4
package/dist/commands/auth-status.js.map +1 -1
package/dist/commands/create.d.ts.map +1 -1
package/dist/commands/create.js +26 -0
package/dist/commands/create.js.map +1 -1
package/dist/commands/enter.d.ts.map +1 -1
package/dist/commands/enter.js +5 -0
package/dist/commands/enter.js.map +1 -1
package/dist/commands/resume.d.ts +63 -0
package/dist/commands/resume.d.ts.map +1 -0
package/dist/commands/resume.js +174 -0
package/dist/commands/resume.js.map +1 -0
package/dist/commands/setup.d.ts +19 -0
package/dist/commands/setup.d.ts.map +1 -1
package/dist/commands/setup.js +157 -19
package/dist/commands/setup.js.map +1 -1
package/dist/image-digests.json +8 -8
package/dist/index.js +1021 -576
package/dist/index.js.map +1 -1
package/dist/lib/health-probes.d.ts +28 -0
package/dist/lib/health-probes.d.ts.map +1 -1
package/dist/lib/health-probes.js +75 -0
package/dist/lib/health-probes.js.map +1 -1
package/dist/lib/k8s-context-discovery.d.ts +80 -0
package/dist/lib/k8s-context-discovery.d.ts.map +1 -0
package/dist/lib/k8s-context-discovery.js +102 -0
package/dist/lib/k8s-context-discovery.js.map +1 -0
package/dist/mcp-server.js +1273 -771
package/dist/spawn/home-override.d.ts +82 -0
package/dist/spawn/home-override.d.ts.map +1 -0
package/dist/spawn/home-override.js +107 -0
package/dist/spawn/home-override.js.map +1 -0
package/hermes-bundle/version.json +1 -1
package/host-cp/k8s/manifests/30-configmap.yaml +5 -0
package/host-cp/k8s/manifests/50-deployment.yaml +9 -2
package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
package/host-cp/lifecycle/classify.mjs +110 -0
package/host-cp/lifecycle/emit.mjs +119 -0
package/host-cp/lifecycle/evidence.mjs +45 -0
package/host-cp/lifecycle/failure-kinds.mjs +56 -0
package/host-cp/lifecycle/index.mjs +22 -0
package/host-cp/lifecycle/phases.mjs +52 -0
package/host-cp/observability/grafana-port-forward.sh +1 -1
package/host-cp/observability/kyverno-cardinality-mutate.sh +2 -2
package/host-cp/observability/loki-ingest.sh +1 -1
package/host-cp/observability/ndjson-span-sink.mjs +131 -0
package/host-cp/observability/prom-no-double-grafana.sh +4 -4
package/host-cp/observability/redactor.mjs +72 -0
package/host-cp/recovery/engine.mjs +148 -0
package/host-cp/recovery/index.mjs +16 -0
package/host-cp/recovery/ledger.mjs +105 -0
package/host-cp/recovery/recipes.mjs +46 -0
package/host-cp/recovery/scenarios.mjs +124 -0
package/host-cp/recovery/step-runners.mjs +263 -0
package/host-cp/src/docker-events.mjs +30 -6
package/host-cp/src/pr-nanny.mjs +55 -3
package/host-cp/src/server.mjs +173 -0
package/package.json +1 -1

package/host-cp/lifecycle/phases.mjs ADDED Viewed

@@ -0,0 +1,52 @@
+// World lifecycle phases — the canonical FSM every Olam world walks
+// through from spawn to terminal state.
+//
+// Order is load-bearing: a world's `lastPhase` is a monotonic high-water
+// mark, and the classifier's precedence rules in classify.mjs assume
+// this ordering when deciding which failure bucket to attribute a stall
+// to. Do NOT reorder without updating the classifier.
+/**
+ * @typedef {'Spawning' | 'TrustRequired' | 'ReadyForPrompt' | 'Running' | 'Finished' | 'Failed'} WorldLifecyclePhase
+ */
+/**
+ * Frozen enum-style map of lifecycle phase names. Every key maps to a
+ * string identical to the key itself, so a member can be compared directly
+ * against a raw phase string.
+ *
+ * @type {Readonly<Record<WorldLifecyclePhase, WorldLifecyclePhase>>}
+ */
+export const WorldLifecyclePhase = Object.freeze({
+  /** Container or worktree created; before any code runs inside. */
+  Spawning: 'Spawning',
+  /** Agent process up; awaiting trust-gate approval. */
+  TrustRequired: 'TrustRequired',
+  /** Trust granted; awaiting initial dispatch. */
+  ReadyForPrompt: 'ReadyForPrompt',
+  /** Actively processing dispatch. */
+  Running: 'Running',
+  /** Completed successfully. Terminal. */
+  Finished: 'Finished',
+  /** Terminal failure. Carries an evidence bundle + classified failure kind. */
+  Failed: 'Failed',
+});
+/**
+ * Phases in canonical order. Useful for ordinal comparison: a phase's index
+ * in this frozen array is its position in the lifecycle. The two terminal
+ * phases come last, `Finished` before `Failed`.
+ */
+export const WORLD_LIFECYCLE_PHASE_ORDER = Object.freeze([
+  WorldLifecyclePhase.Spawning,
+  WorldLifecyclePhase.TrustRequired,
+  WorldLifecyclePhase.ReadyForPrompt,
+  WorldLifecyclePhase.Running,
+  WorldLifecyclePhase.Finished,
+  WorldLifecyclePhase.Failed,
+]);
+/**
+ * Terminal phases — no transitions out.
+ *
+ * NOTE(review): `Object.freeze` on a `Set` only freezes the object's own
+ * properties; `add` / `delete` / `clear` still change its entries. Treat
+ * this set as read-only by convention.
+ */
+export const TERMINAL_PHASES = Object.freeze(
+  new Set([WorldLifecyclePhase.Finished, WorldLifecyclePhase.Failed]),
+);
+/**
+ * @param {unknown} value
+ * @returns {value is WorldLifecyclePhase}
+ */
+export function isWorldLifecyclePhase(value) {
+  return typeof value === 'string' && WORLD_LIFECYCLE_PHASE_ORDER.includes(/** @type {any} */ (value));
+}

package/host-cp/observability/grafana-port-forward.sh CHANGED Viewed

@@ -145,7 +145,7 @@ helm upgrade --install "$GRAFANA_RELEASE" grafana/grafana \
   --create-namespace \
   -f "$PERIPHERAL_SERVICES_DIR/helm-values/grafana-values.yaml" \
   --wait \
-  --timeout 300s
+  --timeout "${OLAM_HELM_TIMEOUT:-600s}"
 log "Grafana Helm install complete"

package/host-cp/observability/kyverno-cardinality-mutate.sh CHANGED Viewed

@@ -49,7 +49,7 @@ KYVERNO_NAMESPACE="kyverno"
 TEST_NAMESPACE="monitoring"
 PROM_LOCAL_PORT="9092"   # 9090, 9091 may be in use by sibling Phase C scripts
 PF_BIND_SECONDS=5
-TARGET_DISCOVERY_TIMEOUT=180
+TARGET_DISCOVERY_TIMEOUT="${OLAM_PROM_DISCOVERY_TIMEOUT:-240}"   # bumped from 180s; one CI attempt observed kyverno-emitter still not scraped at 180s
 SCRAPE_POLL_INTERVAL=10
 log()  { printf '[kyverno-mutate] %s\n' "$*" >&2; }
@@ -122,7 +122,7 @@ helm upgrade --install olam-kyverno kyverno/kyverno \
   --namespace "$KYVERNO_NAMESPACE" \
   --create-namespace \
   -f "$PERIPHERAL_SERVICES_DIR/helm-values/kyverno-values.yaml" \
-  --wait --timeout 300s 2>&1 | tail -8
+  --wait --timeout "${OLAM_HELM_TIMEOUT:-600s}" 2>&1 | tail -8
 # Sanity: kyverno-admission-controller Deployment Ready.
 kubectl get deployment -n "$KYVERNO_NAMESPACE" -l "app.kubernetes.io/component=admission-controller" \

package/host-cp/observability/loki-ingest.sh CHANGED Viewed

@@ -93,7 +93,7 @@ helm upgrade --install "$LOKI_RELEASE" grafana/loki \
   --create-namespace \
   -f "$PERIPHERAL_SERVICES_DIR/helm-values/loki-values.yaml" \
   --wait \
-  --timeout 300s
+  --timeout "${OLAM_HELM_TIMEOUT:-600s}"
 log "loki helm install complete"

package/host-cp/observability/ndjson-span-sink.mjs ADDED Viewed

@@ -0,0 +1,131 @@
+// NDJSON span sink — zero-config observability for host-cp.
+//
+// Subscribes to the host-stream broadcaster and writes one JSON line per
+// `span` event to ~/.olam/logs/host.trace.ndjson. Each span carries the
+// minimum surface needed for `jq`-based triage: identity, timing, exit.
+//
+// Wire shape per line:
+//   { traceId, spanId, parentSpanId, name, startedAt, durationMs,
+//     attributes, events[], exit: { _tag: 'Success'|'Failure', reason? } }
+//
+// Rotation: single level — at 50MB the file is renamed to `.1` and a
+// fresh file is opened. The previous `.1` (if any) is overwritten. We
+// keep at most one prior generation; deeper retention belongs to the
+// operator's normal disk-management tooling.
+//
+// Override path with OLAM_TRACE_LOG_PATH (set to /dev/null in tests that
+// don't care about file output, or to a temp file to assert on writes).
+import { open, mkdir, rename } from 'node:fs/promises';
+import { join, dirname } from 'node:path';
+import { homedir } from 'node:os';
+import { redactSensitive } from './redactor.mjs';
+// Rotation threshold in bytes (50 MiB).
+const DEFAULT_ROTATE_BYTES = 50 * 1024 * 1024;
+// Resolved once, when this module is first imported: changing
+// OLAM_TRACE_LOG_PATH later in the process does not affect the default.
+const DEFAULT_LOG_PATH =
+  process.env.OLAM_TRACE_LOG_PATH ??
+  join(homedir(), '.olam', 'logs', 'host.trace.ndjson');
+export async function createNdjsonSpanSink({
+  logPath = DEFAULT_LOG_PATH,
+  rotateBytes = DEFAULT_ROTATE_BYTES,
+  hostStream,
+} = {}) {
+  await mkdir(dirname(logPath), { recursive: true });
+  let fh = await open(logPath, 'a');
+  let bytesWritten = (await fh.stat()).size;
+  let closed = false;
+  let chain = Promise.resolve();
+  async function writeLine(line) {
+    if (closed) return;
+    await fh.write(line);
+    bytesWritten += Buffer.byteLength(line);
+    if (bytesWritten >= rotateBytes) {
+      await fh.close();
+      await rename(logPath, `${logPath}.1`);
+      fh = await open(logPath, 'a');
+      bytesWritten = 0;
+    }
+  }
+  function recordSpan(span = {}) {
+    const {
+      name, startedAt, endedAt, attributes, events, exit,
+      traceId, spanId, parentSpanId, reason,
+    } = span;
+    const haveTimes = typeof endedAt === 'number' && typeof startedAt === 'number';
+    const durationMs = haveTimes ? endedAt - startedAt : null;
+    let finalExit;
+    if (exit && typeof exit === 'object' && (exit._tag === 'Success' || exit._tag === 'Failure')) {
+      finalExit = exit._tag === 'Failure' && exit.reason !== undefined
+        ? { _tag: 'Failure', reason: exit.reason }
+        : { _tag: exit._tag };
+    } else if (!haveTimes) {
+      finalExit = reason !== undefined ? { _tag: 'Failure', reason } : { _tag: 'Failure' };
+    } else {
+      finalExit = { _tag: 'Success' };
+    }
+    const record = {
+      traceId: traceId ?? null,
+      spanId: spanId ?? null,
+      parentSpanId: parentSpanId ?? null,
+      name: name ?? null,
+      startedAt: startedAt ?? null,
+      durationMs,
+      attributes: redactSensitive(attributes ?? {}),
+      events: redactSensitive(events ?? []),
+      exit: finalExit,
+    };
+    const next = chain.then(() => writeLine(JSON.stringify(record) + '\n')).catch(() => {});
+    chain = next;
+    return next;
+  }
+  let detach = null;
+  if (hostStream && typeof hostStream.addSink === 'function') {
+    detach = hostStream.addSink(createSseSpanAdapter((payload) => recordSpan(payload)));
+  }
+  return {
+    recordSpan,
+    async close() {
+      if (closed) return;
+      if (detach) detach();
+      // Drain queued writes BEFORE flipping the closed flag — `writeLine`
+      // bails on `closed`, so flipping first would silently drop spans
+      // recorded just prior to shutdown.
+      await chain;
+      closed = true;
+      try { await fh.close(); } catch { /* already closed */ }
+    },
+  };
+}
+// Duck-typed ServerResponse for host-stream's `addSink`. Parses SSE frames
+// (`event: <type>\ndata: <json>\n\n`) and dispatches `event: span` payloads
+// to `onSpan`. All other event types are silently ignored — host-stream
+// also replays per-type snapshots on attach; the sink is created at boot
+// before any spans are broadcast, so replay is a no-op in practice.
+//
+// Parsing limits visible in this code: frames are split on `\n\n` only (no
+// CRLF handling), and only the first `data: ` line of a frame is parsed.
+// NOTE(review): this presumes host-stream emits each span payload as
+// single-line JSON — confirm against the broadcaster.
+function createSseSpanAdapter(onSpan) {
+  // Accumulates chunks until at least one complete frame is available.
+  let buffer = '';
+  return {
+    // Presumably inspected by host-stream to decide whether the sink is
+    // still writable — verify against its `addSink` implementation.
+    writableEnded: false,
+    destroyed: false,
+    write(chunk) {
+      buffer += String(chunk);
+      let i;
+      // Consume every complete frame; a trailing partial frame stays in
+      // `buffer` until its terminating blank line arrives.
+      while ((i = buffer.indexOf('\n\n')) !== -1) {
+        const frame = buffer.slice(0, i);
+        buffer = buffer.slice(i + 2);
+        // Accept `event: span` as the first line or as a later line that is
+        // followed by another line; everything else is skipped.
+        if (!frame.startsWith('event: span\n') && !frame.includes('\nevent: span\n')) continue;
+        const dataLine = frame.split('\n').find((l) => l.startsWith('data: '));
+        if (!dataLine) continue;
+        // `slice(6)` strips the `data: ` prefix. Errors from JSON.parse and
+        // from `onSpan` itself are both swallowed here.
+        try { onSpan(JSON.parse(dataLine.slice(6))); } catch { /* malformed frame */ }
+      }
+      // Always report "not backpressured" to the writer.
+      return true;
+    },
+    once() { /* no drain handling needed — in-memory adapter never backpressures */ },
+    end() { this.writableEnded = true; },
+  };
+}

package/host-cp/observability/prom-no-double-grafana.sh CHANGED Viewed

@@ -96,7 +96,7 @@ helm upgrade --install "$PROM_RELEASE" prometheus-community/kube-prometheus-stac
   --create-namespace \
   -f "$PERIPHERAL_SERVICES_DIR/helm-values/kube-prom-stack-values.yaml" \
   --wait \
-  --timeout 600s
+  --timeout "${OLAM_HELM_TIMEOUT:-600s}"
 log "kube-prometheus-stack helm install complete"
@@ -143,7 +143,7 @@ helm upgrade olam-loki grafana/loki \
   --namespace "$NAMESPACE" \
   -f "$PERIPHERAL_SERVICES_DIR/helm-values/loki-values.yaml" \
   --wait \
-  --timeout 300s \
+  --timeout "${OLAM_HELM_TIMEOUT:-600s}" \
   --reuse-values \
   --set monitoring.serviceMonitor.enabled=true
@@ -154,7 +154,7 @@ helm upgrade olam-promtail grafana/promtail \
   --namespace "$NAMESPACE" \
   -f "$PERIPHERAL_SERVICES_DIR/helm-values/promtail-values.yaml" \
   --wait \
-  --timeout 300s \
+  --timeout "${OLAM_HELM_TIMEOUT:-600s}" \
   --reuse-values \
   --set serviceMonitor.enabled=true
@@ -165,7 +165,7 @@ helm upgrade "$GRAFANA_RELEASE" grafana/grafana \
   --namespace "$NAMESPACE" \
   -f "$PERIPHERAL_SERVICES_DIR/helm-values/grafana-values.yaml" \
   --wait \
-  --timeout 300s \
+  --timeout "${OLAM_HELM_TIMEOUT:-600s}" \
   --reuse-values \
   --set serviceMonitor.enabled=true

package/host-cp/observability/redactor.mjs ADDED Viewed

@@ -0,0 +1,72 @@
+// Privacy Guard — regex-based auto-redactor for trace + recovery ledger.
+//
+// Deep-walks an object, finds string values, applies an ordered list of
+// regex patterns, returns a redacted COPY (immutable; input untouched).
+// Each match is replaced with `<redacted:<kind>>`.
+//
+// Default-ON patterns (7): anthropic, openai, aws, gh-pat, jwt, bearer, slack.
+// Opt-in (env-gated): email PII (OLAM_REDACT_PII=1), high-entropy strings
+// (OLAM_REDACT_HIGH_ENTROPY=1). Hard short-circuit: OLAM_REDACTION_DISABLED=1.
+//
+// Precedence matters: anthropic runs before openai (otherwise the OpenAI
+// `sk-...` regex would steal `sk-ant-...` and emit the wrong tag). Bearer
+// runs after the high-specificity key patterns so a bearer-wrapped key
+// gets the tighter tag.
+// Default-on patterns, applied in array order. An entry may carry its own
+// `replacement`; otherwise matches become `<redacted:<kind>>`.
+const DEFAULT_PATTERNS = [
+  { kind: 'anthropic-key', re: /\bsk-ant-(?:api|admin)[A-Za-z0-9_-]{20,}\b/g },
+  { kind: 'openai-key',    re: /\bsk-(?:proj-)?[A-Za-z0-9_-]{32,}\b/g },
+  { kind: 'aws-key',       re: /\bAKIA[A-Z0-9]{16}\b/g },
+  { kind: 'gh-token',      re: /\bgh[poursa]_[A-Za-z0-9_]{36,}\b/g },
+  { kind: 'jwt',           re: /\beyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b/g },
+  { kind: 'slack-token',   re: /\bxox[abposr]-[A-Za-z0-9-]{10,}\b/g },
+  // Custom replacement keeps the literal `Bearer ` prefix in the output.
+  { kind: 'bearer',        re: /Bearer\s+[A-Za-z0-9._~+/-]+=*/gi, replacement: 'Bearer <redacted:bearer>' },
+];
+// Opt-in: only applied when OLAM_REDACT_PII=1.
+const EMAIL_PATTERN = { kind: 'email', re: /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/gi };
+// Opt-in: only applied when OLAM_REDACT_HIGH_ENTROPY=1.
+// NOTE(review): without the `i` flag this matches only runs of 32+ UPPERCASE
+// letters, digits, `_` and `-`; lower- or mixed-case tokens are not caught.
+// Confirm that is intended.
+const HIGH_ENTROPY_PATTERN = { kind: 'high-entropy', re: /\b[A-Z0-9_-]{32,}\b/g };
+// NOTE(review): every entry is shorter than the pattern's 32-character
+// minimum, so none can ever equal a match — the allowlist is inert as written.
+const HIGH_ENTROPY_ALLOWLIST = new Set(['UUID', 'CHUNK_ID', '__filename', '__dirname']);
+function redactString(s) {
+  if (process.env.OLAM_REDACTION_DISABLED === '1') return s;
+  let out = s;
+  for (const { kind, re, replacement } of DEFAULT_PATTERNS) {
+    out = out.replace(re, replacement ?? `<redacted:${kind}>`);
+  }
+  if (process.env.OLAM_REDACT_PII === '1') {
+    out = out.replace(EMAIL_PATTERN.re, `<redacted:${EMAIL_PATTERN.kind}>`);
+  }
+  if (process.env.OLAM_REDACT_HIGH_ENTROPY === '1') {
+    out = out.replace(HIGH_ENTROPY_PATTERN.re, (m) =>
+      HIGH_ENTROPY_ALLOWLIST.has(m) || /^<redacted:/.test(m) ? m : `<redacted:${HIGH_ENTROPY_PATTERN.kind}>`,
+    );
+  }
+  return out;
+}
+/**
+ * Deep-walk `value`, redacting strings. Returns a new value; input is
+ * never mutated. Primitives + null pass through unchanged (except strings,
+ * which are run through `redactString`). Cycles produce `'<cycle>'`.
+ *
+ * @template T
+ * @param {T} value
+ * @returns {T}
+ */
+export function redactSensitive(value) {
+  if (process.env.OLAM_REDACTION_DISABLED === '1') return value;
+  return walk(value, new WeakSet());
+}
+function walk(value, seen) {
+  if (typeof value === 'string') return redactString(value);
+  if (value === null || typeof value !== 'object') return value;
+  if (seen.has(value)) return '<cycle>';
+  seen.add(value);
+  if (Array.isArray(value)) return value.map((v) => walk(v, seen));
+  const out = {};
+  for (const k of Object.keys(value)) {
+    out[k] = walk(value[k], seen);
+  }
+  return out;
+}

package/host-cp/recovery/engine.mjs ADDED Viewed

@@ -0,0 +1,148 @@
+// Recovery engine — the single entry point for bounded auto-attempts.
+//
+// Key invariants:
+//   1. ONE attempt per (worldId, failureKind) pair. The ledger enforces
+//      idempotency: a second call with the same key returns the prior
+//      entry with outcome='escalated'.
+//   2. Concurrent calls for the same (worldId, failureKind) key fire only
+//      ONE attempt. An in-flight Map holds the running Promise; concurrent
+//      callers await the same Promise.
+//   3. Steps execute in order. First failing step short-circuits to
+//      outcome='failed'; subsequent steps are NOT executed.
+//   4. All attempts (success, failed, escalated) are written to the ledger.
+//
+// The engine is async and pure-functional with respect to the host-stream:
+// callers (server.mjs) are responsible for emitting the recovery.* events
+// AFTER receiving the returned RecoveryLedgerEntry. The engine does not
+// broadcast directly, keeping it testable without a hostStream fixture.
+import { findScenarioForKind } from './scenarios.mjs';
+import { appendLedgerEntry, findPriorEntry } from './ledger.mjs';
+import { runStep } from './step-runners.mjs';
+import { DEFAULT_LEDGER_PATH } from './ledger.mjs';
+/**
+ * @typedef {import('./ledger.mjs').RecoveryLedgerEntry} RecoveryLedgerEntry
+ * @typedef {import('../lifecycle/failure-kinds.mjs').WorldStartupFailureKind | null} FailureKindOrNull
+ */
+// In-flight promise map: key = `${worldId}::${failureKind ?? 'null'}` → Promise<RecoveryLedgerEntry>
+/** @type {Map<string, Promise<RecoveryLedgerEntry>>} */
+const _inFlight = new Map();
+/**
+ * Attempt a bounded recovery for the given world + failure kind.
+ *
+ * @param {string}            worldId
+ * @param {object}            [evidence]     — WorldStartupEvidence, optional
+ * @param {FailureKindOrNull} [failureKind]  — classified bucket, or null for non-FSM triggers
+ * @param {{ ledgerPath?: string, log?: (msg: string) => void }} [opts]
+ * @returns {Promise<RecoveryLedgerEntry>}
+ */
+export function attemptRecovery(worldId, evidence, failureKind = null, opts = {}) {
+  const key = `${worldId}::${failureKind ?? 'null'}`;
+  const existing = _inFlight.get(key);
+  if (existing) return existing;
+  const promise = _attempt(worldId, evidence, failureKind, opts).finally(() => {
+    _inFlight.delete(key);
+  });
+  _inFlight.set(key, promise);
+  return promise;
+}
+/**
+ * Internal: run the actual attempt. Errors thrown by recovery steps are
+ * captured into the returned ledger entry (outcome='failed').
+ *
+ * NOTE(review): the ledger calls (`findPriorEntry`, `appendLedgerEntry`) are
+ * awaited outside any try/catch, so an I/O failure there rejects this
+ * promise — and with it the promise returned by `attemptRecovery`. Callers
+ * should not assume it always resolves.
+ *
+ * @param {string}            worldId
+ * @param {object}            [evidence]
+ * @param {FailureKindOrNull} failureKind
+ * @param {{ ledgerPath?: string, log?: (msg: string) => void }} opts
+ * @returns {Promise<RecoveryLedgerEntry>}
+ */
+async function _attempt(worldId, evidence, failureKind, opts) {
+  // Defaults: the shared ledger file, and console.warn with a `[recovery]` prefix.
+  const { ledgerPath = DEFAULT_LEDGER_PATH, log = (msg) => console.warn(`[recovery] ${msg}`) } = opts;
+  const startedAt = Date.now();
+  // Idempotency check: if a prior entry exists for this key, return it
+  // with outcome='escalated' and write an escalated entry.
+  const prior = await findPriorEntry(worldId, failureKind, ledgerPath);
+  if (prior !== undefined) {
+    // No step runs on this path, hence the empty `stepsRun`.
+    const escalated = /** @type {RecoveryLedgerEntry} */ ({
+      worldId,
+      failureKind: failureKind ?? null,
+      scenario: prior.scenario,
+      stepsRun: [],
+      startedAt,
+      endedAt: Date.now(),
+      outcome: 'escalated',
+      errorReason: `prior attempt already recorded (outcome=${prior.outcome})`,
+    });
+    await appendLedgerEntry(escalated, ledgerPath);
+    log(`recovery idempotency: escalated (worldId=${worldId}, kind=${failureKind})`);
+    return escalated;
+  }
+  // Find the scenario.
+  const scenario = findScenarioForKind(failureKind);
+  if (!scenario) {
+    // Still recorded, so a later call for the same key takes the escalation path.
+    const entry = /** @type {RecoveryLedgerEntry} */ ({
+      worldId,
+      failureKind: failureKind ?? null,
+      scenario: 'unmatched',
+      stepsRun: [],
+      startedAt,
+      endedAt: Date.now(),
+      outcome: 'failed',
+      errorReason: 'no scenario matched',
+    });
+    await appendLedgerEntry(entry, ledgerPath);
+    log(`recovery: no scenario for kind=${failureKind} (worldId=${worldId})`);
+    return entry;
+  }
+  log(`recovery: starting scenario="${scenario.name}" for worldId=${worldId}`);
+  // Execute steps in order, short-circuit on first failure.
+  /** @type {import('./recipes.mjs').RecoveryStep[]} */
+  const stepsRun = [];
+  /** @type {string | undefined} */
+  let errorReason;
+  /** @type {'success' | 'failed'} */
+  let outcome = 'success';
+  for (const step of scenario.recipe.steps) {
+    // Recorded before it runs, so a step that throws still appears in `stepsRun`.
+    stepsRun.push(step);
+    try {
+      await runStep(step, { worldId, evidence, log });
+    } catch (err) {
+      outcome = 'failed';
+      errorReason = `step "${step.kind}" threw: ${err?.message ?? String(err)}`;
+      log(`recovery: step failed — ${errorReason}`);
+      break;
+    }
+  }
+  const entry = /** @type {RecoveryLedgerEntry} */ ({
+    worldId,
+    failureKind: failureKind ?? null,
+    scenario: scenario.name,
+    stepsRun,
+    startedAt,
+    endedAt: Date.now(),
+    outcome,
+    // `errorReason` is present only when a step failed.
+    ...(errorReason !== undefined ? { errorReason } : {}),
+  });
+  await appendLedgerEntry(entry, ledgerPath);
+  log(`recovery: scenario="${scenario.name}" outcome=${outcome} (worldId=${worldId})`);
+  return entry;
+}
+/**
+ * Exposed for testing only: clear the in-flight map so tests don't bleed state.
+ * This only forgets the tracked promises; it does not cancel attempts that
+ * are already running.
+ */
+export function _clearInFlight() {
+  _inFlight.clear();
+}

package/host-cp/recovery/index.mjs ADDED Viewed

@@ -0,0 +1,16 @@
+// Recovery module barrel export.
+//
+// Public surface:
+//   - attemptRecovery   — the engine entry point for callers (server.mjs)
+//   - FAILURE_SCENARIOS — the 7 named scenarios
+//   - findScenarioForKind — scenario lookup by failureKind
+//   - appendLedgerEntry / readAllLedgerEntries / findPriorEntry — ledger I/O
+//   - DEFAULT_LEDGER_PATH — default ledger file location
+//   - runStep — runs a single recovery step
+//   - setStepRunnerSeams — test seam injection for step runners
+//
+// Internal:
+//   - _clearInFlight — test helper; not intended for production use
+export { attemptRecovery, _clearInFlight } from './engine.mjs';
+export { FAILURE_SCENARIOS, findScenarioForKind } from './scenarios.mjs';
+export { appendLedgerEntry, readAllLedgerEntries, findPriorEntry, DEFAULT_LEDGER_PATH } from './ledger.mjs';
+export { runStep, setStepRunnerSeams } from './step-runners.mjs';

package/host-cp/recovery/ledger.mjs ADDED Viewed

@@ -0,0 +1,105 @@
+// RecoveryLedger — append-only NDJSON persistence for recovery attempts.
+//
+// Each attempt writes one JSON line to the ledger file. The file grows
+// monotonically; entries are never updated in-place. This keeps the
+// ledger auditable and safe to tail/parse with `jq` while the process
+// is running.
+//
+// Default path: ~/.olam/logs/recovery-ledger.ndjson
+// Override: set OLAM_RECOVERY_LEDGER_PATH (useful in tests — point at a
+// tmp file to isolate test runs from the real operator ledger).
+import { open, mkdir, access } from 'node:fs/promises';
+import { join, dirname } from 'node:path';
+import { homedir } from 'node:os';
+import { createReadStream } from 'node:fs';
+import { createInterface } from 'node:readline';
+import { redactSensitive } from '../observability/redactor.mjs';
+// Resolved once, when this module is first imported: changing
+// OLAM_RECOVERY_LEDGER_PATH later in the process does not affect it.
+export const DEFAULT_LEDGER_PATH =
+  process.env.OLAM_RECOVERY_LEDGER_PATH ??
+  join(homedir(), '.olam', 'logs', 'recovery-ledger.ndjson');
+/**
+ * @typedef {object} RecoveryLedgerEntry
+ * @property {string}                   worldId
+ * @property {string | null}            failureKind    — WorldStartupFailureKind or null (non-FSM trigger)
+ * @property {string}                   scenario       — kebab-case scenario name, or 'unmatched'
+ * @property {import('./recipes.mjs').RecoveryStep[]} stepsRun  — steps actually executed (may be partial on failure)
+ * @property {number}                   startedAt      — epoch ms
+ * @property {number}                   endedAt        — epoch ms
+ * @property {'success' | 'failed' | 'escalated'} outcome
+ * @property {string}                   [errorReason]  — set on failed/escalated outcomes
+ */
+/**
+ * Append a single RecoveryLedgerEntry to the ledger file.
+ *
+ * @param {RecoveryLedgerEntry} entry
+ * @param {string} [ledgerPath]
+ * @returns {Promise<void>}
+ */
+export async function appendLedgerEntry(entry, ledgerPath = DEFAULT_LEDGER_PATH) {
+  await mkdir(dirname(ledgerPath), { recursive: true });
+  const fh = await open(ledgerPath, 'a');
+  try {
+    await fh.write(JSON.stringify(redactSensitive(entry)) + '\n');
+  } finally {
+    await fh.close();
+  }
+}
+/**
+ * Read all entries from the ledger (in append order).
+ *
+ * Returns an empty array when the file cannot be accessed. Blank lines and
+ * lines that fail to parse as JSON are skipped. Parsed values are not
+ * validated against the RecoveryLedgerEntry shape.
+ *
+ * @param {string} [ledgerPath]
+ * @returns {Promise<RecoveryLedgerEntry[]>}
+ */
+export async function readAllLedgerEntries(ledgerPath = DEFAULT_LEDGER_PATH) {
+  /** @type {RecoveryLedgerEntry[]} */
+  const entries = [];
+  // Check existence before streaming — createReadStream emits ENOENT as an
+  // error event (not a synchronous throw), which propagates through the
+  // `for await` loop and would reject the caller. An explicit access check
+  // keeps the "not yet written" path simple.
+  // NOTE(review): any `access` failure (not only ENOENT) takes this path, and
+  // a file removed between this check and the read would still reject.
+  try {
+    await access(ledgerPath);
+  } catch {
+    return entries; // File does not exist yet.
+  }
+  const stream = createReadStream(ledgerPath, { encoding: 'utf8' });
+  // `crlfDelay: Infinity` makes readline treat `\r\n` as one line break.
+  const rl = createInterface({ input: stream, crlfDelay: Infinity });
+  for await (const line of rl) {
+    const trimmed = line.trim();
+    if (!trimmed) continue;
+    try {
+      entries.push(JSON.parse(trimmed));
+    } catch {
+      // Malformed line — skip and continue.
+    }
+  }
+  return entries;
+}
+/**
+ * Find the most recent ledger entry for a (worldId, failureKind) pair.
+ * Returns undefined if no prior entry exists.
+ *
+ * @param {string}      worldId
+ * @param {string|null} failureKind
+ * @param {string}      [ledgerPath]
+ * @returns {Promise<RecoveryLedgerEntry | undefined>}
+ */
+export async function findPriorEntry(worldId, failureKind, ledgerPath = DEFAULT_LEDGER_PATH) {
+  const all = await readAllLedgerEntries(ledgerPath);
+  // Walk in reverse to find the most recent match.
+  for (let i = all.length - 1; i >= 0; i--) {
+    const e = all[i];
+    if (e.worldId === worldId && e.failureKind === (failureKind ?? null)) {
+      return e;
+    }
+  }
+  return undefined;
+}

package/host-cp/recovery/recipes.mjs ADDED Viewed

@@ -0,0 +1,46 @@
+// Recovery step types and recipe interface — the discriminated union of
+// all named steps that can appear in a RecoveryRecipe.
+//
+// Step runners for each kind live in step-runners.mjs. The engine in
+// engine.mjs iterates a recipe's steps array and dispatches each to the
+// appropriate runner.
+//
+// A RecoveryRecipe is an ordered list of steps. Steps execute in order;
+// the first failing step short-circuits to a 'failed' outcome.
+/**
+ * @typedef {{ kind: 'NotifyOperator', message?: string }}                NotifyOperatorStep
+ * @typedef {{ kind: 'ResendTrustPrompt' }}                                ResendTrustPromptStep
+ * @typedef {{ kind: 'WaitFor', durationMs: number }}                      WaitForStep
+ * @typedef {{ kind: 'RestartTransport' }}                                 RestartTransportStep
+ * @typedef {{ kind: 'ResendDispatch' }}                                   ResendDispatchStep
+ * @typedef {{ kind: 'RestartWorker' }}                                    RestartWorkerStep
+ * @typedef {{ kind: 'RestartMcpServer', serverName: string }}             RestartMcpServerStep
+ * @typedef {{ kind: 'RetryHandshake', timeoutMs: number }}                RetryHandshakeStep
+ * @typedef {{ kind: 'ReadPluginErrors' }}                                 ReadPluginErrorsStep
+ * @typedef {{ kind: 'RestartPlugin', pluginName: string }}                RestartPluginStep
+ * @typedef {{ kind: 'RebaseBranch' }}                                     RebaseBranchStep
+ * @typedef {{ kind: 'CleanBuild' }}                                       CleanBuildStep
+ *
+ * @typedef {| NotifyOperatorStep
+ *           | ResendTrustPromptStep
+ *           | WaitForStep
+ *           | RestartTransportStep
+ *           | ResendDispatchStep
+ *           | RestartWorkerStep
+ *           | RestartMcpServerStep
+ *           | RetryHandshakeStep
+ *           | ReadPluginErrorsStep
+ *           | RestartPluginStep
+ *           | RebaseBranchStep
+ *           | CleanBuildStep
+ *           } RecoveryStep
+ */
+/**
+ * @typedef {object} RecoveryRecipe
+ * @property {string}          scenarioName  — human-readable name of the scenario
+ * @property {RecoveryStep[]}  steps         — ordered list of steps to execute
+ */
+export {};