@pleri/olam-cli 0.1.168 → 0.1.170

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/README.md +38 -0
  2. package/dist/commands/auth-status.d.ts +1 -0
  3. package/dist/commands/auth-status.d.ts.map +1 -1
  4. package/dist/commands/auth-status.js +45 -4
  5. package/dist/commands/auth-status.js.map +1 -1
  6. package/dist/commands/create.d.ts.map +1 -1
  7. package/dist/commands/create.js +26 -0
  8. package/dist/commands/create.js.map +1 -1
  9. package/dist/commands/enter.d.ts.map +1 -1
  10. package/dist/commands/enter.js +5 -0
  11. package/dist/commands/enter.js.map +1 -1
  12. package/dist/commands/resume.d.ts +63 -0
  13. package/dist/commands/resume.d.ts.map +1 -0
  14. package/dist/commands/resume.js +174 -0
  15. package/dist/commands/resume.js.map +1 -0
  16. package/dist/commands/setup.d.ts +19 -0
  17. package/dist/commands/setup.d.ts.map +1 -1
  18. package/dist/commands/setup.js +157 -19
  19. package/dist/commands/setup.js.map +1 -1
  20. package/dist/image-digests.json +8 -8
  21. package/dist/index.js +1021 -576
  22. package/dist/index.js.map +1 -1
  23. package/dist/lib/health-probes.d.ts +28 -0
  24. package/dist/lib/health-probes.d.ts.map +1 -1
  25. package/dist/lib/health-probes.js +75 -0
  26. package/dist/lib/health-probes.js.map +1 -1
  27. package/dist/lib/k8s-context-discovery.d.ts +80 -0
  28. package/dist/lib/k8s-context-discovery.d.ts.map +1 -0
  29. package/dist/lib/k8s-context-discovery.js +102 -0
  30. package/dist/lib/k8s-context-discovery.js.map +1 -0
  31. package/dist/mcp-server.js +1273 -771
  32. package/dist/spawn/home-override.d.ts +82 -0
  33. package/dist/spawn/home-override.d.ts.map +1 -0
  34. package/dist/spawn/home-override.js +107 -0
  35. package/dist/spawn/home-override.js.map +1 -0
  36. package/hermes-bundle/version.json +1 -1
  37. package/host-cp/k8s/manifests/30-configmap.yaml +5 -0
  38. package/host-cp/k8s/manifests/50-deployment.yaml +9 -2
  39. package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
  40. package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
  41. package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
  42. package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
  43. package/host-cp/lifecycle/classify.mjs +110 -0
  44. package/host-cp/lifecycle/emit.mjs +119 -0
  45. package/host-cp/lifecycle/evidence.mjs +45 -0
  46. package/host-cp/lifecycle/failure-kinds.mjs +56 -0
  47. package/host-cp/lifecycle/index.mjs +22 -0
  48. package/host-cp/lifecycle/phases.mjs +52 -0
  49. package/host-cp/observability/grafana-port-forward.sh +1 -1
  50. package/host-cp/observability/kyverno-cardinality-mutate.sh +2 -2
  51. package/host-cp/observability/loki-ingest.sh +1 -1
  52. package/host-cp/observability/ndjson-span-sink.mjs +131 -0
  53. package/host-cp/observability/prom-no-double-grafana.sh +4 -4
  54. package/host-cp/observability/redactor.mjs +72 -0
  55. package/host-cp/recovery/engine.mjs +148 -0
  56. package/host-cp/recovery/index.mjs +16 -0
  57. package/host-cp/recovery/ledger.mjs +105 -0
  58. package/host-cp/recovery/recipes.mjs +46 -0
  59. package/host-cp/recovery/scenarios.mjs +124 -0
  60. package/host-cp/recovery/step-runners.mjs +263 -0
  61. package/host-cp/src/docker-events.mjs +30 -6
  62. package/host-cp/src/pr-nanny.mjs +55 -3
  63. package/host-cp/src/server.mjs +173 -0
  64. package/package.json +1 -1
@@ -0,0 +1,52 @@
1
+ // World lifecycle phases — the canonical FSM every Olam world walks
2
+ // through from spawn to terminal state.
3
+ //
4
+ // Order is load-bearing: a world's `lastPhase` is a monotonic high-water
5
+ // mark, and the classifier's precedence rules in classify.mjs assume
6
+ // this ordering when deciding which failure bucket to attribute a stall
7
+ // to. Do NOT reorder without updating the classifier.
8
+
/**
 * @typedef {'Spawning' | 'TrustRequired' | 'ReadyForPrompt' | 'Running' | 'Finished' | 'Failed'} WorldLifecyclePhase
 */

/**
 * Enum-style lookup of every lifecycle phase; each key maps to its own name.
 * Frozen so the table cannot be edited at runtime.
 *
 * @type {Readonly<Record<WorldLifecyclePhase, WorldLifecyclePhase>>}
 */
export const WorldLifecyclePhase = Object.freeze({
  /** Container or worktree created; before any code runs inside. */
  Spawning: 'Spawning',
  /** Agent process up; awaiting trust-gate approval. */
  TrustRequired: 'TrustRequired',
  /** Trust granted; awaiting initial dispatch. */
  ReadyForPrompt: 'ReadyForPrompt',
  /** Actively processing dispatch. */
  Running: 'Running',
  /** Completed successfully. Terminal. */
  Finished: 'Finished',
  /** Terminal failure. Carries an evidence bundle + classified failure kind. */
  Failed: 'Failed',
});

/** Phases in canonical order. Useful for ordinal comparison. */
export const WORLD_LIFECYCLE_PHASE_ORDER = Object.freeze([
  WorldLifecyclePhase.Spawning,
  WorldLifecyclePhase.TrustRequired,
  WorldLifecyclePhase.ReadyForPrompt,
  WorldLifecyclePhase.Running,
  WorldLifecyclePhase.Finished,
  WorldLifecyclePhase.Failed,
]);

/**
 * Build a Set whose contents cannot be changed through its own methods.
 *
 * `Object.freeze` alone only freezes a Set's ordinary properties; the
 * members live in an internal slot, so `add`/`delete`/`clear` keep working
 * on a frozen Set. Shadowing those methods with a throwing function before
 * freezing closes that gap while `has`, `size` and iteration stay intact.
 *
 * @template T
 * @param {Iterable<T>} values - initial (and only) members
 * @param {string} label - name used in the error message
 * @returns {ReadonlySet<T>}
 */
function createReadonlySet(values, label) {
  const set = new Set(values);
  const rejectMutation = () => {
    throw new TypeError(`${label} is read-only`);
  };
  for (const method of ['add', 'delete', 'clear']) {
    Object.defineProperty(set, method, { value: rejectMutation });
  }
  return Object.freeze(set);
}

/** Terminal phases — no transitions out. Mutating calls throw a TypeError. */
export const TERMINAL_PHASES = createReadonlySet(
  [WorldLifecyclePhase.Finished, WorldLifecyclePhase.Failed],
  'TERMINAL_PHASES',
);

/**
 * Type guard: true only for strings that name a lifecycle phase exactly
 * (case-sensitive).
 *
 * @param {unknown} value
 * @returns {value is WorldLifecyclePhase}
 */
export function isWorldLifecyclePhase(value) {
  return typeof value === 'string' && WORLD_LIFECYCLE_PHASE_ORDER.includes(/** @type {any} */ (value));
}
@@ -145,7 +145,7 @@ helm upgrade --install "$GRAFANA_RELEASE" grafana/grafana \
145
145
  --create-namespace \
146
146
  -f "$PERIPHERAL_SERVICES_DIR/helm-values/grafana-values.yaml" \
147
147
  --wait \
148
- --timeout 300s
148
+ --timeout "${OLAM_HELM_TIMEOUT:-600s}"
149
149
 
150
150
  log "Grafana Helm install complete"
151
151
 
@@ -49,7 +49,7 @@ KYVERNO_NAMESPACE="kyverno"
49
49
  TEST_NAMESPACE="monitoring"
50
50
  PROM_LOCAL_PORT="9092" # 9090, 9091 may be in use by sibling Phase C scripts
51
51
  PF_BIND_SECONDS=5
52
- TARGET_DISCOVERY_TIMEOUT=180
52
+ TARGET_DISCOVERY_TIMEOUT="${OLAM_PROM_DISCOVERY_TIMEOUT:-240}" # bumped from 180s; one CI attempt observed kyverno-emitter still not scraped at 180s
53
53
  SCRAPE_POLL_INTERVAL=10
54
54
 
55
55
  log() { printf '[kyverno-mutate] %s\n' "$*" >&2; }
@@ -122,7 +122,7 @@ helm upgrade --install olam-kyverno kyverno/kyverno \
122
122
  --namespace "$KYVERNO_NAMESPACE" \
123
123
  --create-namespace \
124
124
  -f "$PERIPHERAL_SERVICES_DIR/helm-values/kyverno-values.yaml" \
125
- --wait --timeout 300s 2>&1 | tail -8
125
+ --wait --timeout "${OLAM_HELM_TIMEOUT:-600s}" 2>&1 | tail -8
126
126
 
127
127
  # Sanity: kyverno-admission-controller Deployment Ready.
128
128
  kubectl get deployment -n "$KYVERNO_NAMESPACE" -l "app.kubernetes.io/component=admission-controller" \
@@ -93,7 +93,7 @@ helm upgrade --install "$LOKI_RELEASE" grafana/loki \
93
93
  --create-namespace \
94
94
  -f "$PERIPHERAL_SERVICES_DIR/helm-values/loki-values.yaml" \
95
95
  --wait \
96
- --timeout 300s
96
+ --timeout "${OLAM_HELM_TIMEOUT:-600s}"
97
97
 
98
98
  log "loki helm install complete"
99
99
 
@@ -0,0 +1,131 @@
1
+ // NDJSON span sink — zero-config observability for host-cp.
2
+ //
3
+ // Subscribes to the host-stream broadcaster and writes one JSON line per
4
+ // `span` event to ~/.olam/logs/host.trace.ndjson. Each span carries the
5
+ // minimum surface needed for `jq`-based triage: identity, timing, exit.
6
+ //
7
+ // Wire shape per line:
8
+ // { traceId, spanId, parentSpanId, name, startedAt, durationMs,
9
+ // attributes, events[], exit: { _tag: 'Success'|'Failure', reason? } }
10
+ //
11
+ // Rotation: single level — at 50MB the file is renamed to `.1` and a
12
+ // fresh file is opened. The previous `.1` (if any) is overwritten. We
13
+ // keep at most one prior generation; deeper retention belongs to the
14
+ // operator's normal disk-management tooling.
15
+ //
16
+ // Override path with OLAM_TRACE_LOG_PATH (set to /dev/null in tests that
17
+ // don't care about file output, or to a temp file to assert on writes).
18
+
19
+ import { open, mkdir, rename } from 'node:fs/promises';
20
+ import { join, dirname } from 'node:path';
21
+ import { homedir } from 'node:os';
22
+ import { redactSensitive } from './redactor.mjs';
23
+
const DEFAULT_ROTATE_BYTES = 50 * 1024 * 1024;
const DEFAULT_LOG_PATH =
  process.env.OLAM_TRACE_LOG_PATH ??
  join(homedir(), '.olam', 'logs', 'host.trace.ndjson');

/**
 * Create a sink that appends one redacted JSON line per span to `logPath`.
 *
 * Writes are serialized through a promise chain so lines never interleave.
 * Write failures are swallowed on purpose: tracing is best-effort and must
 * not take the host down.
 *
 * @param {object} [options]
 * @param {string} [options.logPath] - target file; parent directories are created
 * @param {number} [options.rotateBytes] - size at which the file is renamed to `<logPath>.1`
 * @param {{ addSink?: Function }} [options.hostStream] - when it exposes
 *   `addSink`, the sink attaches itself and records every `span` event
 * @returns {Promise<{ recordSpan: (span?: object) => Promise<void>, close: () => Promise<void> }>}
 */
export async function createNdjsonSpanSink({
  logPath = DEFAULT_LOG_PATH,
  rotateBytes = DEFAULT_ROTATE_BYTES,
  hostStream,
} = {}) {
  await mkdir(dirname(logPath), { recursive: true });
  let fh = await open(logPath, 'a');
  const openedStat = await fh.stat();
  // Only regular files are rotated. The documented `/dev/null` override is a
  // device node; renaming it would either fail or (as root) remove it.
  const rotatable = openedStat.isFile();
  let bytesWritten = openedStat.size;
  let closed = false;
  let chain = Promise.resolve();

  // Single-level rotation. Never leaves `fh` pointing at a closed handle:
  // the handle is cleared first, and `writeLine` reopens lazily if the
  // reopen below fails.
  async function rotate() {
    const previous = fh;
    fh = null;
    await previous.close();
    try {
      await rename(logPath, `${logPath}.1`);
    } catch {
      // Best-effort: keep appending to the current file and retry once
      // another `rotateBytes` have been written.
    }
    fh = await open(logPath, 'a');
    bytesWritten = 0;
  }

  async function writeLine(line) {
    if (closed) return;
    if (fh === null) fh = await open(logPath, 'a');
    await fh.write(line);
    bytesWritten += Buffer.byteLength(line);
    if (rotatable && bytesWritten >= rotateBytes) {
      await rotate();
    }
  }

  function recordSpan(span = {}) {
    const {
      name, startedAt, endedAt, attributes, events, exit,
      traceId, spanId, parentSpanId, reason,
    } = span;
    const haveTimes = typeof endedAt === 'number' && typeof startedAt === 'number';
    const durationMs = haveTimes ? endedAt - startedAt : null;
    let finalExit;
    if (exit && typeof exit === 'object' && (exit._tag === 'Success' || exit._tag === 'Failure')) {
      // A well-formed exit wins; only a Failure carries its reason through.
      finalExit = exit._tag === 'Failure' && exit.reason !== undefined
        ? { _tag: 'Failure', reason: exit.reason }
        : { _tag: exit._tag };
    } else if (!haveTimes) {
      // No usable exit and no complete timing: recorded as a failure.
      finalExit = reason !== undefined ? { _tag: 'Failure', reason } : { _tag: 'Failure' };
    } else {
      finalExit = { _tag: 'Success' };
    }
    const record = {
      traceId: traceId ?? null,
      spanId: spanId ?? null,
      parentSpanId: parentSpanId ?? null,
      name: name ?? null,
      startedAt: startedAt ?? null,
      durationMs,
      attributes: redactSensitive(attributes ?? {}),
      events: redactSensitive(events ?? []),
      exit: finalExit,
    };
    // Deliberate swallow: a failed write must not break the chain or the caller.
    const next = chain.then(() => writeLine(JSON.stringify(record) + '\n')).catch(() => {});
    chain = next;
    return next;
  }

  let detach = null;
  if (hostStream && typeof hostStream.addSink === 'function') {
    detach = hostStream.addSink(createSseSpanAdapter((payload) => recordSpan(payload)));
  }

  return {
    recordSpan,
    async close() {
      if (closed) return;
      if (detach) detach();
      // Drain queued writes BEFORE flipping the closed flag — `writeLine`
      // bails on `closed`, so flipping first would silently drop spans
      // recorded just prior to shutdown.
      await chain;
      closed = true;
      try {
        if (fh !== null) await fh.close();
      } catch { /* already closed */ }
    },
  };
}
104
+
/**
 * Minimal stand-in for a ServerResponse, suitable for host-stream's
 * `addSink`. It accumulates written chunks, cuts them into SSE frames at
 * each blank line, and hands the parsed `data: ` payload of every
 * `event: span` frame to `onSpan`. Frames of any other type, frames without
 * a data line, and frames whose payload fails to parse (or whose handler
 * throws) are dropped silently.
 *
 * @param {(payload: unknown) => void} onSpan - receives each parsed span payload
 * @returns {{ writableEnded: boolean, destroyed: boolean, write: (chunk: unknown) => boolean, once: () => void, end: () => void }}
 */
function createSseSpanAdapter(onSpan) {
  const FRAME_END = '\n\n';
  const DATA_PREFIX = 'data: ';
  let pending = '';

  const dispatchFrame = (frame) => {
    const lines = frame.split('\n');
    // The event line must be followed by at least one more line, so the
    // frame's final line is excluded from the search.
    if (!lines.slice(0, -1).includes('event: span')) return;
    const payloadLine = lines.find((line) => line.startsWith(DATA_PREFIX));
    if (payloadLine === undefined) return;
    try {
      onSpan(JSON.parse(payloadLine.slice(DATA_PREFIX.length)));
    } catch { /* malformed frame */ }
  };

  return {
    writableEnded: false,
    destroyed: false,
    write(chunk) {
      pending += String(chunk);
      let cut = pending.indexOf(FRAME_END);
      while (cut !== -1) {
        const frame = pending.slice(0, cut);
        pending = pending.slice(cut + FRAME_END.length);
        dispatchFrame(frame);
        cut = pending.indexOf(FRAME_END);
      }
      // Always reports "not backpressured".
      return true;
    },
    once() { /* no drain handling needed — in-memory adapter never backpressures */ },
    end() { this.writableEnded = true; },
  };
}
@@ -96,7 +96,7 @@ helm upgrade --install "$PROM_RELEASE" prometheus-community/kube-prometheus-stac
96
96
  --create-namespace \
97
97
  -f "$PERIPHERAL_SERVICES_DIR/helm-values/kube-prom-stack-values.yaml" \
98
98
  --wait \
99
- --timeout 600s
99
+ --timeout "${OLAM_HELM_TIMEOUT:-600s}"
100
100
 
101
101
  log "kube-prometheus-stack helm install complete"
102
102
 
@@ -143,7 +143,7 @@ helm upgrade olam-loki grafana/loki \
143
143
  --namespace "$NAMESPACE" \
144
144
  -f "$PERIPHERAL_SERVICES_DIR/helm-values/loki-values.yaml" \
145
145
  --wait \
146
- --timeout 300s \
146
+ --timeout "${OLAM_HELM_TIMEOUT:-600s}" \
147
147
  --reuse-values \
148
148
  --set monitoring.serviceMonitor.enabled=true
149
149
 
@@ -154,7 +154,7 @@ helm upgrade olam-promtail grafana/promtail \
154
154
  --namespace "$NAMESPACE" \
155
155
  -f "$PERIPHERAL_SERVICES_DIR/helm-values/promtail-values.yaml" \
156
156
  --wait \
157
- --timeout 300s \
157
+ --timeout "${OLAM_HELM_TIMEOUT:-600s}" \
158
158
  --reuse-values \
159
159
  --set serviceMonitor.enabled=true
160
160
 
@@ -165,7 +165,7 @@ helm upgrade "$GRAFANA_RELEASE" grafana/grafana \
165
165
  --namespace "$NAMESPACE" \
166
166
  -f "$PERIPHERAL_SERVICES_DIR/helm-values/grafana-values.yaml" \
167
167
  --wait \
168
- --timeout 300s \
168
+ --timeout "${OLAM_HELM_TIMEOUT:-600s}" \
169
169
  --reuse-values \
170
170
  --set serviceMonitor.enabled=true
171
171
 
@@ -0,0 +1,72 @@
1
+ // Privacy Guard — regex-based auto-redactor for trace + recovery ledger.
2
+ //
3
+ // Deep-walks an object, finds string values, applies an ordered list of
4
+ // regex patterns, returns a redacted COPY (immutable; input untouched).
5
+ // Each match is replaced with `<redacted:<kind>>`.
6
+ //
7
+ // Default-ON patterns (7), in the order applied: anthropic-key, openai-key, aws-key, gh-token, jwt, slack-token, bearer.
8
+ // Opt-in (env-gated): email PII (OLAM_REDACT_PII=1), high-entropy strings
9
+ // (OLAM_REDACT_HIGH_ENTROPY=1). Hard short-circuit: OLAM_REDACTION_DISABLED=1.
10
+ //
11
+ // Precedence matters: anthropic runs before openai (otherwise the OpenAI
12
+ // `sk-...` regex would steal `sk-ant-...` and emit the wrong tag). Bearer
13
+ // runs after the high-specificity key patterns so a bearer-wrapped key
14
+ // gets the tighter tag.
15
+
// Always-on secret patterns, applied top to bottom. `kind` becomes the tag in
// `<redacted:kind>` unless the entry supplies its own `replacement`.
const DEFAULT_PATTERNS = [
  { kind: 'anthropic-key', re: /\bsk-ant-(?:api|admin)[A-Za-z0-9_-]{20,}\b/g },
  { kind: 'openai-key', re: /\bsk-(?:proj-)?[A-Za-z0-9_-]{32,}\b/g },
  { kind: 'aws-key', re: /\bAKIA[A-Z0-9]{16}\b/g },
  { kind: 'gh-token', re: /\bgh[poursa]_[A-Za-z0-9_]{36,}\b/g },
  { kind: 'jwt', re: /\beyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b/g },
  { kind: 'slack-token', re: /\bxox[abposr]-[A-Za-z0-9-]{10,}\b/g },
  { kind: 'bearer', re: /Bearer\s+[A-Za-z0-9._~+/-]+=*/gi, replacement: 'Bearer <redacted:bearer>' },
];

// Opt-in patterns, enabled per call through environment variables.
const EMAIL_PATTERN = { kind: 'email', re: /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/gi };
// NOTE(review): no `i` flag here, so only runs of UPPERCASE letters, digits,
// `_` and `-` are matched; lowercase or mixed-case tokens pass through.
// Confirm whether that is intended.
const HIGH_ENTROPY_PATTERN = { kind: 'high-entropy', re: /\b[A-Z0-9_-]{32,}\b/g };
// NOTE(review): every entry is shorter than the 32-character minimum above,
// so this allowlist cannot currently exempt anything.
const HIGH_ENTROPY_ALLOWLIST = new Set(['UUID', 'CHUNK_ID', '__filename', '__dirname']);

/**
 * Redact secrets inside a single string.
 *
 * The environment is consulted on every call: OLAM_REDACTION_DISABLED=1
 * returns the input untouched, OLAM_REDACT_PII=1 adds e-mail redaction and
 * OLAM_REDACT_HIGH_ENTROPY=1 adds the high-entropy pass.
 *
 * @param {string} s
 * @returns {string}
 */
function redactString(s) {
  if (process.env.OLAM_REDACTION_DISABLED === '1') return s;

  let out = DEFAULT_PATTERNS.reduce(
    (text, { kind, re, replacement }) => text.replace(re, replacement ?? `<redacted:${kind}>`),
    s,
  );

  if (process.env.OLAM_REDACT_PII === '1') {
    out = out.replace(EMAIL_PATTERN.re, `<redacted:${EMAIL_PATTERN.kind}>`);
  }

  if (process.env.OLAM_REDACT_HIGH_ENTROPY === '1') {
    out = out.replace(HIGH_ENTROPY_PATTERN.re, (match) => {
      const keep = HIGH_ENTROPY_ALLOWLIST.has(match) || /^<redacted:/.test(match);
      if (keep) return match;
      return `<redacted:${HIGH_ENTROPY_PATTERN.kind}>`;
    });
  }

  return out;
}
46
+
/**
 * Deep-walk `value`, redacting strings. Returns a new value; input is
 * never mutated. Primitives + null pass through unchanged (except strings,
 * which are run through `redactString`). A reference back to an object that
 * is still being walked (a true cycle) produces `'<cycle>'`; an object that
 * is merely referenced from several places is copied at each occurrence.
 *
 * With OLAM_REDACTION_DISABLED=1 the input itself is returned, uncopied.
 *
 * @template T
 * @param {T} value
 * @returns {T}
 */
export function redactSensitive(value) {
  if (process.env.OLAM_REDACTION_DISABLED === '1') return value;
  return walk(value, new WeakSet());
}

/**
 * Recursive worker for `redactSensitive`.
 *
 * Only own enumerable string keys are copied, so non-plain objects (class
 * instances, Dates, Maps) come back as plain objects holding whatever such
 * keys they have.
 *
 * @param {unknown} value - node to copy
 * @param {WeakSet<object>} ancestors - objects on the current path from the
 *   root; membership means `value` would recurse into itself
 * @returns {unknown}
 */
function walk(value, ancestors) {
  if (typeof value === 'string') return redactString(value);
  if (value === null || typeof value !== 'object') return value;
  if (ancestors.has(value)) return '<cycle>';

  ancestors.add(value);
  try {
    if (Array.isArray(value)) return value.map((item) => walk(item, ancestors));
    // Object.fromEntries defines each key as an own data property, so a key
    // literally named `__proto__` is copied instead of becoming the copy's
    // prototype.
    return Object.fromEntries(
      Object.keys(value).map((key) => [key, walk(value[key], ancestors)]),
    );
  } finally {
    // Unwind on the way out: siblings that share this object are not cycles.
    ancestors.delete(value);
  }
}
@@ -0,0 +1,148 @@
1
+ // Recovery engine — the single entry point for bounded auto-attempts.
2
+ //
3
+ // Key invariants:
4
+ // 1. ONE attempt per (worldId, failureKind) pair. The ledger enforces
5
+ // idempotency: a second call with the same key runs no steps; it
6
+ // appends and returns a NEW entry with outcome='escalated'.
7
+ // 2. Concurrent calls for the same (worldId, failureKind) key fire only
8
+ // ONE attempt. An in-flight Map holds the running Promise; concurrent
9
+ // callers await the same Promise.
10
+ // 3. Steps execute in order. First failing step short-circuits to
11
+ // outcome='failed'; subsequent steps are NOT executed.
12
+ // 4. All attempts (success, failed, escalated) are written to the ledger.
13
+ //
14
+ // The engine is async and pure-functional with respect to the host-stream:
15
+ // callers (server.mjs) are responsible for emitting the recovery.* events
16
+ // AFTER receiving the returned RecoveryLedgerEntry. The engine does not
17
+ // broadcast directly, keeping it testable without a hostStream fixture.
18
+
19
+ import { findScenarioForKind } from './scenarios.mjs';
20
+ import { appendLedgerEntry, findPriorEntry } from './ledger.mjs';
21
+ import { runStep } from './step-runners.mjs';
22
+ import { DEFAULT_LEDGER_PATH } from './ledger.mjs';
23
+
24
+ /**
25
+ * @typedef {import('./ledger.mjs').RecoveryLedgerEntry} RecoveryLedgerEntry
26
+ * @typedef {import('../lifecycle/failure-kinds.mjs').WorldStartupFailureKind | null} FailureKindOrNull
27
+ */
28
+
// Attempts currently running, keyed by `${worldId}::${failureKind ?? 'null'}`.
/** @type {Map<string, Promise<RecoveryLedgerEntry>>} */
const _inFlight = new Map();

/**
 * Run (or join) the bounded recovery attempt for a world + failure kind.
 *
 * While an attempt for the same key is still running, every caller gets
 * that same Promise back, so at most one attempt executes at a time. The
 * key is released as soon as the attempt settles.
 *
 * @param {string} worldId
 * @param {object} [evidence] — WorldStartupEvidence, optional
 * @param {FailureKindOrNull} [failureKind] — classified bucket, or null for non-FSM triggers
 * @param {{ ledgerPath?: string, log?: (msg: string) => void }} [opts]
 * @returns {Promise<RecoveryLedgerEntry>}
 */
export function attemptRecovery(worldId, evidence, failureKind = null, opts = {}) {
  const dedupeKey = `${worldId}::${failureKind ?? 'null'}`;

  let pending = _inFlight.get(dedupeKey);
  if (!pending) {
    pending = _attempt(worldId, evidence, failureKind, opts).finally(() => {
      _inFlight.delete(dedupeKey);
    });
    _inFlight.set(dedupeKey, pending);
  }
  return pending;
}
53
+
/**
 * Internal: run the actual attempt. Always resolves (never rejects) — step
 * errors AND ledger I/O errors are captured into the returned entry or the
 * log instead of propagating.
 *
 * Flow:
 *   1. Look up a prior entry for (worldId, failureKind). If the ledger
 *      cannot be read, no steps run and a 'failed' entry is returned WITHOUT
 *      being persisted, so the single real attempt is not used up.
 *   2. Prior entry found → append + return an 'escalated' entry.
 *   3. No scenario for the kind → append + return a 'failed' entry.
 *   4. Otherwise run the recipe's steps in order, stopping at the first
 *      step that throws, then append + return the result.
 *
 * A failed ledger append is logged and the entry is still returned.
 *
 * @param {string} worldId
 * @param {object} [evidence]
 * @param {FailureKindOrNull} failureKind
 * @param {{ ledgerPath?: string, log?: (msg: string) => void }} opts
 * @returns {Promise<RecoveryLedgerEntry>}
 */
async function _attempt(worldId, evidence, failureKind, opts) {
  const { ledgerPath = DEFAULT_LEDGER_PATH, log = (msg) => console.warn(`[recovery] ${msg}`) } = opts;
  const startedAt = Date.now();
  const describe = (err) => err?.message ?? String(err);

  // Assemble an entry with the canonical field order.
  const buildEntry = ({ scenario, stepsRun = [], outcome, errorReason }) =>
    /** @type {RecoveryLedgerEntry} */ ({
      worldId,
      failureKind: failureKind ?? null,
      scenario,
      stepsRun,
      startedAt,
      endedAt: Date.now(),
      outcome,
      ...(errorReason !== undefined ? { errorReason } : {}),
    });

  // Persist best-effort: the caller still needs the entry if the disk is unhappy.
  const persist = async (entry) => {
    try {
      await appendLedgerEntry(entry, ledgerPath);
    } catch (err) {
      log(`recovery: ledger append failed — ${describe(err)}`);
    }
    return entry;
  };

  // Idempotency check.
  let prior;
  try {
    prior = await findPriorEntry(worldId, failureKind, ledgerPath);
  } catch (err) {
    // Without the ledger the one-attempt-per-key invariant cannot be
    // checked, so refuse to run steps rather than risk a repeat attempt.
    const errorReason = `ledger read failed: ${describe(err)}`;
    log(`recovery: ${errorReason} (worldId=${worldId}, kind=${failureKind})`);
    return buildEntry({ scenario: 'unmatched', outcome: 'failed', errorReason });
  }

  if (prior !== undefined) {
    const escalated = await persist(buildEntry({
      scenario: prior.scenario,
      outcome: 'escalated',
      errorReason: `prior attempt already recorded (outcome=${prior.outcome})`,
    }));
    log(`recovery idempotency: escalated (worldId=${worldId}, kind=${failureKind})`);
    return escalated;
  }

  // Find the scenario.
  const scenario = findScenarioForKind(failureKind);
  if (!scenario) {
    const entry = await persist(buildEntry({
      scenario: 'unmatched',
      outcome: 'failed',
      errorReason: 'no scenario matched',
    }));
    log(`recovery: no scenario for kind=${failureKind} (worldId=${worldId})`);
    return entry;
  }

  log(`recovery: starting scenario="${scenario.name}" for worldId=${worldId}`);

  // Execute steps in order, short-circuit on first failure.
  /** @type {import('./recipes.mjs').RecoveryStep[]} */
  const stepsRun = [];
  /** @type {string | undefined} */
  let errorReason;
  /** @type {'success' | 'failed'} */
  let outcome = 'success';

  for (const step of scenario.recipe.steps) {
    // Recorded before running, so the failing step appears in `stepsRun`.
    stepsRun.push(step);
    try {
      await runStep(step, { worldId, evidence, log });
    } catch (err) {
      outcome = 'failed';
      errorReason = `step "${step.kind}" threw: ${describe(err)}`;
      log(`recovery: step failed — ${errorReason}`);
      break;
    }
  }

  const entry = await persist(buildEntry({ scenario: scenario.name, stepsRun, outcome, errorReason }));
  log(`recovery: scenario="${scenario.name}" outcome=${outcome} (worldId=${worldId})`);
  return entry;
}
142
+
143
+ /**
144
+ * Test-only helper: empties the in-flight map so state does not leak between tests; attempts already running are not cancelled.
145
+ */
146
+ export function _clearInFlight() {
147
+ _inFlight.clear();
148
+ }
@@ -0,0 +1,16 @@
1
+ // Recovery module barrel export.
2
+ //
3
+ // Public surface:
4
+ // - attemptRecovery — the engine entry point for callers (server.mjs)
5
+ // - FAILURE_SCENARIOS — the 7 named scenarios
6
+ // - findScenarioForKind — scenario lookup by failureKind
7
+ // - appendLedgerEntry / readAllLedgerEntries / findPriorEntry / DEFAULT_LEDGER_PATH — ledger I/O
8
+ // - runStep / setStepRunnerSeams — step dispatch + test seam injection for step runners
9
+ //
10
+ // Internal:
11
+ // - _clearInFlight — test helper; not intended for production use
12
+
13
+ export { attemptRecovery, _clearInFlight } from './engine.mjs';
14
+ export { FAILURE_SCENARIOS, findScenarioForKind } from './scenarios.mjs';
15
+ export { appendLedgerEntry, readAllLedgerEntries, findPriorEntry, DEFAULT_LEDGER_PATH } from './ledger.mjs';
16
+ export { runStep, setStepRunnerSeams } from './step-runners.mjs';
@@ -0,0 +1,105 @@
1
+ // RecoveryLedger — append-only NDJSON persistence for recovery attempts.
2
+ //
3
+ // Each attempt writes one JSON line to the ledger file. The file grows
4
+ // monotonically; entries are never updated in-place. This keeps the
5
+ // ledger auditable and safe to tail/parse with `jq` while the process
6
+ // is running.
7
+ //
8
+ // Default path: ~/.olam/logs/recovery-ledger.ndjson
9
+ // Override: set OLAM_RECOVERY_LEDGER_PATH (useful in tests — point at a
10
+ // tmp file to isolate test runs from the real operator ledger).
11
+
12
+ import { open, mkdir, access } from 'node:fs/promises';
13
+ import { join, dirname } from 'node:path';
14
+ import { homedir } from 'node:os';
15
+ import { createReadStream } from 'node:fs';
16
+ import { createInterface } from 'node:readline';
17
+ import { redactSensitive } from '../observability/redactor.mjs';
18
+
// Resolved once at module load: the env override wins, otherwise the
// per-user default under ~/.olam/logs.
export const DEFAULT_LEDGER_PATH =
  process.env.OLAM_RECOVERY_LEDGER_PATH ??
  join(homedir(), '.olam', 'logs', 'recovery-ledger.ndjson');

/**
 * @typedef {object} RecoveryLedgerEntry
 * @property {string} worldId
 * @property {string | null} failureKind — WorldStartupFailureKind or null (non-FSM trigger)
 * @property {string} scenario — kebab-case scenario name, or 'unmatched'
 * @property {import('./recipes.mjs').RecoveryStep[]} stepsRun — steps actually executed (may be partial on failure)
 * @property {number} startedAt — epoch ms
 * @property {number} endedAt — epoch ms
 * @property {'success' | 'failed' | 'escalated'} outcome
 * @property {string} [errorReason] — set on failed/escalated outcomes
 */

/**
 * Write one ledger entry as a single JSON line at the end of the ledger.
 *
 * The parent directory is created when missing, the entry is passed through
 * `redactSensitive` before serialization, and the file handle is closed even
 * when the write fails.
 *
 * @param {RecoveryLedgerEntry} entry
 * @param {string} [ledgerPath]
 * @returns {Promise<void>}
 */
export async function appendLedgerEntry(entry, ledgerPath = DEFAULT_LEDGER_PATH) {
  const parentDir = dirname(ledgerPath);
  await mkdir(parentDir, { recursive: true });

  const handle = await open(ledgerPath, 'a');
  try {
    const serialized = JSON.stringify(redactSensitive(entry));
    await handle.write(serialized + '\n');
  } finally {
    await handle.close();
  }
}
51
+
/**
 * Read all entries from the ledger (in append order).
 *
 * Blank lines, lines that are not valid JSON, and lines whose JSON value is
 * not a plain object (`null`, numbers, strings, arrays) are skipped, so
 * every returned element can safely be treated as an entry. A missing or
 * inaccessible ledger yields an empty array.
 *
 * @param {string} [ledgerPath]
 * @returns {Promise<RecoveryLedgerEntry[]>}
 */
export async function readAllLedgerEntries(ledgerPath = DEFAULT_LEDGER_PATH) {
  /** @type {RecoveryLedgerEntry[]} */
  const entries = [];

  // Probe with `access` before opening the stream so the "not yet written"
  // case returns an empty list without going through stream error handling.
  try {
    await access(ledgerPath);
  } catch {
    return entries; // File does not exist yet.
  }

  const stream = createReadStream(ledgerPath, { encoding: 'utf8' });
  const rl = createInterface({ input: stream, crlfDelay: Infinity });
  for await (const line of rl) {
    const trimmed = line.trim();
    if (!trimmed) continue;

    let parsed;
    try {
      parsed = JSON.parse(trimmed);
    } catch {
      continue; // Malformed line — skip and continue.
    }

    // `null`, `42`, `"x"` and `[...]` all parse fine but are not entries;
    // letting `null` through would make consumers throw on `entry.worldId`.
    const isRecord = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
    if (isRecord) entries.push(parsed);
  }
  return entries;
}
85
+
/**
 * Look up the newest ledger entry recorded for a (worldId, failureKind)
 * pair. An omitted or undefined `failureKind` is matched as `null`.
 * Resolves to undefined when the ledger holds no such entry.
 *
 * @param {string} worldId
 * @param {string|null} failureKind
 * @param {string} [ledgerPath]
 * @returns {Promise<RecoveryLedgerEntry | undefined>}
 */
export async function findPriorEntry(worldId, failureKind, ledgerPath = DEFAULT_LEDGER_PATH) {
  const wantedKind = failureKind ?? null;
  const history = await readAllLedgerEntries(ledgerPath);
  // Search a reversed copy so the first hit is the most recently appended one.
  const newestFirst = [...history].reverse();
  return newestFirst.find(
    (entry) => entry.worldId === worldId && entry.failureKind === wantedKind,
  );
}
@@ -0,0 +1,46 @@
1
+ // Recovery step types and recipe interface — the discriminated union of
2
+ // all named steps that can appear in a RecoveryRecipe.
3
+ //
4
+ // Step runners for each kind live in step-runners.mjs. The engine in
5
+ // engine.mjs iterates a recipe's steps array and dispatches each to the
6
+ // appropriate runner.
7
+ //
8
+ // A RecoveryRecipe is an ordered list of steps. Steps execute in order;
9
+ // the first failing step short-circuits to a 'failed' outcome.
10
+
11
+ /**
12
+ * @typedef {{ kind: 'NotifyOperator', message?: string }} NotifyOperatorStep
13
+ * @typedef {{ kind: 'ResendTrustPrompt' }} ResendTrustPromptStep
14
+ * @typedef {{ kind: 'WaitFor', durationMs: number }} WaitForStep
15
+ * @typedef {{ kind: 'RestartTransport' }} RestartTransportStep
16
+ * @typedef {{ kind: 'ResendDispatch' }} ResendDispatchStep
17
+ * @typedef {{ kind: 'RestartWorker' }} RestartWorkerStep
18
+ * @typedef {{ kind: 'RestartMcpServer', serverName: string }} RestartMcpServerStep
19
+ * @typedef {{ kind: 'RetryHandshake', timeoutMs: number }} RetryHandshakeStep
20
+ * @typedef {{ kind: 'ReadPluginErrors' }} ReadPluginErrorsStep
21
+ * @typedef {{ kind: 'RestartPlugin', pluginName: string }} RestartPluginStep
22
+ * @typedef {{ kind: 'RebaseBranch' }} RebaseBranchStep
23
+ * @typedef {{ kind: 'CleanBuild' }} CleanBuildStep
24
+ *
25
+ * @typedef {| NotifyOperatorStep
26
+ * | ResendTrustPromptStep
27
+ * | WaitForStep
28
+ * | RestartTransportStep
29
+ * | ResendDispatchStep
30
+ * | RestartWorkerStep
31
+ * | RestartMcpServerStep
32
+ * | RetryHandshakeStep
33
+ * | ReadPluginErrorsStep
34
+ * | RestartPluginStep
35
+ * | RebaseBranchStep
36
+ * | CleanBuildStep
37
+ * } RecoveryStep
38
+ */
39
+
40
+ /**
41
+ * @typedef {object} RecoveryRecipe
42
+ * @property {string} scenarioName — human-readable name of the scenario
43
+ * @property {RecoveryStep[]} steps — ordered list of steps to execute
44
+ */
45
+
46
+ export {};