@pleri/olam-cli 0.1.169 → 0.1.173
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +38 -0
- package/dist/agent-stream/driver-runner.js +13 -0
- package/dist/commands/auth-status.d.ts +1 -0
- package/dist/commands/auth-status.d.ts.map +1 -1
- package/dist/commands/auth-status.js +45 -4
- package/dist/commands/auth-status.js.map +1 -1
- package/dist/commands/create.d.ts.map +1 -1
- package/dist/commands/create.js +26 -0
- package/dist/commands/create.js.map +1 -1
- package/dist/commands/enter.d.ts.map +1 -1
- package/dist/commands/enter.js +5 -0
- package/dist/commands/enter.js.map +1 -1
- package/dist/commands/resume.d.ts +63 -0
- package/dist/commands/resume.d.ts.map +1 -0
- package/dist/commands/resume.js +174 -0
- package/dist/commands/resume.js.map +1 -0
- package/dist/commands/setup.d.ts +19 -0
- package/dist/commands/setup.d.ts.map +1 -1
- package/dist/commands/setup.js +157 -19
- package/dist/commands/setup.js.map +1 -1
- package/dist/image-digests.json +8 -8
- package/dist/index.js +1025 -577
- package/dist/index.js.map +1 -1
- package/dist/lib/health-probes.d.ts +28 -0
- package/dist/lib/health-probes.d.ts.map +1 -1
- package/dist/lib/health-probes.js +75 -0
- package/dist/lib/health-probes.js.map +1 -1
- package/dist/lib/k8s-context-discovery.d.ts +80 -0
- package/dist/lib/k8s-context-discovery.d.ts.map +1 -0
- package/dist/lib/k8s-context-discovery.js +102 -0
- package/dist/lib/k8s-context-discovery.js.map +1 -0
- package/dist/mcp-server.js +2417 -1060
- package/dist/spawn/home-override.d.ts +82 -0
- package/dist/spawn/home-override.d.ts.map +1 -0
- package/dist/spawn/home-override.js +107 -0
- package/dist/spawn/home-override.js.map +1 -0
- package/hermes-bundle/version.json +1 -1
- package/host-cp/k8s/manifests/30-configmap.yaml +5 -0
- package/host-cp/k8s/manifests/50-deployment.yaml +9 -2
- package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
- package/host-cp/lifecycle/classify.mjs +110 -0
- package/host-cp/lifecycle/emit.mjs +119 -0
- package/host-cp/lifecycle/evidence.mjs +45 -0
- package/host-cp/lifecycle/failure-kinds.mjs +56 -0
- package/host-cp/lifecycle/index.mjs +22 -0
- package/host-cp/lifecycle/phases.mjs +52 -0
- package/host-cp/observability/grafana-port-forward.sh +1 -1
- package/host-cp/observability/kyverno-cardinality-mutate.sh +2 -2
- package/host-cp/observability/loki-ingest.sh +1 -1
- package/host-cp/observability/ndjson-span-sink.mjs +183 -0
- package/host-cp/observability/prom-no-double-grafana.sh +4 -4
- package/host-cp/observability/redactor.mjs +72 -0
- package/host-cp/recovery/engine.mjs +148 -0
- package/host-cp/recovery/index.mjs +16 -0
- package/host-cp/recovery/ledger.mjs +105 -0
- package/host-cp/recovery/recipes.mjs +46 -0
- package/host-cp/recovery/scenarios.mjs +124 -0
- package/host-cp/recovery/step-runners.mjs +263 -0
- package/host-cp/src/docker-events.mjs +30 -6
- package/host-cp/src/linear-sync.mjs +43 -0
- package/host-cp/src/plan-chat-service.mjs +129 -1
- package/host-cp/src/pr-nanny.mjs +55 -3
- package/host-cp/src/server.mjs +261 -0
- package/package.json +1 -1
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
// World lifecycle phases — the canonical FSM every Olam world walks
|
|
2
|
+
// through from spawn to terminal state.
|
|
3
|
+
//
|
|
4
|
+
// Order is load-bearing: a world's `lastPhase` is a monotonic high-water
|
|
5
|
+
// mark, and the classifier's precedence rules in classify.mjs assume
|
|
6
|
+
// this ordering when deciding which failure bucket to attribute a stall
|
|
7
|
+
// to. Do NOT reorder without updating the classifier.
|
|
8
|
+
|
|
9
|
+
/**
 * Name of one state in the world lifecycle FSM.
 *
 * @typedef {'Spawning' | 'TrustRequired' | 'ReadyForPrompt' | 'Running' | 'Finished' | 'Failed'} WorldLifecyclePhase
 */

/**
 * Frozen enum-style lookup: every key maps to the identical string, so
 * callers can write `WorldLifecyclePhase.Running` instead of a bare literal.
 *
 * @type {Readonly<Record<WorldLifecyclePhase, WorldLifecyclePhase>>}
 */
export const WorldLifecyclePhase = Object.freeze({
  /** The container or worktree exists, but no code has run inside it yet. */
  Spawning: 'Spawning',

  /** The agent process is up and is waiting for trust-gate approval. */
  TrustRequired: 'TrustRequired',

  /** Trust has been granted; the initial dispatch has not arrived yet. */
  ReadyForPrompt: 'ReadyForPrompt',

  /** A dispatch is actively being processed. */
  Running: 'Running',

  /** Terminal: the world completed successfully. */
  Finished: 'Finished',

  /** Terminal: the world failed; carries an evidence bundle + classified failure kind. */
  Failed: 'Failed',
});
|
|
30
|
+
|
|
31
|
+
/**
 * Every lifecycle phase, listed in canonical FSM order. A phase's index in
 * this frozen array can be used as its ordinal when comparing two phases.
 */
export const WORLD_LIFECYCLE_PHASE_ORDER = (() => {
  const { Spawning, TrustRequired, ReadyForPrompt, Running, Finished, Failed } = WorldLifecyclePhase;
  return Object.freeze([Spawning, TrustRequired, ReadyForPrompt, Running, Finished, Failed]);
})();
|
|
40
|
+
|
|
41
|
+
/**
 * The phases a world never leaves once it has entered them.
 *
 * NOTE: `Object.freeze` locks only the Set object's own properties; it does
 * not prevent `add`/`delete`/`clear`. Treat this set as read-only by
 * convention.
 */
export const TERMINAL_PHASES = Object.freeze(
  new Set().add(WorldLifecyclePhase.Finished).add(WorldLifecyclePhase.Failed),
);
|
|
45
|
+
|
|
46
|
+
/**
 * Type guard: reports whether `value` is one of the known lifecycle phase
 * names listed in `WORLD_LIFECYCLE_PHASE_ORDER`.
 *
 * @param {unknown} value
 * @returns {value is WorldLifecyclePhase}
 */
export function isWorldLifecyclePhase(value) {
  if (typeof value !== 'string') {
    return false;
  }
  const knownPhases = /** @type {readonly string[]} */ (WORLD_LIFECYCLE_PHASE_ORDER);
  return knownPhases.includes(value);
}
|
|
@@ -145,7 +145,7 @@ helm upgrade --install "$GRAFANA_RELEASE" grafana/grafana \
|
|
|
145
145
|
--create-namespace \
|
|
146
146
|
-f "$PERIPHERAL_SERVICES_DIR/helm-values/grafana-values.yaml" \
|
|
147
147
|
--wait \
|
|
148
|
-
--timeout
|
|
148
|
+
--timeout "${OLAM_HELM_TIMEOUT:-600s}"
|
|
149
149
|
|
|
150
150
|
log "Grafana Helm install complete"
|
|
151
151
|
|
|
@@ -49,7 +49,7 @@ KYVERNO_NAMESPACE="kyverno"
|
|
|
49
49
|
TEST_NAMESPACE="monitoring"
|
|
50
50
|
PROM_LOCAL_PORT="9092" # 9090, 9091 may be in use by sibling Phase C scripts
|
|
51
51
|
PF_BIND_SECONDS=5
|
|
52
|
-
TARGET_DISCOVERY_TIMEOUT=
|
|
52
|
+
TARGET_DISCOVERY_TIMEOUT="${OLAM_PROM_DISCOVERY_TIMEOUT:-240}" # bumped from 180s; one CI attempt observed kyverno-emitter still not scraped at 180s
|
|
53
53
|
SCRAPE_POLL_INTERVAL=10
|
|
54
54
|
|
|
55
55
|
log() { printf '[kyverno-mutate] %s\n' "$*" >&2; }
|
|
@@ -122,7 +122,7 @@ helm upgrade --install olam-kyverno kyverno/kyverno \
|
|
|
122
122
|
--namespace "$KYVERNO_NAMESPACE" \
|
|
123
123
|
--create-namespace \
|
|
124
124
|
-f "$PERIPHERAL_SERVICES_DIR/helm-values/kyverno-values.yaml" \
|
|
125
|
-
--wait --timeout
|
|
125
|
+
--wait --timeout "${OLAM_HELM_TIMEOUT:-600s}" 2>&1 | tail -8
|
|
126
126
|
|
|
127
127
|
# Sanity: kyverno-admission-controller Deployment Ready.
|
|
128
128
|
kubectl get deployment -n "$KYVERNO_NAMESPACE" -l "app.kubernetes.io/component=admission-controller" \
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
// NDJSON span sink — zero-config observability for host-cp.
|
|
2
|
+
//
|
|
3
|
+
// Subscribes to the host-stream broadcaster and writes one JSON line per
|
|
4
|
+
// `span` event to ~/.olam/logs/host.trace.ndjson. Each span carries the
|
|
5
|
+
// minimum surface needed for `jq`-based triage: identity, timing, exit.
|
|
6
|
+
//
|
|
7
|
+
// Wire shape per line:
|
|
8
|
+
// { traceId, spanId, parentSpanId, name, startedAt, durationMs,
|
|
9
|
+
// attributes, events[], exit: { _tag: 'Success'|'Failure', reason? } }
|
|
10
|
+
//
|
|
11
|
+
// Rotation: single level — at 50MB the file is renamed to `.1` and a
|
|
12
|
+
// fresh file is opened. The previous `.1` (if any) is overwritten. We
|
|
13
|
+
// keep at most one prior generation; deeper retention belongs to the
|
|
14
|
+
// operator's normal disk-management tooling.
|
|
15
|
+
//
|
|
16
|
+
// Override path with OLAM_TRACE_LOG_PATH (set to /dev/null in tests that
|
|
17
|
+
// don't care about file output, or to a temp file to assert on writes).
|
|
18
|
+
|
|
19
|
+
import { open, mkdir, rename } from 'node:fs/promises';
|
|
20
|
+
import { join, dirname } from 'node:path';
|
|
21
|
+
import { homedir } from 'node:os';
|
|
22
|
+
import { redactSensitive } from './redactor.mjs';
|
|
23
|
+
|
|
24
|
+
// Size (50 MiB) the active trace file may reach before it is rotated.
const DEFAULT_ROTATE_BYTES = 50 * 1024 ** 2;

// Trace file location: OLAM_TRACE_LOG_PATH wins when it is set, otherwise
// the file lives at ~/.olam/logs/host.trace.ndjson.
const DEFAULT_LOG_PATH = process.env.OLAM_TRACE_LOG_PATH ?? join(homedir(), '.olam', 'logs', 'host.trace.ndjson');
|
|
28
|
+
|
|
29
|
+
/**
 * Create an NDJSON span sink that appends one JSON line per span to `logPath`.
 *
 * Writes are serialized through a promise chain so lines never interleave.
 * Write errors are swallowed on purpose: tracing is best-effort and must not
 * take the host down.
 *
 * @param {object} [options]
 * @param {string} [options.logPath] - file to append to; parent directories are created.
 * @param {number} [options.rotateBytes] - size at which the file is renamed to `<logPath>.1`.
 * @param {object} [options.hostStream] - optional broadcaster; when it exposes
 *   `addSink`, an SSE adapter is attached and every `span` event is recorded.
 * @returns {Promise<{ recordSpan: (span?: object) => Promise<void>, close: () => Promise<void> }>}
 */
export async function createNdjsonSpanSink({
  logPath = DEFAULT_LOG_PATH,
  rotateBytes = DEFAULT_ROTATE_BYTES,
  hostStream,
} = {}) {
  await mkdir(dirname(logPath), { recursive: true });
  let fh = await open(logPath, 'a');
  const initialStat = await fh.stat();
  let bytesWritten = initialStat.size;
  // Only regular files may be rotated. The documented test override points
  // logPath at /dev/null; renaming a device node must never be attempted.
  const rotatable = initialStat.isFile();
  let closed = false;
  let chain = Promise.resolve();

  // Move the current file to `<logPath>.1` and start a fresh one.
  async function rotate() {
    await fh.close();
    try {
      await rename(logPath, `${logPath}.1`);
    } catch {
      // Rotation is best-effort: on failure keep appending to the same file.
    }
    // Always reopen, otherwise every later write would hit a closed handle.
    fh = await open(logPath, 'a');
    // Reset even when the rename failed so rotation is retried only after
    // another `rotateBytes` of output rather than on every single line.
    bytesWritten = 0;
  }

  async function writeLine(line) {
    if (closed) return;
    await fh.write(line);
    bytesWritten += Buffer.byteLength(line);
    if (rotatable && bytesWritten >= rotateBytes) {
      await rotate();
    }
  }

  /**
   * Queue one span for writing.
   *
   * @param {object} [span]
   * @returns {Promise<void>} settles once the line is written (or the write failed).
   */
  function recordSpan(span = {}) {
    const {
      name, startedAt, endedAt, attributes, events, exit,
      traceId, spanId, parentSpanId, reason,
    } = span;
    const haveTimes = typeof endedAt === 'number' && typeof startedAt === 'number';
    const durationMs = haveTimes ? endedAt - startedAt : null;

    // Failure reasons are often raw error messages, so they pass through the
    // redactor just like attributes and events do.
    let finalExit;
    if (exit && typeof exit === 'object' && (exit._tag === 'Success' || exit._tag === 'Failure')) {
      finalExit = exit._tag === 'Failure' && exit.reason !== undefined
        ? { _tag: 'Failure', reason: redactSensitive(exit.reason) }
        : { _tag: exit._tag };
    } else if (!haveTimes) {
      // No explicit exit and no usable timing: treat the span as failed.
      finalExit = reason !== undefined
        ? { _tag: 'Failure', reason: redactSensitive(reason) }
        : { _tag: 'Failure' };
    } else {
      finalExit = { _tag: 'Success' };
    }

    const record = {
      traceId: traceId ?? null,
      spanId: spanId ?? null,
      parentSpanId: parentSpanId ?? null,
      name: name ?? null,
      startedAt: startedAt ?? null,
      durationMs,
      attributes: redactSensitive(attributes ?? {}),
      events: redactSensitive(events ?? []),
      exit: finalExit,
    };
    const next = chain.then(() => writeLine(`${JSON.stringify(record)}\n`)).catch(() => {});
    chain = next;
    return next;
  }

  let detach = null;
  if (hostStream && typeof hostStream.addSink === 'function') {
    // NOTE(review): assumes addSink returns a detach function — confirm against host-stream.
    detach = hostStream.addSink(createSseSpanAdapter((payload) => recordSpan(payload)));
  }

  return {
    recordSpan,
    async close() {
      if (closed) return;
      if (typeof detach === 'function') detach();
      // Drain queued writes BEFORE flipping the closed flag — `writeLine`
      // bails on `closed`. Loop because recordSpan may extend the chain
      // while we are awaiting it.
      let settled;
      do {
        settled = chain;
        await settled;
      } while (settled !== chain);
      closed = true;
      try { await fh.close(); } catch { /* already closed */ }
    },
  };
}
|
|
104
|
+
|
|
105
|
+
/**
 * Bridge `beta-response` events from an emitter (intended for
 * `@olam/auth-client`'s `betaResponseEmitter`) into an NDJSON sink.
 *
 * Every event is recorded as a successful `withCredential.beta-response`
 * span whose `attributes` carry the beta payload, so `jq` consumers can run
 * queries such as:
 *
 *   jq 'select(.name == "withCredential.beta-response")
 *       | {ts: .startedAt, cred: .attributes.credentialName,
 *          cache: .attributes.cacheStatus,
 *          thinking: .attributes.thinkingTokens,
 *          latencyMs: .durationMs}' ~/.olam/logs/host.trace.ndjson
 *
 * The subscription is opt-in (call this from server boot) and purely
 * additive: spans from other sources are untouched.
 *
 * @param {object} deps
 * @param {{ recordSpan: Function }} deps.sink - receives one span per event.
 * @param {{ on: Function, off: Function }} deps.emitter - source of `beta-response` events.
 * @returns {() => void} detach function that removes the subscription.
 * @throws {Error} when `sink.recordSpan` or `emitter.on` is not a function.
 */
export function attachBetaResponseEvents({ sink, emitter }) {
  const sinkUsable = Boolean(sink) && typeof sink.recordSpan === 'function';
  if (!sinkUsable) {
    throw new Error('attachBetaResponseEvents: sink.recordSpan required');
  }
  const emitterUsable = Boolean(emitter) && typeof emitter.on === 'function';
  if (!emitterUsable) {
    throw new Error('attachBetaResponseEvents: emitter.on required');
  }

  const EVENT_NAME = 'beta-response';

  function onBetaResponse(info) {
    const endedAt = Date.now();
    const payload = info ?? {};
    // The event reports only a latency, so the start time is back-dated from "now".
    const latencyMs = typeof payload.latencyMs === 'number' ? payload.latencyMs : 0;

    let extraHeaders = {};
    if (payload.extraHeaders && typeof payload.extraHeaders === 'object') {
      extraHeaders = { ...payload.extraHeaders };
    }

    let statusCode = null;
    if (typeof payload.statusCode === 'number') {
      statusCode = payload.statusCode;
    }

    sink.recordSpan({
      name: 'withCredential.beta-response',
      startedAt: endedAt - latencyMs,
      endedAt,
      attributes: {
        credentialName: payload.credentialName ?? null,
        credId: payload.credId ?? null,
        betas: Array.isArray(payload.betas) ? Array.from(payload.betas) : [],
        cacheStatus: payload.cacheStatus ?? null,
        thinkingTokens: payload.tokenCounts?.thinking ?? null,
        statusCode,
        extraHeaders,
      },
      exit: { _tag: 'Success' },
    });
  }

  emitter.on(EVENT_NAME, onBetaResponse);
  return () => emitter.off(EVENT_NAME, onBetaResponse);
}
|
|
156
|
+
|
|
157
|
+
// Duck-typed ServerResponse for host-stream's `addSink`. Buffers written
// chunks, splits them into SSE frames (blank-line delimited) and hands the
// JSON payload of every `event: span` frame to `onSpan`. All other event
// types are silently ignored — host-stream also replays per-type snapshots
// on attach; the sink is created at boot before any spans are broadcast, so
// replay is a no-op in practice.
//
// Frame parsing follows the SSE field rules: fields may appear in any order,
// one optional space after the colon is dropped, `:`-prefixed lines are
// comments, multiple `data` lines are joined with "\n", and CRLF line
// endings are accepted.
function createSseSpanAdapter(onSpan) {
  let buffer = '';

  // Parse one complete frame and dispatch it when it is a span event.
  function dispatchFrame(frame) {
    let isSpan = false;
    const dataLines = [];
    for (const line of frame.split('\n')) {
      if (line === '' || line.startsWith(':')) continue;
      const colon = line.indexOf(':');
      const field = colon === -1 ? line : line.slice(0, colon);
      let value = colon === -1 ? '' : line.slice(colon + 1);
      if (value.startsWith(' ')) value = value.slice(1);
      if (field === 'event') {
        if (value === 'span') isSpan = true;
      } else if (field === 'data') {
        dataLines.push(value);
      }
    }
    if (!isSpan || dataLines.length === 0) return;
    // Best-effort: a malformed payload (or a throwing consumer) must not
    // break the broadcaster that is writing to us.
    try { onSpan(JSON.parse(dataLines.join('\n'))); } catch { /* malformed frame */ }
  }

  return {
    writableEnded: false,
    destroyed: false,
    write(chunk) {
      // Normalize CRLF on the whole buffer so a "\r\n" pair that straddles
      // two chunks is still collapsed once its second half arrives.
      buffer = (buffer + String(chunk)).replace(/\r\n/g, '\n');
      let end = buffer.indexOf('\n\n');
      while (end !== -1) {
        dispatchFrame(buffer.slice(0, end));
        buffer = buffer.slice(end + 2);
        end = buffer.indexOf('\n\n');
      }
      return true;
    },
    once() { /* no drain handling needed — in-memory adapter never backpressures */ },
    end() { this.writableEnded = true; },
  };
}
|
|
@@ -96,7 +96,7 @@ helm upgrade --install "$PROM_RELEASE" prometheus-community/kube-prometheus-stac
|
|
|
96
96
|
--create-namespace \
|
|
97
97
|
-f "$PERIPHERAL_SERVICES_DIR/helm-values/kube-prom-stack-values.yaml" \
|
|
98
98
|
--wait \
|
|
99
|
-
--timeout 600s
|
|
99
|
+
--timeout "${OLAM_HELM_TIMEOUT:-600s}"
|
|
100
100
|
|
|
101
101
|
log "kube-prometheus-stack helm install complete"
|
|
102
102
|
|
|
@@ -143,7 +143,7 @@ helm upgrade olam-loki grafana/loki \
|
|
|
143
143
|
--namespace "$NAMESPACE" \
|
|
144
144
|
-f "$PERIPHERAL_SERVICES_DIR/helm-values/loki-values.yaml" \
|
|
145
145
|
--wait \
|
|
146
|
-
--timeout
|
|
146
|
+
--timeout "${OLAM_HELM_TIMEOUT:-600s}" \
|
|
147
147
|
--reuse-values \
|
|
148
148
|
--set monitoring.serviceMonitor.enabled=true
|
|
149
149
|
|
|
@@ -154,7 +154,7 @@ helm upgrade olam-promtail grafana/promtail \
|
|
|
154
154
|
--namespace "$NAMESPACE" \
|
|
155
155
|
-f "$PERIPHERAL_SERVICES_DIR/helm-values/promtail-values.yaml" \
|
|
156
156
|
--wait \
|
|
157
|
-
--timeout
|
|
157
|
+
--timeout "${OLAM_HELM_TIMEOUT:-600s}" \
|
|
158
158
|
--reuse-values \
|
|
159
159
|
--set serviceMonitor.enabled=true
|
|
160
160
|
|
|
@@ -165,7 +165,7 @@ helm upgrade "$GRAFANA_RELEASE" grafana/grafana \
|
|
|
165
165
|
--namespace "$NAMESPACE" \
|
|
166
166
|
-f "$PERIPHERAL_SERVICES_DIR/helm-values/grafana-values.yaml" \
|
|
167
167
|
--wait \
|
|
168
|
-
--timeout
|
|
168
|
+
--timeout "${OLAM_HELM_TIMEOUT:-600s}" \
|
|
169
169
|
--reuse-values \
|
|
170
170
|
--set serviceMonitor.enabled=true
|
|
171
171
|
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
// Privacy Guard — regex-based auto-redactor for trace + recovery ledger.
|
|
2
|
+
//
|
|
3
|
+
// Deep-walks an object, finds string values, applies an ordered list of
|
|
4
|
+
// regex patterns, returns a redacted COPY (immutable; input untouched).
|
|
5
|
+
// Each match is replaced with `<redacted:<kind>>`.
|
|
6
|
+
//
|
|
7
|
+
// Default-ON patterns (7): anthropic, openai, aws, gh-pat, jwt, bearer, slack.
|
|
8
|
+
// Opt-in (env-gated): email PII (OLAM_REDACT_PII=1), high-entropy strings
|
|
9
|
+
// (OLAM_REDACT_HIGH_ENTROPY=1). Hard short-circuit: OLAM_REDACTION_DISABLED=1.
|
|
10
|
+
//
|
|
11
|
+
// Precedence matters: anthropic runs before openai (otherwise the OpenAI
|
|
12
|
+
// `sk-...` regex would steal `sk-ant-...` and emit the wrong tag). Bearer
|
|
13
|
+
// runs after the high-specificity key patterns so a bearer-wrapped key
|
|
14
|
+
// gets the tighter tag.
|
|
15
|
+
|
|
16
|
+
// Always-on secret patterns, applied in array order. An entry may supply its
// own `replacement`; otherwise the match becomes `<redacted:<kind>>`.
const DEFAULT_PATTERNS = [
  {
    kind: 'anthropic-key',
    re: /\bsk-ant-(?:api|admin)[A-Za-z0-9_-]{20,}\b/g,
  },
  {
    kind: 'openai-key',
    re: /\bsk-(?:proj-)?[A-Za-z0-9_-]{32,}\b/g,
  },
  {
    kind: 'aws-key',
    re: /\bAKIA[A-Z0-9]{16}\b/g,
  },
  {
    kind: 'gh-token',
    re: /\bgh[poursa]_[A-Za-z0-9_]{36,}\b/g,
  },
  {
    kind: 'jwt',
    re: /\beyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\b/g,
  },
  {
    kind: 'slack-token',
    re: /\bxox[abposr]-[A-Za-z0-9-]{10,}\b/g,
  },
  {
    kind: 'bearer',
    re: /Bearer\s+[A-Za-z0-9._~+/-]+=*/gi,
    replacement: 'Bearer <redacted:bearer>',
  },
];

// Opt-in patterns, gated by environment variables inside redactString.
const EMAIL_PATTERN = {
  kind: 'email',
  re: /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/gi,
};
const HIGH_ENTROPY_PATTERN = {
  kind: 'high-entropy',
  re: /\b[A-Z0-9_-]{32,}\b/g,
};
// NOTE(review): every entry here is shorter than the 32-character minimum of
// HIGH_ENTROPY_PATTERN, so none can ever equal a match — confirm the intent.
const HIGH_ENTROPY_ALLOWLIST = new Set(['UUID', 'CHUNK_ID', '__filename', '__dirname']);

/**
 * Redact secrets inside a single string.
 *
 * Returns `s` untouched when OLAM_REDACTION_DISABLED=1. Otherwise applies
 * the default patterns in order, then the email pattern when
 * OLAM_REDACT_PII=1, then the high-entropy pattern when
 * OLAM_REDACT_HIGH_ENTROPY=1.
 */
function redactString(s) {
  if (process.env.OLAM_REDACTION_DISABLED === '1') {
    return s;
  }

  let text = DEFAULT_PATTERNS.reduce(
    (acc, { kind, re, replacement }) => acc.replace(re, replacement ?? `<redacted:${kind}>`),
    s,
  );

  if (process.env.OLAM_REDACT_PII === '1') {
    text = text.replace(EMAIL_PATTERN.re, `<redacted:${EMAIL_PATTERN.kind}>`);
  }

  if (process.env.OLAM_REDACT_HIGH_ENTROPY === '1') {
    text = text.replace(HIGH_ENTROPY_PATTERN.re, (match) => {
      const keepAsIs = HIGH_ENTROPY_ALLOWLIST.has(match) || /^<redacted:/.test(match);
      if (keepAsIs) {
        return match;
      }
      return `<redacted:${HIGH_ENTROPY_PATTERN.kind}>`;
    });
  }

  return text;
}
|
|
46
|
+
|
|
47
|
+
/**
 * Deep-walk `value`, redacting strings. Returns a new value; input is
 * never mutated. Primitives + null pass through unchanged (except strings,
 * which are run through `redactString`). Cycles produce `'<cycle>'`.
 *
 * Returns `value` itself, uncopied, when OLAM_REDACTION_DISABLED=1.
 *
 * @template T
 * @param {T} value
 * @returns {T}
 */
export function redactSensitive(value) {
  if (process.env.OLAM_REDACTION_DISABLED === '1') return value;
  return walk(value, new WeakSet());
}

/**
 * Recursive worker for `redactSensitive`.
 *
 * `ancestors` holds only the objects on the current path from the root, so
 * an object that is merely referenced twice (a shared, non-cyclic reference)
 * is copied both times; only a true back-reference yields `'<cycle>'`.
 *
 * @param {unknown} value
 * @param {WeakSet<object>} ancestors
 * @returns {unknown}
 */
function walk(value, ancestors) {
  if (typeof value === 'string') return redactString(value);
  if (value === null || typeof value !== 'object') return value;
  if (ancestors.has(value)) return '<cycle>';

  ancestors.add(value);
  try {
    if (Array.isArray(value)) {
      return value.map((item) => walk(item, ancestors));
    }
    // Object.fromEntries defines own data properties, so an own "__proto__"
    // key (possible after JSON.parse) is copied as data instead of silently
    // replacing the copy's prototype.
    return Object.fromEntries(
      Object.keys(value).map((key) => [key, walk(value[key], ancestors)]),
    );
  } finally {
    // Leaving this subtree: the object is no longer an ancestor.
    ancestors.delete(value);
  }
}
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
// Recovery engine — the single entry point for bounded auto-attempts.
|
|
2
|
+
//
|
|
3
|
+
// Key invariants:
|
|
4
|
+
// 1. ONE attempt per (worldId, failureKind) pair. The ledger enforces
|
|
5
|
+
// idempotency: a second call with the same key returns the prior
|
|
6
|
+
// entry with outcome='escalated'.
|
|
7
|
+
// 2. Concurrent calls for the same (worldId, failureKind) key fire only
|
|
8
|
+
// ONE attempt. An in-flight Map holds the running Promise; concurrent
|
|
9
|
+
// callers await the same Promise.
|
|
10
|
+
// 3. Steps execute in order. First failing step short-circuits to
|
|
11
|
+
// outcome='failed'; subsequent steps are NOT executed.
|
|
12
|
+
// 4. All attempts (success, failed, escalated) are written to the ledger.
|
|
13
|
+
//
|
|
14
|
+
// The engine is async and pure-functional with respect to the host-stream:
|
|
15
|
+
// callers (server.mjs) are responsible for emitting the recovery.* events
|
|
16
|
+
// AFTER receiving the returned RecoveryLedgerEntry. The engine does not
|
|
17
|
+
// broadcast directly, keeping it testable without a hostStream fixture.
|
|
18
|
+
|
|
19
|
+
import { findScenarioForKind } from './scenarios.mjs';
|
|
20
|
+
import { appendLedgerEntry, findPriorEntry } from './ledger.mjs';
|
|
21
|
+
import { runStep } from './step-runners.mjs';
|
|
22
|
+
import { DEFAULT_LEDGER_PATH } from './ledger.mjs';
|
|
23
|
+
|
|
24
|
+
/**
|
|
25
|
+
* @typedef {import('./ledger.mjs').RecoveryLedgerEntry} RecoveryLedgerEntry
|
|
26
|
+
* @typedef {import('../lifecycle/failure-kinds.mjs').WorldStartupFailureKind | null} FailureKindOrNull
|
|
27
|
+
*/
|
|
28
|
+
|
|
29
|
+
// Attempts currently running, keyed by `${worldId}::${failureKind ?? 'null'}`.
// An entry exists only while its attempt is pending.
/** @type {Map<string, Promise<RecoveryLedgerEntry>>} */
const _inFlight = new Map();

/**
 * Run (or join) the bounded recovery attempt for a world + failure kind.
 *
 * If an attempt with the same key is already running, its promise is
 * returned as-is so concurrent callers share a single attempt.
 *
 * @param {string} worldId
 * @param {object} [evidence] — WorldStartupEvidence, optional
 * @param {FailureKindOrNull} [failureKind] — classified bucket, or null for non-FSM triggers
 * @param {{ ledgerPath?: string, log?: (msg: string) => void }} [opts]
 * @returns {Promise<RecoveryLedgerEntry>}
 */
export function attemptRecovery(worldId, evidence, failureKind = null, opts = {}) {
  const key = `${worldId}::${failureKind ?? 'null'}`;

  if (_inFlight.has(key)) {
    return _inFlight.get(key);
  }

  // Drop the map entry once the attempt settles, whatever the outcome.
  const release = () => {
    _inFlight.delete(key);
  };
  const pending = _attempt(worldId, evidence, failureKind, opts).finally(release);
  _inFlight.set(key, pending);
  return pending;
}
|
|
53
|
+
|
|
54
|
+
/**
 * Internal: run the actual attempt. Always resolves (never rejects) — all
 * errors are captured into the returned ledger entry or logged.
 *
 * Flow:
 *   1. A prior ledger entry for the same (worldId, failureKind) yields an
 *      'escalated' entry; no steps run.
 *   2. No matching scenario yields a 'failed' entry with scenario 'unmatched'.
 *   3. Otherwise the scenario's steps run in order; the first step that
 *      throws stops the run and the outcome is 'failed'.
 *
 * Ledger I/O failures are handled so the documented contract holds:
 *   - If the ledger cannot be READ, idempotency cannot be verified, so no
 *     steps run and a 'failed' entry is returned without being written
 *     (a later call may retry).
 *   - If the ledger cannot be WRITTEN, the failure is logged and the entry
 *     is still returned to the caller.
 *
 * @param {string} worldId
 * @param {object} [evidence]
 * @param {FailureKindOrNull} failureKind
 * @param {{ ledgerPath?: string, log?: (msg: string) => void }} opts
 * @returns {Promise<RecoveryLedgerEntry>}
 */
async function _attempt(worldId, evidence, failureKind, opts) {
  const { ledgerPath = DEFAULT_LEDGER_PATH, log = (msg) => console.warn(`[recovery] ${msg}`) } = opts;
  const startedAt = Date.now();
  const describeError = (err) => err?.message ?? String(err);

  // Build an entry stamped with the current time as `endedAt`.
  const buildEntry = (scenarioName, stepsRun, outcome, errorReason) =>
    /** @type {RecoveryLedgerEntry} */ ({
      worldId,
      failureKind: failureKind ?? null,
      scenario: scenarioName,
      stepsRun,
      startedAt,
      endedAt: Date.now(),
      outcome,
      ...(errorReason !== undefined ? { errorReason } : {}),
    });

  // Persist an entry; a write failure is logged rather than thrown.
  const persist = async (entry) => {
    try {
      await appendLedgerEntry(entry, ledgerPath);
    } catch (err) {
      log(`recovery: ledger write failed — ${describeError(err)} (worldId=${worldId})`);
    }
    return entry;
  };

  // Idempotency check: if a prior entry exists for this key, write and
  // return an entry with outcome='escalated'.
  let prior;
  try {
    prior = await findPriorEntry(worldId, failureKind, ledgerPath);
  } catch (err) {
    const errorReason = `ledger read failed: ${describeError(err)}`;
    log(`recovery: ${errorReason} (worldId=${worldId}, kind=${failureKind})`);
    return buildEntry(findScenarioForKind(failureKind)?.name ?? 'unmatched', [], 'failed', errorReason);
  }
  if (prior !== undefined) {
    const escalated = await persist(
      buildEntry(prior.scenario, [], 'escalated', `prior attempt already recorded (outcome=${prior.outcome})`),
    );
    log(`recovery idempotency: escalated (worldId=${worldId}, kind=${failureKind})`);
    return escalated;
  }

  // Find the scenario.
  const scenario = findScenarioForKind(failureKind);
  if (!scenario) {
    const unmatched = await persist(buildEntry('unmatched', [], 'failed', 'no scenario matched'));
    log(`recovery: no scenario for kind=${failureKind} (worldId=${worldId})`);
    return unmatched;
  }

  log(`recovery: starting scenario="${scenario.name}" for worldId=${worldId}`);

  // Execute steps in order, short-circuit on first failure.
  /** @type {import('./recipes.mjs').RecoveryStep[]} */
  const stepsRun = [];
  /** @type {string | undefined} */
  let errorReason;
  /** @type {'success' | 'failed'} */
  let outcome = 'success';

  for (const step of scenario.recipe.steps) {
    // Recorded before running so a failing step still appears in the ledger.
    stepsRun.push(step);
    try {
      await runStep(step, { worldId, evidence, log });
    } catch (err) {
      outcome = 'failed';
      errorReason = `step "${step.kind}" threw: ${describeError(err)}`;
      log(`recovery: step failed — ${errorReason}`);
      break;
    }
  }

  const entry = await persist(buildEntry(scenario.name, stepsRun, outcome, errorReason));
  log(`recovery: scenario="${scenario.name}" outcome=${outcome} (worldId=${worldId})`);
  return entry;
}
|
|
142
|
+
|
|
143
|
+
/**
 * Test-only helper: empties the in-flight attempt map so one test's pending
 * attempts cannot leak into the next.
 *
 * @returns {void}
 */
export function _clearInFlight() {
  _inFlight.clear();
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
// Recovery module barrel export.
|
|
2
|
+
//
|
|
3
|
+
// Public surface:
|
|
4
|
+
// - attemptRecovery — the engine entry point for callers (server.mjs)
|
|
5
|
+
// - FAILURE_SCENARIOS — the 7 named scenarios
|
|
6
|
+
// - findScenarioForKind — scenario lookup by failureKind
|
|
7
|
+
// - appendLedgerEntry / readAllLedgerEntries / findPriorEntry / DEFAULT_LEDGER_PATH — ledger I/O and its default path
|
|
8
|
+
// - runStep / setStepRunnerSeams — step execution and test seam injection for step runners
|
|
9
|
+
//
|
|
10
|
+
// Internal:
|
|
11
|
+
// - _clearInFlight — test helper; not intended for production use
|
|
12
|
+
|
|
13
|
+
export { attemptRecovery, _clearInFlight } from './engine.mjs';
|
|
14
|
+
export { FAILURE_SCENARIOS, findScenarioForKind } from './scenarios.mjs';
|
|
15
|
+
export { appendLedgerEntry, readAllLedgerEntries, findPriorEntry, DEFAULT_LEDGER_PATH } from './ledger.mjs';
|
|
16
|
+
export { runStep, setStepRunnerSeams } from './step-runners.mjs';
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
// RecoveryLedger — append-only NDJSON persistence for recovery attempts.
|
|
2
|
+
//
|
|
3
|
+
// Each attempt writes one JSON line to the ledger file. The file grows
|
|
4
|
+
// monotonically; entries are never updated in-place. This keeps the
|
|
5
|
+
// ledger auditable and safe to tail/parse with `jq` while the process
|
|
6
|
+
// is running.
|
|
7
|
+
//
|
|
8
|
+
// Default path: ~/.olam/logs/recovery-ledger.ndjson
|
|
9
|
+
// Override: set OLAM_RECOVERY_LEDGER_PATH (useful in tests — point at a
|
|
10
|
+
// tmp file to isolate test runs from the real operator ledger).
|
|
11
|
+
|
|
12
|
+
import { open, mkdir, access } from 'node:fs/promises';
|
|
13
|
+
import { join, dirname } from 'node:path';
|
|
14
|
+
import { homedir } from 'node:os';
|
|
15
|
+
import { createReadStream } from 'node:fs';
|
|
16
|
+
import { createInterface } from 'node:readline';
|
|
17
|
+
import { redactSensitive } from '../observability/redactor.mjs';
|
|
18
|
+
|
|
19
|
+
// Ledger location: OLAM_RECOVERY_LEDGER_PATH wins when it is set, otherwise
// the file lives at ~/.olam/logs/recovery-ledger.ndjson.
export const DEFAULT_LEDGER_PATH = process.env.OLAM_RECOVERY_LEDGER_PATH ?? join(homedir(), '.olam', 'logs', 'recovery-ledger.ndjson');
|
|
22
|
+
|
|
23
|
+
/**
|
|
24
|
+
* @typedef {object} RecoveryLedgerEntry
|
|
25
|
+
* @property {string} worldId
|
|
26
|
+
* @property {string | null} failureKind — WorldStartupFailureKind or null (non-FSM trigger)
|
|
27
|
+
* @property {string} scenario — kebab-case scenario name, or 'unmatched'
|
|
28
|
+
* @property {import('./recipes.mjs').RecoveryStep[]} stepsRun — steps actually executed (may be partial on failure)
|
|
29
|
+
* @property {number} startedAt — epoch ms
|
|
30
|
+
* @property {number} endedAt — epoch ms
|
|
31
|
+
* @property {'success' | 'failed' | 'escalated'} outcome
|
|
32
|
+
* @property {string} [errorReason] — set on failed/escalated outcomes
|
|
33
|
+
*/
|
|
34
|
+
|
|
35
|
+
/**
 * Write one recovery attempt to the ledger as a single NDJSON line.
 *
 * The parent directory is created when missing, the entry is passed through
 * `redactSensitive` before serialization, and the file handle is closed
 * even when the write fails.
 *
 * @param {RecoveryLedgerEntry} entry
 * @param {string} [ledgerPath]
 * @returns {Promise<void>}
 */
export async function appendLedgerEntry(entry, ledgerPath = DEFAULT_LEDGER_PATH) {
  const parentDir = dirname(ledgerPath);
  await mkdir(parentDir, { recursive: true });

  const handle = await open(ledgerPath, 'a');
  try {
    const serialized = JSON.stringify(redactSensitive(entry));
    await handle.write(serialized + '\n');
  } finally {
    await handle.close();
  }
}
|
|
51
|
+
|
|
52
|
+
/**
 * Read all entries from the ledger (in append order).
 *
 * Returns an empty array when the file cannot be accessed (typically because
 * nothing has been written yet). Blank lines, lines that are not valid JSON,
 * and lines whose JSON value is not a plain object (e.g. `null`, `42`,
 * `[1]`) are skipped, so every returned element is safe to read properties
 * from.
 *
 * @param {string} [ledgerPath]
 * @returns {Promise<RecoveryLedgerEntry[]>}
 */
export async function readAllLedgerEntries(ledgerPath = DEFAULT_LEDGER_PATH) {
  /** @type {RecoveryLedgerEntry[]} */
  const entries = [];

  // Check existence before streaming — createReadStream emits ENOENT as an
  // error event (not a synchronous throw), which propagates through the
  // `for await` loop and would reject the caller. An explicit access check
  // keeps the "not yet written" path simple.
  try {
    await access(ledgerPath);
  } catch {
    return entries; // File does not exist yet.
  }

  const stream = createReadStream(ledgerPath, { encoding: 'utf8' });
  const rl = createInterface({ input: stream, crlfDelay: Infinity });
  for await (const line of rl) {
    const trimmed = line.trim();
    if (!trimmed) continue;
    let parsed;
    try {
      parsed = JSON.parse(trimmed);
    } catch {
      // Malformed line — skip and continue.
      continue;
    }
    // A ledger entry is always a JSON object; anything else is a corrupt or
    // foreign line and would crash consumers that read `entry.worldId`.
    if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) continue;
    entries.push(parsed);
  }
  return entries;
}
|
|
85
|
+
|
|
86
|
+
/**
 * Look up the newest ledger entry recorded for a (worldId, failureKind)
 * pair. A `failureKind` of `undefined` is treated as `null`.
 *
 * @param {string} worldId
 * @param {string|null} failureKind
 * @param {string} [ledgerPath]
 * @returns {Promise<RecoveryLedgerEntry | undefined>} the latest match, or
 *   undefined when the pair has never been recorded.
 */
export async function findPriorEntry(worldId, failureKind, ledgerPath = DEFAULT_LEDGER_PATH) {
  const wantedKind = failureKind ?? null;
  const entries = await readAllLedgerEntries(ledgerPath);

  // Entries are in append order, so scan from the end to hit the newest first.
  let index = entries.length;
  while (index-- > 0) {
    const candidate = entries[index];
    if (candidate.worldId !== worldId) continue;
    if (candidate.failureKind !== wantedKind) continue;
    return candidate;
  }
  return undefined;
}
|