@pleri/olam-cli 0.1.168 → 0.1.170

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/README.md +38 -0
  2. package/dist/commands/auth-status.d.ts +1 -0
  3. package/dist/commands/auth-status.d.ts.map +1 -1
  4. package/dist/commands/auth-status.js +45 -4
  5. package/dist/commands/auth-status.js.map +1 -1
  6. package/dist/commands/create.d.ts.map +1 -1
  7. package/dist/commands/create.js +26 -0
  8. package/dist/commands/create.js.map +1 -1
  9. package/dist/commands/enter.d.ts.map +1 -1
  10. package/dist/commands/enter.js +5 -0
  11. package/dist/commands/enter.js.map +1 -1
  12. package/dist/commands/resume.d.ts +63 -0
  13. package/dist/commands/resume.d.ts.map +1 -0
  14. package/dist/commands/resume.js +174 -0
  15. package/dist/commands/resume.js.map +1 -0
  16. package/dist/commands/setup.d.ts +19 -0
  17. package/dist/commands/setup.d.ts.map +1 -1
  18. package/dist/commands/setup.js +157 -19
  19. package/dist/commands/setup.js.map +1 -1
  20. package/dist/image-digests.json +8 -8
  21. package/dist/index.js +1021 -576
  22. package/dist/index.js.map +1 -1
  23. package/dist/lib/health-probes.d.ts +28 -0
  24. package/dist/lib/health-probes.d.ts.map +1 -1
  25. package/dist/lib/health-probes.js +75 -0
  26. package/dist/lib/health-probes.js.map +1 -1
  27. package/dist/lib/k8s-context-discovery.d.ts +80 -0
  28. package/dist/lib/k8s-context-discovery.d.ts.map +1 -0
  29. package/dist/lib/k8s-context-discovery.js +102 -0
  30. package/dist/lib/k8s-context-discovery.js.map +1 -0
  31. package/dist/mcp-server.js +1273 -771
  32. package/dist/spawn/home-override.d.ts +82 -0
  33. package/dist/spawn/home-override.d.ts.map +1 -0
  34. package/dist/spawn/home-override.js +107 -0
  35. package/dist/spawn/home-override.js.map +1 -0
  36. package/hermes-bundle/version.json +1 -1
  37. package/host-cp/k8s/manifests/30-configmap.yaml +5 -0
  38. package/host-cp/k8s/manifests/50-deployment.yaml +9 -2
  39. package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
  40. package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
  41. package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
  42. package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
  43. package/host-cp/lifecycle/classify.mjs +110 -0
  44. package/host-cp/lifecycle/emit.mjs +119 -0
  45. package/host-cp/lifecycle/evidence.mjs +45 -0
  46. package/host-cp/lifecycle/failure-kinds.mjs +56 -0
  47. package/host-cp/lifecycle/index.mjs +22 -0
  48. package/host-cp/lifecycle/phases.mjs +52 -0
  49. package/host-cp/observability/grafana-port-forward.sh +1 -1
  50. package/host-cp/observability/kyverno-cardinality-mutate.sh +2 -2
  51. package/host-cp/observability/loki-ingest.sh +1 -1
  52. package/host-cp/observability/ndjson-span-sink.mjs +131 -0
  53. package/host-cp/observability/prom-no-double-grafana.sh +4 -4
  54. package/host-cp/observability/redactor.mjs +72 -0
  55. package/host-cp/recovery/engine.mjs +148 -0
  56. package/host-cp/recovery/index.mjs +16 -0
  57. package/host-cp/recovery/ledger.mjs +105 -0
  58. package/host-cp/recovery/recipes.mjs +46 -0
  59. package/host-cp/recovery/scenarios.mjs +124 -0
  60. package/host-cp/recovery/step-runners.mjs +263 -0
  61. package/host-cp/src/docker-events.mjs +30 -6
  62. package/host-cp/src/pr-nanny.mjs +55 -3
  63. package/host-cp/src/server.mjs +173 -0
  64. package/package.json +1 -1
@@ -0,0 +1,82 @@
1
+ /**
2
+ * --claude-home — per-world Claude Code instance isolation.
3
+ *
4
+ * Today, every world inherits the operator's single `~/.claude/` (host login,
5
+ * settings, hooks, MCP topology, 429 bucket). When the operator wants to run
6
+ * one world against their personal Anthropic account and another against
7
+ * their work account on the same host, they have no clean seam — both worlds
8
+ * share `~/.claude/.credentials.json` and the same 429 cooldown.
9
+ *
10
+ * Inspired by t3code (pingdotgg/t3code) — one server process hosts N Claude
11
+ * Code instances, each with its own `HOME` env var pointing at a separate
12
+ * `~/.claude.instance-N` directory. Zero containers, zero processes-per-
13
+ * account, just `env: { HOME: '...' }` on the spawn.
14
+ *
15
+ * This module exports two helpers used by `olam create --claude-home` (a pure resolver and a filesystem helper):
16
+ *
17
+ * - {@link resolveClaudeHome} — pick the directory to use as HOME for the
18
+ * world's Claude Code config snapshot. Precedence:
19
+ * 1. Explicit `--claude-home <id-or-path>` flag.
20
+ * 2. Stored claudeHome from a prior `olam create` (re-runs of
21
+ * `olam enter <worldId>` honour the original choice).
22
+ * 3. Operator's default `$HOME` (legacy behaviour — no behavioural
23
+ * change for worlds created without the flag).
24
+ *
25
+ * - {@link ensureClaudeHomeDir} — `mkdir -p` the target dir and write a
26
+ * `.olam-claude-home` sentinel so first-use detection works.
27
+ *
28
+ * Composes with @olam/auth-client's vault: the vault still rotates
29
+ * credentials WITHIN a HOME; `--claude-home` isolates DIFFERENT HOMEs from
30
+ * each other.
31
+ *
32
+ * See `docs/decisions/045-claude-home-override.md` for the full rationale
33
+ * + operator workflow.
34
+ */
35
+ /** Relative path under `$HOME` where bare-id homes are materialised. */
36
+ export declare const CLAUDE_HOMES_BASE: string;
37
+ /** First-use sentinel filename — written into a freshly-created home. */
38
+ export declare const SENTINEL_FILENAME = ".olam-claude-home";
39
+ /**
40
+ * Allowed shape of a bare `--claude-home <id>` argument.
41
+ *
42
+ * Same shape rule as world ids and skill prefixes: a leading lowercase-or-
43
+ * digit char, then 0–63 more of `[a-z0-9_-]`. Rejects `..`, spaces,
44
+ * absolute-path injections, uppercase. Absolute paths are handled
45
+ * separately (see {@link resolveClaudeHome}) — they don't go through this
46
+ * regex.
47
+ */
48
+ export declare const HOME_ID_RE: RegExp;
49
+ export interface ResolveClaudeHomeArgs {
50
+ /** `--claude-home <id-or-path>` flag value from CLI parse. Optional. */
51
+ readonly flag?: string;
52
+ /**
53
+ * Prior `claudeHome` stored on the world's metadata. When set and the
54
+ * caller didn't pass `flag`, this wins over the operator's `$HOME`.
55
+ */
56
+ readonly existingWorldConfig?: {
57
+ readonly claudeHome?: string;
58
+ };
59
+ /** Test hook — defaults to `os.homedir()`. */
60
+ readonly homeDir?: string;
61
+ /** Test hook — value returned when neither `flag` nor a stored `claudeHome` applies; defaults to `homeDir`, then `os.homedir()`. */
62
+ readonly envHome?: string;
63
+ }
64
+ /**
65
+ * Resolve the absolute filesystem path to use as the world's Claude Code
66
+ * HOME. No filesystem IO; pure when `homeDir` is injected, otherwise it consults `os.homedir()`.
67
+ *
68
+ * @throws Error if `flag` (or `existingWorldConfig.claudeHome`) is a
69
+ * relative path that doesn't match {@link HOME_ID_RE}.
70
+ */
71
+ export declare function resolveClaudeHome(args: ResolveClaudeHomeArgs): string;
72
+ /**
73
+ * Ensure the resolved Claude home directory exists and carries the
74
+ * `.olam-claude-home` sentinel. Idempotent — re-running on an existing
75
+ * home leaves the sentinel byte-identical (so `olam enter` re-runs don't
76
+ * mutate the home's mtime gratuitously).
77
+ *
78
+ * Does NOT seed credentials, settings, or MCP config — that's the
79
+ * operator's job (one-time `HOME=<resolved-path> claude login`).
80
+ */
81
+ export declare function ensureClaudeHomeDir(targetPath: string): Promise<void>;
82
+ //# sourceMappingURL=home-override.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"home-override.d.ts","sourceRoot":"","sources":["../../src/spawn/home-override.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAiCG;AAMH,wEAAwE;AACxE,eAAO,MAAM,iBAAiB,QAAqC,CAAC;AAEpE,yEAAyE;AACzE,eAAO,MAAM,iBAAiB,sBAAsB,CAAC;AAErD;;;;;;;;GAQG;AACH,eAAO,MAAM,UAAU,QAA+B,CAAC;AAEvD,MAAM,WAAW,qBAAqB;IACpC,wEAAwE;IACxE,QAAQ,CAAC,IAAI,CAAC,EAAE,MAAM,CAAC;IACvB;;;OAGG;IACH,QAAQ,CAAC,mBAAmB,CAAC,EAAE;QAAE,QAAQ,CAAC,UAAU,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IAChE,8CAA8C;IAC9C,QAAQ,CAAC,OAAO,CAAC,EAAE,MAAM,CAAC;IAC1B,kEAAkE;IAClE,QAAQ,CAAC,OAAO,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED;;;;;;GAMG;AACH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,qBAAqB,GAAG,MAAM,CAerE;AAiBD;;;;;;;;GAQG;AACH,wBAAsB,mBAAmB,CAAC,UAAU,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAgB3E"}
@@ -0,0 +1,107 @@
1
+ /**
2
+ * --claude-home — per-world Claude Code instance isolation.
3
+ *
4
+ * Today, every world inherits the operator's single `~/.claude/` (host login,
5
+ * settings, hooks, MCP topology, 429 bucket). When the operator wants to run
6
+ * one world against their personal Anthropic account and another against
7
+ * their work account on the same host, they have no clean seam — both worlds
8
+ * share `~/.claude/.credentials.json` and the same 429 cooldown.
9
+ *
10
+ * Inspired by t3code (pingdotgg/t3code) — one server process hosts N Claude
11
+ * Code instances, each with its own `HOME` env var pointing at a separate
12
+ * `~/.claude.instance-N` directory. Zero containers, zero processes-per-
13
+ * account, just `env: { HOME: '...' }` on the spawn.
14
+ *
15
+ * This module exports two helpers used by `olam create --claude-home` (a pure resolver and a filesystem helper):
16
+ *
17
+ * - {@link resolveClaudeHome} — pick the directory to use as HOME for the
18
+ * world's Claude Code config snapshot. Precedence:
19
+ * 1. Explicit `--claude-home <id-or-path>` flag.
20
+ * 2. Stored claudeHome from a prior `olam create` (re-runs of
21
+ * `olam enter <worldId>` honour the original choice).
22
+ * 3. Operator's default `$HOME` (legacy behaviour — no behavioural
23
+ * change for worlds created without the flag).
24
+ *
25
+ * - {@link ensureClaudeHomeDir} — `mkdir -p` the target dir and write a
26
+ * `.olam-claude-home` sentinel so first-use detection works.
27
+ *
28
+ * Composes with @olam/auth-client's vault: the vault still rotates
29
+ * credentials WITHIN a HOME; `--claude-home` isolates DIFFERENT HOMEs from
30
+ * each other.
31
+ *
32
+ * See `docs/decisions/045-claude-home-override.md` for the full rationale
33
+ * + operator workflow.
34
+ */
35
+ import { mkdir, writeFile, access } from 'node:fs/promises';
36
+ import { homedir } from 'node:os';
37
+ import path from 'node:path';
38
+ /** Relative path under `$HOME` where bare-id homes are materialised. */
39
+ export const CLAUDE_HOMES_BASE = path.join('.olam', 'claude-homes');
40
+ /** First-use sentinel filename — written into a freshly-created home. */
41
+ export const SENTINEL_FILENAME = '.olam-claude-home';
42
+ /**
43
+ * Allowed shape of a bare `--claude-home <id>` argument.
44
+ *
45
+ * Same shape rule as world ids and skill prefixes: a leading lowercase-or-
46
+ * digit char, then 0–63 more of `[a-z0-9_-]`. Rejects `..`, spaces,
47
+ * absolute-path injections, uppercase. Absolute paths are handled
48
+ * separately (see {@link resolveClaudeHome}) — they don't go through this
49
+ * regex.
50
+ */
51
+ export const HOME_ID_RE = /^[a-z0-9][a-z0-9_-]{0,63}$/;
52
+ /**
53
+ * Resolve the absolute filesystem path to use as the world's Claude Code
54
+ * HOME. No filesystem IO; pure when `homeDir` is injected, otherwise it consults `os.homedir()`.
55
+ *
56
+ * @throws Error if `flag` (or `existingWorldConfig.claudeHome`) is a
57
+ * relative path that doesn't match {@link HOME_ID_RE}.
58
+ */
59
+ export function resolveClaudeHome(args) {
60
+ const home = args.homeDir ?? homedir();
61
+ if (args.flag !== undefined && args.flag.length > 0) {
62
+ return resolveSpec(args.flag, home);
63
+ }
64
+ const stored = args.existingWorldConfig?.claudeHome;
65
+ if (stored !== undefined && stored.length > 0) {
66
+ return resolveSpec(stored, home);
67
+ }
68
+ // Default — legacy behaviour. Use the injected envHome when provided,
69
+ // otherwise the resolved home (args.homeDir ?? os.homedir()).
70
+ return args.envHome ?? home;
71
+ }
72
+ function resolveSpec(spec, home) {
73
+ if (path.isAbsolute(spec)) {
74
+ return spec;
75
+ }
76
+ if (!HOME_ID_RE.test(spec)) {
77
+ throw new Error(`--claude-home value "${spec}" must be either an absolute path ` +
78
+ `or a bare id matching ${HOME_ID_RE} (alphanumeric, underscore, dash; ` +
79
+ `starts with alphanumeric; max 64 chars). Rejected: relative paths, ` +
80
+ `parent refs, uppercase, whitespace.`);
81
+ }
82
+ return path.join(home, CLAUDE_HOMES_BASE, spec);
83
+ }
84
/**
 * Ensure the resolved Claude home directory exists and carries the
 * `.olam-claude-home` sentinel. Idempotent — re-running on an existing
 * home leaves an already-present sentinel untouched (so `olam enter`
 * re-runs don't rewrite it gratuitously).
 *
 * Does NOT seed credentials, settings, or MCP config — that's the
 * operator's job (one-time `HOME=<resolved-path> claude login`).
 *
 * @param {string} targetPath directory to create (recursively) and mark.
 * @returns {Promise<void>} resolves once the directory and sentinel exist.
 * @throws rethrows any `mkdir` failure and any sentinel write failure
 *   other than `EEXIST`.
 */
export async function ensureClaudeHomeDir(targetPath) {
    await mkdir(targetPath, { recursive: true });
    const sentinelPath = path.join(targetPath, SENTINEL_FILENAME);
    try {
        // 'wx' creates the file exclusively: the existence check and the write
        // are a single operation, so two concurrent callers cannot both decide
        // the sentinel is missing and write it.
        await writeFile(sentinelPath, '# olam claude-home — managed by `olam create --claude-home`.\n' +
            '# Do not delete. Re-create by running `olam create --claude-home <id>` again.\n', { encoding: 'utf-8', flag: 'wx' });
    }
    catch (err) {
        // Sentinel already exists — leave it alone (idempotency).
        if (err?.code === 'EEXIST') {
            return;
        }
        // Anything else (permissions, read-only filesystem, ...) is a real failure.
        throw err;
    }
}
107
+ //# sourceMappingURL=home-override.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"home-override.js","sourceRoot":"","sources":["../../src/spawn/home-override.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAiCG;AAEH,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM,kBAAkB,CAAC;AAC5D,OAAO,EAAE,OAAO,EAAE,MAAM,SAAS,CAAC;AAClC,OAAO,IAAI,MAAM,WAAW,CAAC;AAE7B,wEAAwE;AACxE,MAAM,CAAC,MAAM,iBAAiB,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,EAAE,cAAc,CAAC,CAAC;AAEpE,yEAAyE;AACzE,MAAM,CAAC,MAAM,iBAAiB,GAAG,mBAAmB,CAAC;AAErD;;;;;;;;GAQG;AACH,MAAM,CAAC,MAAM,UAAU,GAAG,4BAA4B,CAAC;AAgBvD;;;;;;GAMG;AACH,MAAM,UAAU,iBAAiB,CAAC,IAA2B;IAC3D,MAAM,IAAI,GAAG,IAAI,CAAC,OAAO,IAAI,OAAO,EAAE,CAAC;IAEvC,IAAI,IAAI,CAAC,IAAI,KAAK,SAAS,IAAI,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACpD,OAAO,WAAW,CAAC,IAAI,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;IACtC,CAAC;IAED,MAAM,MAAM,GAAG,IAAI,CAAC,mBAAmB,EAAE,UAAU,CAAC;IACpD,IAAI,MAAM,KAAK,SAAS,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC9C,OAAO,WAAW,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IACnC,CAAC;IAED,sEAAsE;IACtE,2CAA2C;IAC3C,OAAO,IAAI,CAAC,OAAO,IAAI,IAAI,CAAC;AAC9B,CAAC;AAED,SAAS,WAAW,CAAC,IAAY,EAAE,IAAY;IAC7C,IAAI,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;QAC1B,OAAO,IAAI,CAAC;IACd,CAAC;IACD,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC;QAC3B,MAAM,IAAI,KAAK,CACb,wBAAwB,IAAI,oCAAoC;YAC9D,yBAAyB,UAAU,oCAAoC;YACvE,qEAAqE;YACrE,qCAAqC,CACxC,CAAC;IACJ,CAAC;IACD,OAAO,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,iBAAiB,EAAE,IAAI,CAAC,CAAC;AAClD,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,CAAC,KAAK,UAAU,mBAAmB,CAAC,UAAkB;IAC1D,MAAM,KAAK,CAAC,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC7C,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,UAAU,EAAE,iBAAiB,CAAC,CAAC;IAC9D,IAAI,CAAC;QACH,MAAM,MAAM,CAAC,YAAY,CAAC,CAAC;QAC3B,kDAAkD;QAClD,OAAO;IACT,CAAC;IAAC,MAAM,CAAC;QACP,8BAA8B;IAChC,CAAC;IACD,MAAM,SAAS,CACb,YAAY,EACZ,gEAAgE;QAC9D,iFAAiF,EACnF,OAAO,CACR,CAAC;AACJ,CAAC"}
@@ -1,4 +1,4 @@
1
1
  {
2
- "bundledAt": "2026-05-23T09:39:55.721Z",
2
+ "bundledAt": "2026-05-24T03:12:41.846Z",
3
3
  "kgFirstSha": "29a9ccce1b115d049e375c4a90eb5cf7c123e610e2d0590270a4db2cdbc64a28"
4
4
  }
@@ -30,6 +30,11 @@ data:
30
30
  # the env override is mandatory here, not optional. Bind-mounted /data
31
31
  # is the writable PVC.
32
32
  OLAM_PLAN_CHAT_SECRET_PATH: "/data/plan-chat-secret"
33
+ # NDJSON span sink + recovery ledger — route to the writable PVC mount at
34
+ # /data rather than the default ~/.olam/logs (which resolves to
35
+ # /home/node/.olam/logs and is not writable with readOnlyRootFilesystem: true).
36
+ OLAM_TRACE_LOG_PATH: "/data/logs/host.trace.ndjson"
37
+ OLAM_RECOVERY_LEDGER_PATH: "/data/logs/recovery-ledger.ndjson"
33
38
  # Tunable defaults.
34
39
  OLAM_SECRET_CACHE_TTL_SEC: "300"
35
40
  OLAM_PR_POLL_INTERVAL_MS: "300000"
@@ -1,7 +1,14 @@
1
1
  # Deployment for olam-host-cp.
2
2
  #
3
3
  # Image: pinned to sha256 digest (not :latest or named tag) per T4 threat model.
4
- # Digest resolves to ghcr.io/pleri/olam-host-cp:0.1.143 (multi-arch index).
4
+ # Digest resolves to ghcr.io/pleri/olam-host-cp:0.1.168 (multi-arch index).
5
+ # Pinned to the last image built before PRs #915/#919/#920/#921 introduced
6
+ # lifecycle/, observability/, and recovery/ module directories — those PRs
7
+ # updated server.mjs imports but the Dockerfile was not updated to COPY
8
+ # the new directories, so all images from 0.1.169+ crash with
9
+ # ERR_MODULE_NOT_FOUND. The Dockerfile fix (COPY lifecycle/ / observability/
10
+ # / recovery/) lands in PR #940; the next release will ship a working image.
11
+ # At that point, refresh this digest via the instructions below.
5
12
  # To update: resolve the new tag's digest via:
6
13
  # TOKEN=$(curl -s "https://ghcr.io/token?scope=repository:pleri/olam-host-cp:pull&service=ghcr.io" | jq -r .token)
7
14
  # curl -sI -H "Authorization: Bearer $TOKEN" \
@@ -111,7 +118,7 @@ spec:
111
118
  # k3d), started by `olam upgrade` Step 0.7 — not inside this Pod.
112
119
  containers:
113
120
  - name: olam-host-cp
114
- image: ghcr.io/pleri/olam-host-cp@sha256:766e07263fcf7e765c3689a7b8d40c47754b4ab90c697710843265a7fc84969a
121
+ image: ghcr.io/pleri/olam-host-cp@sha256:1206e857af61f8907d76d9324adbe8d2d5638a94fe2411c6713ffb4f570e8f58
115
122
  imagePullPolicy: IfNotPresent
116
123
  securityContext:
117
124
  runAsNonRoot: true
@@ -70,7 +70,7 @@ spec:
70
70
  mountPath: /data
71
71
  containers:
72
72
  - name: olam-auth-service
73
- image: ghcr.io/pleri/olam-auth@sha256:c6d163f7ac5fe1ca4652ed34afb1d8555c6f61d06398db767db65fee0944b209
73
+ image: ghcr.io/pleri/olam-auth@sha256:2d32d178380641bcdae11f9ad05851238bd4b121adfc9638c8abed3b25467846
74
74
  imagePullPolicy: IfNotPresent
75
75
  securityContext:
76
76
  runAsNonRoot: true
@@ -61,7 +61,7 @@ spec:
61
61
  mountPath: /data
62
62
  containers:
63
63
  - name: olam-kg-service
64
- image: ghcr.io/pleri/olam-kg-service@sha256:77fd9b19d87c6f4cba4d33d76ff476dd7677f78725f3bf75a9076009e17355cc
64
+ image: ghcr.io/pleri/olam-kg-service@sha256:ee636804b8cffd40a1fb75ba3f79cc0c30a17e89c9a135864567859ccdf895d7
65
65
  imagePullPolicy: IfNotPresent
66
66
  securityContext:
67
67
  runAsNonRoot: true
@@ -68,7 +68,7 @@ spec:
68
68
  mountPath: /data
69
69
  containers:
70
70
  - name: olam-mcp-auth-service
71
- image: ghcr.io/pleri/olam-mcp-auth@sha256:cb5b1d7caece5bca4a4723eb20522a748cd48001aa94a7e7ec106d29bd2142b0
71
+ image: ghcr.io/pleri/olam-mcp-auth@sha256:07cdd816ac1d991c065f2936b142a5c6909da683d9a6d4efbe7fe66f0c811821
72
72
  imagePullPolicy: IfNotPresent
73
73
  securityContext:
74
74
  runAsNonRoot: true
@@ -70,7 +70,7 @@ spec:
70
70
  # bootstrap-placeholder comment + run `npm run refresh:manifest-digests`
71
71
  # once ghcr.io/pleri/olam-memory-service has a real published digest.
72
72
  # bootstrap-placeholder: pre-publish; refresh after first release
73
- image: ghcr.io/pleri/olam-memory-service@sha256:38b2c1f36e49183f5d36999c6519533a8402f3e784109ede8ad7f6e1a205c195
73
+ image: ghcr.io/pleri/olam-memory-service@sha256:20443e8e6725151f7523a8a85c73c7449767782de1d03bb172ba395df19a0939
74
74
  imagePullPolicy: IfNotPresent
75
75
  securityContext:
76
76
  runAsNonRoot: true
@@ -0,0 +1,110 @@
1
+ // classifyStartupFailure — pure mapping from evidence shape to bucket.
2
+ //
3
+ // Precedence rules (walked top-down; first match wins):
4
+ //
5
+ // 1. processExitCode !== undefined → ProviderProcessGone
6
+ // The agent process is dead; nothing else matters. This is the
7
+ // highest-confidence signal because it's observable from outside
8
+ // the container (docker exit code, child_process exit).
9
+ //
10
+ // 2. pluginErrors.length > 0 → PluginStartupFailed
11
+ // Boot-time stderr from a plugin/skill source is definitive.
12
+ // Comes before transport/handshake checks because a failed
13
+ // plugin can leave transport+mcp in 'pending' permanently.
14
+ //
15
+ // 3. transportStatus === 'failed' → TransportDead
16
+ // Channel-open never succeeded — agent is alive but unreachable.
17
+ //
18
+ // 4. mcpHandshakeStatus === 'failed' → McpHandshakeStall
19
+ // Channel opened, MCP handshake explicitly failed.
20
+ //
21
+ // 5. mcpHandshakeStatus === 'pending'
22
+ // AND elapsedSecondsSinceCreation > 30 → McpHandshakeStall
23
+ // Time-bounded inference: a never-completed handshake after 30s
24
+ // is the stall signal even without an explicit failure marker.
25
+ //
26
+ // 6. lastPhase === 'TrustRequired'
27
+ // AND elapsedSecondsSinceCreation > 10 → TrustGateUnanswered
28
+ // Agent reached the trust gate; no approval ever came back.
29
+ // 10s is the operator's attention budget — past that, the
30
+ // agent is silently stuck on a human gate.
31
+ //
32
+ // 7. promptSentAt !== undefined
33
+ // AND firstThoughtAt === undefined → PromptMisdelivery
34
+ // Dispatch landed on the host side but the agent never produced
35
+ // a first thought — the prompt didn't reach the agent process.
36
+ //
37
+ // 8. lastPhase === 'TrustRequired' → TrustGateUnanswered (fallback)
38
+ // Stuck at the trust gate even under 10s — still the most likely
39
+ // explanation for a Failed transition from that phase.
40
+ //
41
+ // 9. fallthrough → PromptMisdelivery
42
+ // The classifier is total: every Failed transition gets a bucket.
43
+ // PromptMisdelivery is the most operator-actionable "we don't
44
+ // know why but the dispatch path is the prime suspect" default.
45
+ //
46
+ // Tests in __tests__/classify.test.mjs assert exactly one case per
47
+ // bucket. The function is pure: no I/O, no side effects, deterministic
48
+ // — same evidence in always yields the same bucket out.
49
+
50
+ import { WorldStartupFailureKind } from './failure-kinds.mjs';
51
+
52
// Seconds a 'pending' MCP handshake may last before it is treated as a stall.
const MCP_HANDSHAKE_STALL_THRESHOLD_SECONDS = 30;
// Seconds a world may sit in 'TrustRequired' before the time-bounded
// trust-gate rule (rule 6) fires.
const TRUST_GATE_UNANSWERED_THRESHOLD_SECONDS = 10;

/**
 * Map a WorldStartupEvidence bundle to its WorldStartupFailureKind.
 *
 * Rules are evaluated top-down and the first match wins. The function has
 * no side effects and always returns a bucket; a bundle whose
 * `pluginErrors` field is missing or not an array is treated as having no
 * plugin errors rather than throwing.
 *
 * @param {import('./evidence.mjs').WorldStartupEvidence} evidence
 * @returns {import('./failure-kinds.mjs').WorldStartupFailureKind}
 */
export function classifyStartupFailure(evidence) {
  // 1. Process exited — terminal signal, short-circuits all other checks.
  if (evidence.processExitCode !== undefined) {
    return WorldStartupFailureKind.ProviderProcessGone;
  }

  // 2. Plugin boot errors — definitive boot-time failure. Guarded so a
  //    partially-populated bundle cannot make the classifier throw.
  if (Array.isArray(evidence.pluginErrors) && evidence.pluginErrors.length > 0) {
    return WorldStartupFailureKind.PluginStartupFailed;
  }

  // 3. Transport explicitly failed — agent alive but unreachable.
  if (evidence.transportStatus === 'failed') {
    return WorldStartupFailureKind.TransportDead;
  }

  // 4. MCP handshake explicitly failed.
  if (evidence.mcpHandshakeStatus === 'failed') {
    return WorldStartupFailureKind.McpHandshakeStall;
  }

  // 5. MCP handshake pending past threshold — inferred stall.
  if (
    evidence.mcpHandshakeStatus === 'pending' &&
    evidence.elapsedSecondsSinceCreation > MCP_HANDSHAKE_STALL_THRESHOLD_SECONDS
  ) {
    return WorldStartupFailureKind.McpHandshakeStall;
  }

  // 6. Stuck on trust gate past operator-attention threshold.
  if (
    evidence.lastPhase === 'TrustRequired' &&
    evidence.elapsedSecondsSinceCreation > TRUST_GATE_UNANSWERED_THRESHOLD_SECONDS
  ) {
    return WorldStartupFailureKind.TrustGateUnanswered;
  }

  // 7. Prompt sent but agent never produced a first thought.
  if (evidence.promptSentAt !== undefined && evidence.firstThoughtAt === undefined) {
    return WorldStartupFailureKind.PromptMisdelivery;
  }

  // 8. Still at trust gate under threshold — bucket as trust-gate.
  if (evidence.lastPhase === 'TrustRequired') {
    return WorldStartupFailureKind.TrustGateUnanswered;
  }

  // 9. Total-function fallback.
  return WorldStartupFailureKind.PromptMisdelivery;
}
@@ -0,0 +1,119 @@
1
+ // recordWorldLifecycle — the single broadcast helper every host-cp
2
+ // surface uses to emit a lifecycle transition.
3
+ //
4
+ // Emits TWO event types on the host-stream:
5
+ //
6
+ // 1. event: 'world.lifecycle' → live SSE consumers (SPA, MCP, etc.).
7
+ // Shape: { worldId, phase, at, evidence?, failureKind? }
8
+ //
9
+ // 2. event: 'span' → NDJSON trace sink (PR #915 + follow-ups).
10
+ // Shape: { name: 'world.lifecycle', startedAt: at, endedAt: at,
11
+ // attributes: { worldId, phase, evidence?, failureKind? },
12
+ // exit: { _tag: 'Success' | 'Failure', reason? } }
13
+ //
14
+ // The dual-emit keeps live consumers and trace consumers on the same
15
+ // substrate without either path coupling to the other. The README jq
16
+ // example `select(.name == "world.lifecycle" ...)` matches the span
17
+ // emission; the SPA's `useHostStream().subscribe('world.lifecycle', ...)`
18
+ // matches the live emission.
19
+ //
20
+ // Failed transitions auto-classify via classifyStartupFailure(evidence)
21
+ // when caller passes evidence but omits an explicit failureKind. Callers
22
+ // MAY provide their own failureKind to override the inference (e.g.
23
+ // docker SIGKILL — the caller knows it was ProviderProcessGone before
24
+ // the classifier could trip its time-thresholds).
25
+
26
+ import { TERMINAL_PHASES, WorldLifecyclePhase } from './phases.mjs';
27
+ import { classifyStartupFailure } from './classify.mjs';
28
+ import { redactSensitive } from '../observability/redactor.mjs';
29
+
30
+ /**
31
+ * @typedef {object} HostStreamLike
32
+ * @property {(eventType: string, payload: unknown) => unknown} broadcast
33
+ */
34
+
35
+ /**
36
+ * @typedef {object} WorldLifecycleEvent
37
+ * @property {string} worldId
38
+ * @property {import('./phases.mjs').WorldLifecyclePhase} phase
39
+ * @property {number} at
40
+ * @property {import('./evidence.mjs').WorldStartupEvidence} [evidence]
41
+ * @property {import('./failure-kinds.mjs').WorldStartupFailureKind} [failureKind]
42
+ */
43
+
44
+ /**
45
+ * Emit a world lifecycle transition on both `world.lifecycle` and `span`
46
+ * host-stream channels.
47
+ *
48
+ * @param {HostStreamLike} hostStream
49
+ * @param {object} args
50
+ * @param {string} args.worldId
51
+ * @param {import('./phases.mjs').WorldLifecyclePhase} args.phase
52
+ * @param {number} [args.at]
53
+ * @param {import('./evidence.mjs').WorldStartupEvidence} [args.evidence]
54
+ * @param {import('./failure-kinds.mjs').WorldStartupFailureKind} [args.failureKind]
55
+ * @returns {WorldLifecycleEvent} the payload that was broadcast (test convenience)
56
+ */
57
+ export function recordWorldLifecycle(hostStream, args) {
58
+ if (!hostStream || typeof hostStream.broadcast !== 'function') {
59
+ throw new TypeError('recordWorldLifecycle: hostStream.broadcast is required');
60
+ }
61
+ if (typeof args?.worldId !== 'string' || args.worldId.length === 0) {
62
+ throw new TypeError('recordWorldLifecycle: worldId is required');
63
+ }
64
+ if (typeof args?.phase !== 'string') {
65
+ throw new TypeError('recordWorldLifecycle: phase is required');
66
+ }
67
+
68
+ const at = typeof args.at === 'number' ? args.at : Date.now();
69
+
70
+ // Resolve failureKind: explicit override > classifier inference > undefined.
71
+ let failureKind = args.failureKind;
72
+ if (
73
+ failureKind === undefined &&
74
+ args.phase === WorldLifecyclePhase.Failed &&
75
+ args.evidence !== undefined
76
+ ) {
77
+ failureKind = classifyStartupFailure(args.evidence);
78
+ }
79
+
80
+ /** @type {WorldLifecycleEvent} */
81
+ const livePayload = {
82
+ worldId: args.worldId,
83
+ phase: args.phase,
84
+ at,
85
+ };
86
+ if (args.evidence !== undefined) livePayload.evidence = redactSensitive(args.evidence);
87
+ if (failureKind !== undefined) livePayload.failureKind = failureKind;
88
+
89
+ hostStream.broadcast('world.lifecycle', livePayload);
90
+
91
+ // Mirror as a span so the NDJSON trace sink (PR #915) records it.
92
+ // Lifecycle transitions are point-in-time events — startedAt === endedAt.
93
+ /** @type {Record<string, unknown>} */
94
+ const spanAttributes = {
95
+ worldId: args.worldId,
96
+ phase: args.phase,
97
+ };
98
+ if (args.evidence !== undefined) spanAttributes.evidence = redactSensitive(args.evidence);
99
+ if (failureKind !== undefined) spanAttributes.failureKind = failureKind;
100
+
101
+ /** @type {{ _tag: 'Success' | 'Failure', reason?: string }} */
102
+ const exit =
103
+ args.phase === WorldLifecyclePhase.Failed
104
+ ? { _tag: 'Failure', reason: failureKind ?? 'unclassified' }
105
+ : { _tag: 'Success' };
106
+
107
+ hostStream.broadcast('span', {
108
+ name: 'world.lifecycle',
109
+ startedAt: at,
110
+ endedAt: at,
111
+ attributes: spanAttributes,
112
+ exit,
113
+ });
114
+
115
+ return livePayload;
116
+ }
117
+
118
+ /** Re-export so callers don't need to import both modules. */
119
+ export { WorldLifecyclePhase, TERMINAL_PHASES };
@@ -0,0 +1,45 @@
1
+ // WorldStartupEvidence — the typed bundle the classifier consumes.
2
+ //
3
+ // Every Failed lifecycle transition carries one of these. Fields are
4
+ // strict-optional (undefined, not null) so consumers can use the
5
+ // presence/absence as a signal directly (`promptSentAt` set while
6
+ // `firstThoughtAt === undefined` is itself the PromptMisdelivery signal).
7
+
8
+ /**
9
+ * @typedef {'pending' | 'ok' | 'failed'} HandshakeStatus
10
+ */
11
+
12
+ /**
13
+ * @typedef {object} WorldStartupEvidence
14
+ * @property {string} worldId
15
+ * @property {import('./phases.mjs').WorldLifecyclePhase} lastPhase
16
+ * @property {number} lastPhaseAt epoch ms
17
+ * @property {number} [promptSentAt] undefined if no dispatch ever sent
18
+ * @property {number} [firstThoughtAt] undefined if no thoughts ever produced
19
+ * @property {HandshakeStatus} mcpHandshakeStatus
20
+ * @property {HandshakeStatus} transportStatus
21
+ * @property {string[]} pluginErrors captured stderr lines from plugin boot
22
+ * @property {number} [processExitCode]
23
+ * @property {number} elapsedSecondsSinceCreation
24
+ */
25
+
26
+ /**
27
+ * Construct an empty evidence bundle for a freshly-spawned world.
28
+ * Caller mutates fields as transitions happen, then passes to the
29
+ * classifier on Failed.
30
+ *
31
+ * @param {string} worldId
32
+ * @param {number} [now]
33
+ * @returns {WorldStartupEvidence}
34
+ */
35
+ export function emptyEvidence(worldId, now = Date.now()) {
36
+ return {
37
+ worldId,
38
+ lastPhase: 'Spawning',
39
+ lastPhaseAt: now,
40
+ mcpHandshakeStatus: 'pending',
41
+ transportStatus: 'pending',
42
+ pluginErrors: [],
43
+ elapsedSecondsSinceCreation: 0,
44
+ };
45
+ }
@@ -0,0 +1,56 @@
1
+ // World startup failure buckets — the six canonical classes the
2
+ // classifier maps every observed Failed transition into.
3
+ //
4
+ // Declaration order here is NOT the classifier's precedence: classify.mjs
5
+ // checks ProviderProcessGone first, then PluginStartupFailed,
6
+ // TransportDead, McpHandshakeStall and TrustGateUnanswered, and uses
7
+ // PromptMisdelivery as the total-function fallback. Adding a 7th
8
+ // bucket requires updating the classifier precedence and the
9
+ // `world.lifecycle.Failed` consumers in the SPA + NDJSON trace.
10
+
11
+ /**
12
+ * @typedef {| 'PromptMisdelivery'
13
+ * | 'TransportDead'
14
+ * | 'TrustGateUnanswered'
15
+ * | 'McpHandshakeStall'
16
+ * | 'PluginStartupFailed'
17
+ * | 'ProviderProcessGone'} WorldStartupFailureKind
18
+ */
19
+
20
+ /**
21
+ * @type {Readonly<Record<WorldStartupFailureKind, WorldStartupFailureKind>>}
22
+ */
23
+ export const WorldStartupFailureKind = Object.freeze({
24
+ /** Dispatch sent but agent never received it (transport mismatch). */
25
+ PromptMisdelivery: 'PromptMisdelivery',
26
+ /** stdin/stdout/IPC channel never opened. */
27
+ TransportDead: 'TransportDead',
28
+ /** Agent reached TrustRequired, no approval ever arrived. */
29
+ TrustGateUnanswered: 'TrustGateUnanswered',
30
+ /** MCP server connection initialized but never completed handshake. */
31
+ McpHandshakeStall: 'McpHandshakeStall',
32
+ /** Plugin or skill source failed to load on boot. */
33
+ PluginStartupFailed: 'PluginStartupFailed',
34
+ /** Agent (Claude Code) process exited before responding. */
35
+ ProviderProcessGone: 'ProviderProcessGone',
36
+ });
37
+
38
+ export const WORLD_STARTUP_FAILURE_KIND_ORDER = Object.freeze([
39
+ WorldStartupFailureKind.PromptMisdelivery,
40
+ WorldStartupFailureKind.TransportDead,
41
+ WorldStartupFailureKind.TrustGateUnanswered,
42
+ WorldStartupFailureKind.McpHandshakeStall,
43
+ WorldStartupFailureKind.PluginStartupFailed,
44
+ WorldStartupFailureKind.ProviderProcessGone,
45
+ ]);
46
+
47
+ /**
48
+ * @param {unknown} value
49
+ * @returns {value is WorldStartupFailureKind}
50
+ */
51
+ export function isWorldStartupFailureKind(value) {
52
+ return (
53
+ typeof value === 'string' &&
54
+ WORLD_STARTUP_FAILURE_KIND_ORDER.includes(/** @type {any} */ (value))
55
+ );
56
+ }
@@ -0,0 +1,22 @@
1
+ // Barrel re-export for the lifecycle module. Importers should pull
2
+ // from '@olam/host-cp/lifecycle' (or the relative path equivalent)
3
+ // rather than reaching into individual files.
4
+
5
+ export {
6
+ WorldLifecyclePhase,
7
+ WORLD_LIFECYCLE_PHASE_ORDER,
8
+ TERMINAL_PHASES,
9
+ isWorldLifecyclePhase,
10
+ } from './phases.mjs';
11
+
12
+ export {
13
+ WorldStartupFailureKind,
14
+ WORLD_STARTUP_FAILURE_KIND_ORDER,
15
+ isWorldStartupFailureKind,
16
+ } from './failure-kinds.mjs';
17
+
18
+ export { emptyEvidence } from './evidence.mjs';
19
+
20
+ export { classifyStartupFailure } from './classify.mjs';
21
+
22
+ export { recordWorldLifecycle } from './emit.mjs';