@pleri/olam-cli 0.1.168 → 0.1.170

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/README.md +38 -0
  2. package/dist/commands/auth-status.d.ts +1 -0
  3. package/dist/commands/auth-status.d.ts.map +1 -1
  4. package/dist/commands/auth-status.js +45 -4
  5. package/dist/commands/auth-status.js.map +1 -1
  6. package/dist/commands/create.d.ts.map +1 -1
  7. package/dist/commands/create.js +26 -0
  8. package/dist/commands/create.js.map +1 -1
  9. package/dist/commands/enter.d.ts.map +1 -1
  10. package/dist/commands/enter.js +5 -0
  11. package/dist/commands/enter.js.map +1 -1
  12. package/dist/commands/resume.d.ts +63 -0
  13. package/dist/commands/resume.d.ts.map +1 -0
  14. package/dist/commands/resume.js +174 -0
  15. package/dist/commands/resume.js.map +1 -0
  16. package/dist/commands/setup.d.ts +19 -0
  17. package/dist/commands/setup.d.ts.map +1 -1
  18. package/dist/commands/setup.js +157 -19
  19. package/dist/commands/setup.js.map +1 -1
  20. package/dist/image-digests.json +8 -8
  21. package/dist/index.js +1021 -576
  22. package/dist/index.js.map +1 -1
  23. package/dist/lib/health-probes.d.ts +28 -0
  24. package/dist/lib/health-probes.d.ts.map +1 -1
  25. package/dist/lib/health-probes.js +75 -0
  26. package/dist/lib/health-probes.js.map +1 -1
  27. package/dist/lib/k8s-context-discovery.d.ts +80 -0
  28. package/dist/lib/k8s-context-discovery.d.ts.map +1 -0
  29. package/dist/lib/k8s-context-discovery.js +102 -0
  30. package/dist/lib/k8s-context-discovery.js.map +1 -0
  31. package/dist/mcp-server.js +1273 -771
  32. package/dist/spawn/home-override.d.ts +82 -0
  33. package/dist/spawn/home-override.d.ts.map +1 -0
  34. package/dist/spawn/home-override.js +107 -0
  35. package/dist/spawn/home-override.js.map +1 -0
  36. package/hermes-bundle/version.json +1 -1
  37. package/host-cp/k8s/manifests/30-configmap.yaml +5 -0
  38. package/host-cp/k8s/manifests/50-deployment.yaml +9 -2
  39. package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +1 -1
  40. package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
  41. package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +1 -1
  42. package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +1 -1
  43. package/host-cp/lifecycle/classify.mjs +110 -0
  44. package/host-cp/lifecycle/emit.mjs +119 -0
  45. package/host-cp/lifecycle/evidence.mjs +45 -0
  46. package/host-cp/lifecycle/failure-kinds.mjs +56 -0
  47. package/host-cp/lifecycle/index.mjs +22 -0
  48. package/host-cp/lifecycle/phases.mjs +52 -0
  49. package/host-cp/observability/grafana-port-forward.sh +1 -1
  50. package/host-cp/observability/kyverno-cardinality-mutate.sh +2 -2
  51. package/host-cp/observability/loki-ingest.sh +1 -1
  52. package/host-cp/observability/ndjson-span-sink.mjs +131 -0
  53. package/host-cp/observability/prom-no-double-grafana.sh +4 -4
  54. package/host-cp/observability/redactor.mjs +72 -0
  55. package/host-cp/recovery/engine.mjs +148 -0
  56. package/host-cp/recovery/index.mjs +16 -0
  57. package/host-cp/recovery/ledger.mjs +105 -0
  58. package/host-cp/recovery/recipes.mjs +46 -0
  59. package/host-cp/recovery/scenarios.mjs +124 -0
  60. package/host-cp/recovery/step-runners.mjs +263 -0
  61. package/host-cp/src/docker-events.mjs +30 -6
  62. package/host-cp/src/pr-nanny.mjs +55 -3
  63. package/host-cp/src/server.mjs +173 -0
  64. package/package.json +1 -1
@@ -0,0 +1,124 @@
1
+ // Recovery scenarios — named mappings from WorldStartupFailureKind (or a
2
+ // special non-FSM signal) to a deterministic RecoveryRecipe.
3
+ //
4
+ // Order within each recipe is load-bearing: steps execute in sequence,
5
+ // first failure short-circuits. Designed for ONE bounded auto-attempt;
6
+ // callers MUST NOT retry a scenario — the engine's idempotency guard
7
+ // enforces this at the (worldId, failureKind) level.
8
+ //
9
+ // The 'stale-branch' scenario has no failureKind (null) — it is triggered
10
+ // by a non-FSM signal (e.g. CI indicating the branch is stale). The engine
11
+ // accepts null as a valid key but treats it as a distinct bucket.
12
+
13
+ /**
14
+ * @typedef {import('./recipes.mjs').RecoveryStep} RecoveryStep
15
+ * @typedef {import('./recipes.mjs').RecoveryRecipe} RecoveryRecipe
16
+ * @typedef {import('../lifecycle/failure-kinds.mjs').WorldStartupFailureKind | null} FailureKindOrNull
17
+ */
18
+
19
+ /**
20
+ * @typedef {object} FailureScenario
21
+ * @property {string} name — kebab-case identifier
22
+ * @property {FailureKindOrNull} failureKind — the FSM bucket this scenario handles (null = non-FSM trigger)
23
+ * @property {string} description — one-line human summary
24
+ * @property {RecoveryRecipe} recipe
25
+ */
26
+
27
/**
 * Recursively freeze an object/array graph and return the same reference.
 *
 * `Object.freeze` alone is shallow: it would leave each scenario's `recipe`
 * and `steps` array writable, so one consumer could silently reorder or edit
 * the steps seen by every other consumer of this module.
 *
 * @template T
 * @param {T} value
 * @returns {T} the same reference, now deeply frozen
 */
function deepFreeze(value) {
  // Primitives need no freezing; already-frozen nodes are skipped, which also
  // guards against revisiting a node in a cyclic graph.
  if (value === null || typeof value !== 'object' || Object.isFrozen(value)) {
    return value;
  }
  Object.freeze(value);
  for (const nested of Object.values(value)) {
    deepFreeze(nested);
  }
  return value;
}

/**
 * The recovery scenario table: one entry per handled failureKind, plus the
 * non-FSM 'stale-branch' entry whose failureKind is null.
 *
 * Step order inside each recipe is significant and must not be rearranged.
 * The whole table (entries, recipes, step arrays and step objects) is deeply
 * frozen, so attempted mutation throws a TypeError in module (strict) code.
 *
 * @type {readonly FailureScenario[]}
 */
export const FAILURE_SCENARIOS = deepFreeze([
  {
    name: 'trust-gate-stuck',
    failureKind: 'TrustGateUnanswered',
    description: 'Agent reached TrustRequired but no trust approval arrived within the timeout.',
    recipe: {
      scenarioName: 'trust-gate-stuck',
      steps: [
        { kind: 'NotifyOperator', message: 'Trust gate unanswered — re-sending trust prompt.' },
        { kind: 'ResendTrustPrompt' },
        { kind: 'WaitFor', durationMs: 30_000 },
      ],
    },
  },
  {
    name: 'prompt-misdelivery',
    failureKind: 'PromptMisdelivery',
    description: 'Dispatch was sent but the agent never received it (transport mismatch).',
    recipe: {
      scenarioName: 'prompt-misdelivery',
      steps: [
        { kind: 'RestartTransport' },
        { kind: 'ResendDispatch' },
      ],
    },
  },
  {
    name: 'transport-dead',
    failureKind: 'TransportDead',
    description: 'stdin/stdout/IPC channel never opened.',
    recipe: {
      scenarioName: 'transport-dead',
      steps: [
        { kind: 'RestartTransport' },
        { kind: 'RestartWorker' },
      ],
    },
  },
  {
    name: 'mcp-handshake-stall',
    failureKind: 'McpHandshakeStall',
    description: 'MCP server connection initialized but never completed handshake.',
    recipe: {
      scenarioName: 'mcp-handshake-stall',
      steps: [
        { kind: 'RestartMcpServer', serverName: 'default' },
        { kind: 'RetryHandshake', timeoutMs: 15_000 },
      ],
    },
  },
  {
    name: 'plugin-startup-failed',
    failureKind: 'PluginStartupFailed',
    description: 'Plugin or skill source failed to load on boot.',
    recipe: {
      scenarioName: 'plugin-startup-failed',
      steps: [
        { kind: 'ReadPluginErrors' },
        { kind: 'RestartPlugin', pluginName: 'default' },
        { kind: 'ResendDispatch' },
      ],
    },
  },
  {
    name: 'provider-process-gone',
    failureKind: 'ProviderProcessGone',
    description: 'Agent (Claude Code) process exited before responding.',
    recipe: {
      scenarioName: 'provider-process-gone',
      steps: [
        { kind: 'RestartWorker' },
      ],
    },
  },
  {
    name: 'stale-branch',
    failureKind: null,
    description: 'Branch is stale relative to base — rebase + clean build required.',
    recipe: {
      scenarioName: 'stale-branch',
      steps: [
        { kind: 'RebaseBranch' },
        { kind: 'CleanBuild' },
      ],
    },
  },
]);
115
+
116
/**
 * Look up the first scenario registered for a failureKind.
 *
 * Matching uses strict equality, so passing `null` selects the scenario whose
 * failureKind is `null` (the non-FSM trigger) and never matches anything else.
 *
 * @param {FailureKindOrNull} failureKind
 * @returns {FailureScenario | undefined} the first matching scenario in table
 *   order, or undefined when no scenario handles the kind
 */
export function findScenarioForKind(failureKind) {
  for (const scenario of FAILURE_SCENARIOS) {
    if (scenario.failureKind === failureKind) {
      return scenario;
    }
  }
  return undefined;
}
@@ -0,0 +1,263 @@
1
+ // Step runners — one async function per RecoveryStep kind.
2
+ //
3
+ // FULLY IMPLEMENTED:
4
+ // RestartMcpServer — signals a restart of the named MCP server via the
5
+ // _execRestartSignal seam, then polls _probeMcpHandshake until it reports alive.
6
+ // RetryHandshake — re-initiates the MCP handshake via _sendHandshakeInitialize and
7
+ // polls _probeHandshakeComplete, with a timeout taken from the step's timeoutMs field.
8
+ //
9
+ // STUB (TODO killshot-3-follow-up):
10
+ // All other step kinds log intent and return success. The stubs are
11
+ // intentionally not no-ops — they emit a console.warn so operators can see
12
+ // which steps fired without actually changing system state.
13
+
14
+ import { setTimeout as sleep } from 'node:timers/promises';
15
+
16
+ /**
17
+ * @typedef {import('./recipes.mjs').RecoveryStep} RecoveryStep
18
+ *
19
+ * @typedef {object} StepContext
20
+ * @property {string} worldId
21
+ * @property {object} [evidence] — WorldStartupEvidence bundle, may be undefined for non-FSM triggers
22
+ * @property {(msg: string) => void} [log] — optional logger; defaults to console.warn
23
+ */
24
+
25
/**
 * Execute one recovery step.
 *
 * 'RestartMcpServer' and 'RetryHandshake' are delegated to their runners;
 * every other known kind is a stub that only writes a `[stub] …` line to the
 * logger. A rejection signals step failure to the caller.
 *
 * @param {RecoveryStep} step
 * @param {StepContext} ctx
 * @returns {Promise<void>}
 * @throws {Error} when `step.kind` is not a known step kind
 */
export async function runStep(step, ctx) {
  const log = ctx.log ?? ((msg) => console.warn(`[recovery] ${msg}`));

  // Shared formatter for the stub steps: "[stub] <what> — worldId=<id>".
  const logStub = (what) => {
    log(`[stub] ${what} — worldId=${ctx.worldId}`);
  };

  switch (step.kind) {
    // --- Real runners ---
    case 'RestartMcpServer':
      return restartMcpServer(step.serverName, ctx, log);

    case 'RetryHandshake':
      return retryHandshake(step.timeoutMs, ctx, log);

    // --- STUBS (TODO killshot-3-follow-up) ---
    case 'NotifyOperator':
      logStub(`NotifyOperator: ${step.message ?? '(no message)'}`);
      break;

    case 'WaitFor':
      // The stub returns immediately instead of waiting durationMs; the log
      // line carries a trailing note saying so, hence no shared formatter.
      log(`[stub] WaitFor ${step.durationMs}ms — worldId=${ctx.worldId} (short-circuiting to 0ms in stub)`);
      break;

    case 'RestartPlugin':
      logStub(`RestartPlugin: ${step.pluginName}`);
      break;

    // Stubs whose log line is just the step kind itself.
    case 'ResendTrustPrompt':
    case 'RestartTransport':
    case 'ResendDispatch':
    case 'RestartWorker':
    case 'ReadPluginErrors':
    case 'RebaseBranch':
    case 'CleanBuild':
      logStub(step.kind);
      break;

    default: {
      // Exhaustiveness guard: assigning to `never` flags an unhandled kind at
      // type-check time if a new step kind is introduced.
      /** @type {never} */
      const unhandled = step;
      void unhandled;
      throw new Error(`runStep: unknown step kind "${/** @type {any} */ (step).kind}"`);
    }
  }
}
97
+
98
+ // ─── RestartMcpServer — fully implemented ────────────────────────────────────
99
+
100
// Timing knobs for RestartMcpServer, both in milliseconds:
//   _mcpRestartPollMs    — pause between consecutive liveness probes.
//   _mcpRestartTimeoutMs — total time budget for the server to come back.
// Both are overridable via setStepRunnerSeams for testing.
let _mcpRestartPollMs = 500;
let _mcpRestartTimeoutMs = 10_000;

/**
 * Restart the named MCP server and verify it comes back.
 *
 * Flow:
 *   1. Ask the `_execRestartSignal` seam to signal the restart.
 *   2. Probe liveness through the `_probeMcpHandshake` seam, pausing
 *      `_mcpRestartPollMs` between probes, until a probe reports alive or
 *      `_mcpRestartTimeoutMs` has elapsed.
 *
 * The server is always probed at least once, even when the timeout is 0, and
 * once more when the deadline is reached, so a server that is already back is
 * never reported as failed. Pauses are capped to the time remaining so the
 * runner does not overshoot its budget by a whole poll interval.
 *
 * Per the original design notes, MCP servers run inside the world container
 * and host-cp cannot signal them directly; the default seams in this file
 * therefore only log their intent until the Docker exec channel is wired
 * (tracked as a follow-up).
 *
 * @param {string} serverName
 * @param {StepContext} ctx
 * @param {(msg: string) => void} log
 * @returns {Promise<void>} resolves once a probe reports the server alive
 * @throws {Error} when no probe succeeds within the timeout
 */
async function restartMcpServer(serverName, ctx, log) {
  log(`RestartMcpServer: restarting "${serverName}" for worldId=${ctx.worldId}`);

  await _execRestartSignal(serverName, ctx.worldId, log);

  // Snapshot the knobs so the deadline and the error message always agree,
  // even if a test swaps the seams while this call is in flight.
  const timeoutMs = _mcpRestartTimeoutMs;
  const pollMs = _mcpRestartPollMs;
  const deadline = Date.now() + timeoutMs;

  for (let attempt = 1; ; attempt++) {
    if (await _probeMcpHandshake(serverName, ctx.worldId, log)) {
      log(`RestartMcpServer: "${serverName}" came back after ${attempt} probe(s)`);
      return;
    }

    const remainingMs = deadline - Date.now();
    // Written as a negated `>` so a NaN deadline (non-numeric timeout) ends
    // the loop instead of spinning forever.
    if (!(remainingMs > 0)) break;
    await sleep(Math.min(pollMs, remainingMs));
  }

  throw new Error(
    `RestartMcpServer: "${serverName}" did not come back within ${timeoutMs}ms`,
  );
}
154
+
155
+ // ─── RetryHandshake — fully implemented ──────────────────────────────────────
156
+
157
/**
 * Re-initiate the MCP handshake and wait up to `timeoutMs` for it to complete.
 *
 * The handshake is kicked off through the `_sendHandshakeInitialize` seam and
 * completion is observed by polling the `_probeHandshakeComplete` seam.
 *
 * Polling runs about ten times per timeout window: the interval is
 * `timeoutMs / 10`, capped at 500ms and floored at 1ms so that very small
 * timeouts do not degenerate into a tight loop. Completion is always probed
 * at least once (even for `timeoutMs <= 0`) and once more when the deadline
 * is reached; pauses are capped to the time remaining.
 *
 * @param {number} timeoutMs
 * @param {StepContext} ctx
 * @param {(msg: string) => void} log
 * @returns {Promise<void>} resolves once a probe reports the handshake complete
 * @throws {Error} when the handshake does not complete within `timeoutMs`
 */
async function retryHandshake(timeoutMs, ctx, log) {
  log(`RetryHandshake: initiating handshake for worldId=${ctx.worldId} timeout=${timeoutMs}ms`);

  await _sendHandshakeInitialize(ctx.worldId, log);

  const deadline = Date.now() + timeoutMs;
  const tenthMs = Math.floor(timeoutMs / 10);
  // Non-finite inputs (Infinity, or NaN from a missing timeout) fall back to
  // the 500ms cap; finite ones are clamped to the range [1, 500].
  const pollMs = Number.isFinite(tenthMs) ? Math.min(500, Math.max(1, tenthMs)) : 500;

  for (;;) {
    if (await _probeHandshakeComplete(ctx.worldId, log)) {
      log(`RetryHandshake: handshake succeeded for worldId=${ctx.worldId}`);
      return;
    }

    const remainingMs = deadline - Date.now();
    // Written as a negated `>` so a NaN deadline ends the loop instead of
    // spinning forever.
    if (!(remainingMs > 0)) break;
    await sleep(Math.min(pollMs, remainingMs));
  }

  throw new Error(
    `RetryHandshake: handshake did not complete within ${timeoutMs}ms for worldId=${ctx.worldId}`,
  );
}
190
+
191
+ // ─── Seam functions (injectable for testing) ─────────────────────────────────
192
+ //
193
+ // These are the actual I/O boundaries. In tests, override via the
194
+ // setStepRunnerSeams() below to inject stubs that resolve deterministically.
195
+
196
/**
 * Seam: signal a restart of the named MCP server for a world.
 *
 * Intended production behavior (not wired yet, killshot-3-follow-up): Docker
 * exec into the world's devbox container, named `olam-<worldId>-devbox`, and
 * send SIGTERM to the mcp-server process by name. Until then this default
 * only logs what it would do.
 *
 * @type {(serverName: string, worldId: string, log: (m: string) => void) => Promise<void>}
 */
let _execRestartSignal = async function (serverName, worldId, log) {
  const container = `olam-${worldId}-devbox`;
  log(`[seam] execRestartSignal: would exec SIGTERM mcp-${serverName} in ${container}`);
};

/**
 * Seam: report whether the named MCP server is alive again.
 *
 * The default is an optimistic stub: it logs the intended probe and always
 * answers `true`. A real implementation would query the in-world MCP registry
 * or ping a health endpoint.
 *
 * @type {(serverName: string, worldId: string, log: (m: string) => void) => Promise<boolean>}
 */
let _probeMcpHandshake = async function (serverName, worldId, log) {
  const container = `olam-${worldId}-devbox`;
  log(`[seam] probeMcpHandshake: would probe mcp-${serverName} alive in ${container}`);
  return true;
};

/**
 * Seam: ask the world to start the MCP initialize handshake.
 * The default only logs the intent.
 *
 * @type {(worldId: string, log: (m: string) => void) => Promise<void>}
 */
let _sendHandshakeInitialize = async function (worldId, log) {
  log(`[seam] sendHandshakeInitialize: would send MCP initialize for worldId=${worldId}`);
};

/**
 * Seam: report whether the MCP handshake has completed.
 * The default is an optimistic stub that logs and always answers `true`.
 *
 * @type {(worldId: string, log: (m: string) => void) => Promise<boolean>}
 */
let _probeHandshakeComplete = async function (worldId, log) {
  log(`[seam] probeHandshakeComplete: would probe handshake complete for worldId=${worldId}`);
  return true;
};
225
+
226
/**
 * Swap in replacement seam functions and/or timing values, mainly for tests.
 *
 * A function override is applied only when it is truthy; a timing override is
 * applied only when it is a number. Anything else leaves the current value in
 * place. The values in effect at call time are captured first, and the
 * returned function puts exactly those values back.
 *
 * @param {{
 *   execRestartSignal?: typeof _execRestartSignal,
 *   probeMcpHandshake?: typeof _probeMcpHandshake,
 *   sendHandshakeInitialize?: typeof _sendHandshakeInitialize,
 *   probeHandshakeComplete?: typeof _probeHandshakeComplete,
 *   mcpRestartTimeoutMs?: number,
 *   mcpRestartPollMs?: number,
 * }} overrides
 * @returns {() => void} cleanup — call to restore the seams captured at call time
 */
export function setStepRunnerSeams(overrides = {}) {
  // Snapshot of everything this function can change.
  const snapshot = {
    execRestartSignal: _execRestartSignal,
    probeMcpHandshake: _probeMcpHandshake,
    sendHandshakeInitialize: _sendHandshakeInitialize,
    probeHandshakeComplete: _probeHandshakeComplete,
    mcpRestartTimeoutMs: _mcpRestartTimeoutMs,
    mcpRestartPollMs: _mcpRestartPollMs,
  };

  // Function seams: take the override when truthy, otherwise keep the current one.
  _execRestartSignal = overrides.execRestartSignal ? overrides.execRestartSignal : _execRestartSignal;
  _probeMcpHandshake = overrides.probeMcpHandshake ? overrides.probeMcpHandshake : _probeMcpHandshake;
  _sendHandshakeInitialize = overrides.sendHandshakeInitialize
    ? overrides.sendHandshakeInitialize
    : _sendHandshakeInitialize;
  _probeHandshakeComplete = overrides.probeHandshakeComplete
    ? overrides.probeHandshakeComplete
    : _probeHandshakeComplete;

  // Timing knobs: a type check rather than truthiness, so 0 is a valid override.
  _mcpRestartTimeoutMs =
    typeof overrides.mcpRestartTimeoutMs === 'number' ? overrides.mcpRestartTimeoutMs : _mcpRestartTimeoutMs;
  _mcpRestartPollMs =
    typeof overrides.mcpRestartPollMs === 'number' ? overrides.mcpRestartPollMs : _mcpRestartPollMs;

  return function restoreSeams() {
    ({
      execRestartSignal: _execRestartSignal,
      probeMcpHandshake: _probeMcpHandshake,
      sendHandshakeInitialize: _sendHandshakeInitialize,
      probeHandshakeComplete: _probeHandshakeComplete,
      mcpRestartTimeoutMs: _mcpRestartTimeoutMs,
      mcpRestartPollMs: _mcpRestartPollMs,
    } = snapshot);
  };
}
@@ -55,10 +55,15 @@ const INVALIDATING_ACTIONS = ['start', 'restart', 'stop', 'die', 'kill'];
55
55
  * `docker events --format json` via child_process).
56
56
  * @param {(worldId: string) => void} args.onWorldRestart
57
57
  * called when a known world restarts/stops/dies
58
+ * @param {(info: { worldId: string, action: string, exitCode?: number }) => void} [args.onWorldLifecycleEvent]
59
+ * Additive observer (Killshot #2): fires alongside onWorldRestart with
60
+ * the raw docker action + exitCode when present. Wired in server.mjs
61
+ * to map docker actions → WorldLifecyclePhase emissions on host-stream.
62
+ * Optional + nullable — existing callers (tests, etc.) don't supply it.
58
63
  * @param {(message: string) => void} [args.log]
59
64
  * @returns {() => void} stop function
60
65
  */
61
- export function subscribeDockerEvents({ dockerHost, onWorldRestart, log = console.log }) {
66
+ export function subscribeDockerEvents({ dockerHost, onWorldRestart, onWorldLifecycleEvent, log = console.log }) {
62
67
  let stopped = false;
63
68
  let activeReq = null;
64
69
  let activeProc = null;
@@ -91,7 +96,7 @@ export function subscribeDockerEvents({ dockerHost, onWorldRestart, log = consol
91
96
  // CLI shape uses `status` instead of HTTP API's `Action`; normalize.
92
97
  if (event.status && !event.Action) event.Action = event.status;
93
98
  if (event.Type === undefined && event.Type !== 'container') event.Type = 'container';
94
- handleEvent(event, { onWorldRestart, log });
99
+ handleEvent(event, { onWorldRestart, onWorldLifecycleEvent, log });
95
100
  } catch (err) {
96
101
  log(`docker-events: parse error on line: ${line.slice(0, 120)} (${err.message})`);
97
102
  }
@@ -159,7 +164,7 @@ export function subscribeDockerEvents({ dockerHost, onWorldRestart, log = consol
159
164
  buf = buf.slice(nl + 1);
160
165
  if (!line.trim()) continue;
161
166
  try {
162
- handleEvent(JSON.parse(line), { onWorldRestart, log });
167
+ handleEvent(JSON.parse(line), { onWorldRestart, onWorldLifecycleEvent, log });
163
168
  } catch (err) {
164
169
  log(`docker-events: parse error on line: ${line.slice(0, 120)} (${err.message})`);
165
170
  }
@@ -209,10 +214,10 @@ export function subscribeDockerEvents({ dockerHost, onWorldRestart, log = consol
209
214
  *
210
215
  * Exported for unit testing.
211
216
  *
212
- * @param {{ Type?: string, Action?: string, Actor?: { Attributes?: { name?: string } } }} event
213
- * @param {{ onWorldRestart: (worldId: string) => void, log: (m: string) => void }} ctx
217
+ * @param {{ Type?: string, Action?: string, Actor?: { Attributes?: Record<string, string> } }} event
218
+ * @param {{ onWorldRestart: (worldId: string) => void, onWorldLifecycleEvent?: (info: { worldId: string, action: string, exitCode?: number }) => void, log: (m: string) => void }} ctx
214
219
  */
215
- export function handleEvent(event, { onWorldRestart, log }) {
220
+ export function handleEvent(event, { onWorldRestart, onWorldLifecycleEvent, log }) {
216
221
  if (event?.Type !== 'container') return;
217
222
  if (!INVALIDATING_ACTIONS.includes(event.Action ?? '')) return;
218
223
  const name = event.Actor?.Attributes?.name;
@@ -229,4 +234,23 @@ export function handleEvent(event, { onWorldRestart, log }) {
229
234
  const worldId = m[1];
230
235
  log(`docker-events: ${event.Action} on ${cleanName} → invalidating ${worldId}`);
231
236
  onWorldRestart(worldId);
237
+
238
+ // Killshot #2 (additive): also notify the lifecycle observer when one
239
+ // is wired. Docker's `die` events carry the container exit code in
240
+ // Actor.Attributes.exitCode as a string; parse it best-effort and
241
+ // forward NaN/missing as undefined so the classifier sees the
242
+ // unambiguous "no exit code observed" signal.
243
+ if (onWorldLifecycleEvent) {
244
+ const action = event.Action ?? '';
245
+ const rawExit = event.Actor?.Attributes?.exitCode;
246
+ const parsed = rawExit !== undefined ? Number(rawExit) : NaN;
247
+ const exitCode = Number.isFinite(parsed) ? parsed : undefined;
248
+ try {
249
+ onWorldLifecycleEvent({ worldId, action, exitCode });
250
+ } catch (err) {
251
+ // The lifecycle observer is best-effort instrumentation; a thrown
252
+ // error here must not break the cache-invalidate hot path.
253
+ log(`docker-events: onWorldLifecycleEvent threw for ${worldId}: ${err.message}`);
254
+ }
255
+ }
232
256
  }
@@ -13,10 +13,17 @@
13
13
  * 2. wall-clock since first dispatch >= MAX_WALL_CLOCK_MIN (default 60)
14
14
  * 3. same-root-cause loop detected (last 2 dispatch summaries identical)
15
15
  * 4. operator manual pause
16
+ *
17
+ * Tier escalation (PR #938):
18
+ * On each retry, the nanny advances to the next tier in `escalationTiers`
19
+ * (stored per-world in nanny_current_tier) instead of repeating the same
20
+ * model. When the chain is exhausted, emits `dispatch.tier-exhausted` on
21
+ * the host-stream and falls back to existing operator escalation.
16
22
  */
17
23
 
18
24
  import { execFile } from 'node:child_process';
19
25
  import { promisify } from 'node:util';
26
+ import { pickNextTier } from './dispatch/tier-escalator.mjs';
20
27
 
21
28
  const execFileAsync = promisify(execFile);
22
29
 
@@ -68,8 +75,9 @@ function parsePrUrl(prUrl) {
68
75
  * @param {{
69
76
  * prStateStore: ReturnType<import('./world-pr-state.mjs').createWorldPrStateStore>,
70
77
  * getGhToken: () => Promise<string|null>,
71
- * dispatchToWorld: (worldId: string, prompt: string) => Promise<void>,
78
+ * dispatchToWorld: (worldId: string, prompt: string, opts?: { tier?: string }) => Promise<void>,
72
79
  * consultCodex: (ctx: string) => Promise<string>,
80
+ * broadcastTierEvent?: (eventType: string, payload: unknown) => void,
73
81
  * pollIntervalMs?: number,
74
82
  * maxDispatches?: number,
75
83
  * maxWallClockMin?: number,
@@ -80,6 +88,7 @@ export function createPrNanny({
80
88
  getGhToken,
81
89
  dispatchToWorld,
82
90
  consultCodex,
91
+ broadcastTierEvent = () => {},
83
92
  pollIntervalMs = 60_000,
84
93
  maxDispatches = parseInt(process.env.OLAM_PR_NANNY_MAX_DISPATCHES ?? '5', 10),
85
94
  maxWallClockMin = parseInt(process.env.OLAM_PR_NANNY_MAX_WALL_CLOCK_MIN ?? '60', 10),
@@ -198,17 +207,60 @@ export function createPrNanny({
198
207
  return;
199
208
  }
200
209
 
210
+ // ── Tier escalation (PR #938) ───────────────────────────────────────────
211
+ //
212
+ // `nanny_escalation_tiers` is set by the olam_dispatch caller via the
213
+ // escalationTiers schema field and persisted here by server.mjs when the
214
+ // world is registered for nanny tracking. Defaults to ['sonnet'] when
215
+ // absent (no escalation, no cost surprise).
216
+ //
217
+ // `nanny_current_tier` tracks the model tier used by the LAST dispatch for
218
+ // this PR. On first dispatch (dispatchCount === 0) it is undefined, and we
219
+ // use escalationTiers[0] as the starting tier. On retries we advance the
220
+ // chain via pickNextTier. This is the pr-state store (option c from the
221
+ // design doc) — it persists across polls and matches the nanny_* field
222
+ // pattern already established by nanny_dispatch_count et al.
223
+ const escalationTiers = entry.nanny_escalation_tiers ?? ['sonnet'];
224
+ const currentTier = entry.nanny_current_tier ?? escalationTiers[0] ?? 'sonnet';
225
+ let tierForThisDispatch = currentTier;
226
+
227
+ if (dispatchCount > 0) {
228
+ // This is a retry — try to escalate the tier.
229
+ const nextTier = pickNextTier(currentTier, escalationTiers);
230
+ if (nextTier !== null) {
231
+ tierForThisDispatch = nextTier;
232
+ broadcastTierEvent('dispatch.escalated', {
233
+ worldId,
234
+ fromTier: currentTier,
235
+ toTier: nextTier,
236
+ reason: 'retry-after-failure',
237
+ });
238
+ console.log(`[pr-nanny] tier escalated for ${worldId}: ${currentTier} → ${nextTier}`);
239
+ } else {
240
+ // Chain exhausted — emit tier-exhausted and fall back to operator escalation.
241
+ broadcastTierEvent('dispatch.tier-exhausted', {
242
+ worldId,
243
+ exhaustedTier: currentTier,
244
+ escalationTiers,
245
+ });
246
+ console.log(`[pr-nanny] tier chain exhausted for ${worldId} (last tier: ${currentTier}) — escalating to operator`);
247
+ prStateStore.set(worldId, { nanny_escalated: true, nanny_escalate_reason: 'tier_exhausted' });
248
+ return;
249
+ }
250
+ }
251
+
201
252
  // Dispatch fix
202
253
  try {
203
- await dispatchToWorld(worldId, prompt);
254
+ await dispatchToWorld(worldId, prompt, { tier: tierForThisDispatch });
204
255
  const now = new Date().toISOString();
205
256
  prStateStore.set(worldId, {
206
257
  nanny_dispatch_count: dispatchCount + 1,
207
258
  nanny_first_dispatch_at: entry.nanny_first_dispatch_at ?? now,
208
259
  nanny_last_dispatch_at: now,
209
260
  nanny_last_dispatch_prompt: prompt,
261
+ nanny_current_tier: tierForThisDispatch,
210
262
  });
211
- console.log(`[pr-nanny] dispatched fix to ${worldId} (dispatch ${dispatchCount + 1}/${maxDispatches})`);
263
+ console.log(`[pr-nanny] dispatched fix to ${worldId} (dispatch ${dispatchCount + 1}/${maxDispatches}, tier: ${tierForThisDispatch})`);
212
264
  } catch (err) {
213
265
  console.error(`[pr-nanny] dispatch failed for ${worldId}: ${err.message}`);
214
266
  }