@pleri/olam-cli 0.1.147 → 0.1.150

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. package/dist/agent-stream/agent-sdk-to-chunks.js +276 -0
  2. package/dist/agent-stream/agent-stream-launch.js +348 -0
  3. package/dist/agent-stream/chunks-subscriber-transport.js +262 -0
  4. package/dist/agent-stream/codex-runner.js +188 -0
  5. package/dist/agent-stream/driver-runner.js +347 -0
  6. package/dist/agent-stream/operator-subscription.js +179 -0
  7. package/dist/commands/auth.d.ts.map +1 -1
  8. package/dist/commands/auth.js +26 -1
  9. package/dist/commands/auth.js.map +1 -1
  10. package/dist/commands/create.d.ts.map +1 -1
  11. package/dist/commands/create.js +39 -0
  12. package/dist/commands/create.js.map +1 -1
  13. package/dist/commands/doctor.d.ts +54 -3
  14. package/dist/commands/doctor.d.ts.map +1 -1
  15. package/dist/commands/doctor.js +348 -6
  16. package/dist/commands/doctor.js.map +1 -1
  17. package/dist/commands/init.d.ts +46 -0
  18. package/dist/commands/init.d.ts.map +1 -1
  19. package/dist/commands/init.js +90 -0
  20. package/dist/commands/init.js.map +1 -1
  21. package/dist/commands/kg-build.d.ts +23 -0
  22. package/dist/commands/kg-build.d.ts.map +1 -1
  23. package/dist/commands/kg-build.js +104 -2
  24. package/dist/commands/kg-build.js.map +1 -1
  25. package/dist/commands/restart.d.ts +18 -0
  26. package/dist/commands/restart.d.ts.map +1 -0
  27. package/dist/commands/restart.js +113 -0
  28. package/dist/commands/restart.js.map +1 -0
  29. package/dist/commands/services.d.ts +41 -3
  30. package/dist/commands/services.d.ts.map +1 -1
  31. package/dist/commands/services.js +221 -13
  32. package/dist/commands/services.js.map +1 -1
  33. package/dist/commands/setup-linux-gate.d.ts +26 -0
  34. package/dist/commands/setup-linux-gate.d.ts.map +1 -0
  35. package/dist/commands/setup-linux-gate.js +42 -0
  36. package/dist/commands/setup-linux-gate.js.map +1 -0
  37. package/dist/commands/setup-metrics.d.ts +26 -0
  38. package/dist/commands/setup-metrics.d.ts.map +1 -0
  39. package/dist/commands/setup-metrics.js +57 -0
  40. package/dist/commands/setup-metrics.js.map +1 -0
  41. package/dist/commands/setup-phase-5a-skill-source.d.ts +68 -0
  42. package/dist/commands/setup-phase-5a-skill-source.d.ts.map +1 -0
  43. package/dist/commands/setup-phase-5a-skill-source.js +196 -0
  44. package/dist/commands/setup-phase-5a-skill-source.js.map +1 -0
  45. package/dist/commands/setup-phase-5b-project-sweep.d.ts +38 -0
  46. package/dist/commands/setup-phase-5b-project-sweep.d.ts.map +1 -0
  47. package/dist/commands/setup-phase-5b-project-sweep.js +176 -0
  48. package/dist/commands/setup-phase-5b-project-sweep.js.map +1 -0
  49. package/dist/commands/setup.d.ts +19 -0
  50. package/dist/commands/setup.d.ts.map +1 -1
  51. package/dist/commands/setup.js +22 -0
  52. package/dist/commands/setup.js.map +1 -1
  53. package/dist/commands/skills-10x.d.ts +23 -0
  54. package/dist/commands/skills-10x.d.ts.map +1 -0
  55. package/dist/commands/skills-10x.js +308 -0
  56. package/dist/commands/skills-10x.js.map +1 -0
  57. package/dist/commands/substrate-audit-log.d.ts +2 -0
  58. package/dist/commands/substrate-audit-log.d.ts.map +1 -1
  59. package/dist/commands/substrate-audit-log.js +13 -0
  60. package/dist/commands/substrate-audit-log.js.map +1 -1
  61. package/dist/image-digests.json +7 -7
  62. package/dist/index.js +18102 -15234
  63. package/dist/index.js.map +1 -1
  64. package/dist/lib/auth-refresh-kubernetes.d.ts +62 -0
  65. package/dist/lib/auth-refresh-kubernetes.d.ts.map +1 -0
  66. package/dist/lib/auth-refresh-kubernetes.js +127 -0
  67. package/dist/lib/auth-refresh-kubernetes.js.map +1 -0
  68. package/dist/lib/build-if-stale.d.ts +33 -0
  69. package/dist/lib/build-if-stale.d.ts.map +1 -0
  70. package/dist/lib/build-if-stale.js +156 -0
  71. package/dist/lib/build-if-stale.js.map +1 -0
  72. package/dist/lib/bundle-freshness.d.ts +57 -0
  73. package/dist/lib/bundle-freshness.d.ts.map +1 -0
  74. package/dist/lib/bundle-freshness.js +223 -0
  75. package/dist/lib/bundle-freshness.js.map +1 -0
  76. package/dist/lib/bundle-source.d.ts +52 -0
  77. package/dist/lib/bundle-source.d.ts.map +1 -0
  78. package/dist/lib/bundle-source.js +83 -0
  79. package/dist/lib/bundle-source.js.map +1 -0
  80. package/dist/lib/kubectl-wrap.d.ts +6 -0
  81. package/dist/lib/kubectl-wrap.d.ts.map +1 -1
  82. package/dist/lib/kubectl-wrap.js +6 -1
  83. package/dist/lib/kubectl-wrap.js.map +1 -1
  84. package/dist/lib/manifest-refresh.d.ts +42 -1
  85. package/dist/lib/manifest-refresh.d.ts.map +1 -1
  86. package/dist/lib/manifest-refresh.js +83 -7
  87. package/dist/lib/manifest-refresh.js.map +1 -1
  88. package/dist/lib/peripheral-registry.d.ts +36 -0
  89. package/dist/lib/peripheral-registry.d.ts.map +1 -0
  90. package/dist/lib/peripheral-registry.js +55 -0
  91. package/dist/lib/peripheral-registry.js.map +1 -0
  92. package/dist/lib/port-forward.d.ts +67 -0
  93. package/dist/lib/port-forward.d.ts.map +1 -1
  94. package/dist/lib/port-forward.js +153 -0
  95. package/dist/lib/port-forward.js.map +1 -1
  96. package/dist/lib/upgrade-kubernetes.d.ts +52 -12
  97. package/dist/lib/upgrade-kubernetes.d.ts.map +1 -1
  98. package/dist/lib/upgrade-kubernetes.js +390 -22
  99. package/dist/lib/upgrade-kubernetes.js.map +1 -1
  100. package/dist/mcp-server.js +84 -58
  101. package/host-cp/compose.yaml +6 -0
  102. package/host-cp/k8s/manifests/30-configmap.yaml +6 -0
  103. package/host-cp/k8s/manifests/50-deployment.yaml +46 -9
  104. package/host-cp/k8s/manifests/auth-service/10-serviceaccount.yaml +8 -0
  105. package/host-cp/k8s/manifests/auth-service/20-rbac.yaml +34 -0
  106. package/host-cp/k8s/manifests/auth-service/30-configmap.yaml +24 -0
  107. package/host-cp/k8s/manifests/auth-service/45-pvc.yaml +25 -0
  108. package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +117 -0
  109. package/host-cp/k8s/manifests/auth-service/60-service.yaml +21 -0
  110. package/host-cp/k8s/manifests/kg-service/10-serviceaccount.yaml +8 -0
  111. package/host-cp/k8s/manifests/kg-service/20-rbac.yaml +34 -0
  112. package/host-cp/k8s/manifests/kg-service/30-configmap.yaml +18 -0
  113. package/host-cp/k8s/manifests/kg-service/45-pvc.yaml +25 -0
  114. package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +108 -0
  115. package/host-cp/k8s/manifests/kg-service/60-service.yaml +21 -0
  116. package/host-cp/k8s/manifests/mcp-auth-service/10-serviceaccount.yaml +8 -0
  117. package/host-cp/k8s/manifests/mcp-auth-service/20-rbac.yaml +34 -0
  118. package/host-cp/k8s/manifests/mcp-auth-service/30-configmap.yaml +18 -0
  119. package/host-cp/k8s/manifests/mcp-auth-service/45-pvc.yaml +25 -0
  120. package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +117 -0
  121. package/host-cp/k8s/manifests/mcp-auth-service/60-service.yaml +21 -0
  122. package/host-cp/k8s/manifests/memory-service/10-serviceaccount.yaml +8 -0
  123. package/host-cp/k8s/manifests/memory-service/20-rbac.yaml +34 -0
  124. package/host-cp/k8s/manifests/memory-service/30-configmap.yaml +20 -0
  125. package/host-cp/k8s/manifests/memory-service/45-pvc.yaml +25 -0
  126. package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +121 -0
  127. package/host-cp/k8s/manifests/memory-service/60-service.yaml +21 -0
  128. package/host-cp/k8s/templates/auth-service-secret-template.yaml +28 -0
  129. package/host-cp/k8s/templates/kg-service-secret-template.yaml +28 -0
  130. package/host-cp/k8s/templates/mcp-auth-service-secret-template.yaml +28 -0
  131. package/host-cp/k8s/templates/memory-service-secret-template.yaml +29 -0
  132. package/host-cp/src/agent-runtime-trigger.mjs +7 -5
  133. package/host-cp/src/plan-chat-secret.mjs +13 -2
  134. package/host-cp/src/plan-chat-service.mjs +116 -15
  135. package/host-cp/src/server.mjs +23 -11
  136. package/host-cp/src/upgrade-spawner.mjs +10 -5
  137. package/package.json +4 -2
@@ -0,0 +1,276 @@
1
+ /**
2
+ * agent-sdk-to-chunks.ts — Claude Agent SDK event-stream → chunks substrate.
3
+ *
4
+ * The load-bearing seam for Phase C: the agent runtime is
5
+ * `@anthropic-ai/claude-agent-sdk` consuming the operator's LOCAL Claude
6
+ * Code subscription (D15). Each event the SDK emits from `query({...})`
7
+ * becomes a chunk row written via host-cp's `POST /v1/chunks` bearer
8
+ * endpoint. The sidecar server-injects `actor_id` + `actor_type` from
9
+ * the bearer principal (T3 mitigation; verified in Phase B B1/PB2).
10
+ *
11
+ * Design notes:
12
+ *
13
+ * - The adapter accepts an `AsyncIterable<SdkLikeMessage>` (NOT the
14
+ * full SDKMessage union) so it's testable with mock iterables
15
+ * without booting the real SDK. The Claude Agent SDK's `query()`
16
+ * returns an iterable matching this shape; the test harness
17
+ * supplies synthetic iterables.
18
+ *
19
+ * - `postChunk` is injected so the adapter is HTTP-transport-agnostic.
20
+ * Production wires it to `fetch(host-cp/v1/chunks, { bearer, body })`.
21
+ * Tests pass a recording mock that captures the chunk rows.
22
+ *
23
+ * - SDK message handling (the load-bearing mapping):
24
+ * - 'assistant' (SDKAssistantMessage) → extract `message.content`
25
+ * array; emit one chunk per content block.
26
+ * - text → kind='text', chunk=<text>
27
+ * - tool_use → kind='tool-call', chunk=<tool_name + input JSON>
28
+ * - other content blocks → kind='text' fallback (preserve text repr)
29
+ * - 'result' (SDKResultMessage) → end of turn; flush + return.
30
+ * - 'system' → skip (auth, session-state, hooks — not user-visible).
31
+ * - 'stream_event' (SDKPartialAssistantMessage) → SKIPPED for v1.
32
+ * Phase C C1 ships at the assistant-message granularity (not
33
+ * per-token streaming) because: (a) assistant messages already
34
+ * arrive incrementally per content block; (b) per-token writes
35
+ * would 10-100× the chunk count and may violate the K3 budget
36
+ * under sustained load; (c) operator UX renders at the message
37
+ * level via assistant-ui's `Thread` + `MultiAuthorMessage`
38
+ * (Phase C C0). Per-token streaming is a deliberate C+1
39
+ * follow-up if the demo arc needs sub-second token visibility.
40
+ *
41
+ * - Sequence numbering: caller provides `messageId` (unique per turn)
42
+ * and `seqStart` (typically 0). The adapter increments `seq` per
43
+ * emitted chunk; (messageId, seq) is the substrate's primary key
44
+ * per Phase A A1's @olam/chunks schema. Atomic at the adapter
45
+ * scope (single async loop; no concurrent writes per (messageId)).
46
+ *
47
+ * - `audit:auth-callers` posture: the SDK uses the operator's local
48
+ * Claude Code subscription (D15); it does NOT hit
49
+ * `fetch('api.anthropic.com')` from this adapter's call path. The
50
+ * adapter ITSELF only calls `postChunk` (a local HTTP POST to
51
+ * host-cp on host.docker.internal:3112). CLAUDE.md gets a one-
52
+ * paragraph clarification per D14 same-commit discipline.
53
+ *
54
+ * Source: docs/plans/olam-plan-chat-chunks-substrate/phase-c-tasks.md (C1)
55
+ */
56
/**
 * Consume a single-turn SDK message stream and write it out as chunk rows.
 *
 * - `'assistant'` messages: each entry of `message.content` is converted by
 *   `blockToChunkDraft`; non-null drafts are handed to `postChunk`.
 * - `'result'` message: one terminal `kind: 'result'` row is posted and
 *   consumption stops (later messages are never read).
 * - Any other message type is ignored.
 *
 * `seq` starts at `input.seqStart` (default 0) and advances by one after
 * every successful `postChunk`. A rejection from `postChunk` is not caught
 * here; it propagates to the caller.
 *
 * @param {object} input
 * @param {AsyncIterable<object>} input.messages - SDK-like message stream.
 * @param {string} input.worldId - Copied to `world_id` on every row.
 * @param {string} input.sessionId - Copied to `session_id` on every row.
 * @param {string} input.messageId - Copied to `message_id` on every row.
 * @param {(row: object) => Promise<void>} input.postChunk - Row sink.
 * @param {number} [input.seqStart] - First sequence number (default 0).
 * @param {() => Date} [input.now] - Clock used for `created_at`.
 * @returns {Promise<{chunksEmitted: number, endSeq: number, resultObserved: boolean, hadError: boolean}>}
 */
export async function streamSdkToChunks(input) {
    const { messages, worldId, sessionId, messageId, postChunk } = input;
    let nextSeq = input.seqStart ?? 0;
    const clock = input.now ?? (() => new Date());
    let emitted = 0;
    let sawResult = false;
    let failed = false;
    // Counters only move once the write has succeeded, so a rejected
    // postChunk never leaves a gap in the sequence.
    const emit = async (row) => {
        await postChunk(row);
        nextSeq += 1;
        emitted += 1;
    };
    for await (const msg of messages) {
        const type = msg.type;
        if (type === 'result') {
            sawResult = true;
            failed = msg.is_error === true;
            // Terminal row: lets consumers render completion/error state
            // explicitly instead of inferring it from silence.
            await emit({
                world_id: worldId,
                session_id: sessionId,
                message_id: messageId,
                seq: nextSeq,
                role: 'system',
                kind: 'result',
                chunk: JSON.stringify({
                    subtype: msg.subtype ?? null,
                    is_error: failed,
                }),
                created_at: clock().toISOString(),
            });
            break;
        }
        if (type !== 'assistant') {
            // 'system' and 'stream_event' (per-token) messages are out of
            // scope at assistant-message granularity.
            continue;
        }
        const blocks = msg.message?.content ?? [];
        for (const block of blocks) {
            const draft = blockToChunkDraft(block, {
                worldId,
                sessionId,
                messageId,
                seq: nextSeq,
                createdAt: clock().toISOString(),
            });
            if (draft !== null) {
                await emit(draft);
            }
        }
    }
    return {
        chunksEmitted: emitted,
        endSeq: nextSeq,
        resultObserved: sawResult,
        hadError: failed,
    };
}
128
/**
 * Multi-turn counterpart of `streamSdkToChunks` for long-lived streams.
 *
 * The stream is consumed until the iterable ends. A message id is obtained
 * from `allocateMessageId()` up front and again immediately after every
 * `'result'` message, at which point the per-turn sequence restarts at 0.
 * `'assistant'` content blocks are converted by `blockToChunkDraft`; each
 * `'result'` posts one terminal `kind: 'result'` row for the current turn.
 * All other message types are ignored. `postChunk` rejections propagate.
 *
 * @param {object} input
 * @param {AsyncIterable<object>} input.messages - SDK-like message stream.
 * @param {string} input.worldId - Copied to `world_id` on every row.
 * @param {string} input.sessionId - Copied to `session_id` on every row.
 * @param {() => string} input.allocateMessageId - Supplies one id per turn.
 * @param {(row: object) => Promise<void>} input.postChunk - Row sink.
 * @param {() => Date} [input.now] - Clock used for `created_at`.
 * @returns {Promise<{turnsObserved: number, chunksEmitted: number, lastError: boolean}>}
 */
export async function streamMultiTurnSdkToChunks(input) {
    const { messages, worldId, sessionId, allocateMessageId, postChunk } = input;
    const clock = input.now ?? (() => new Date());
    const tally = { turnsObserved: 0, chunksEmitted: 0, lastError: false };
    let turnId = allocateMessageId();
    let turnSeq = 0;
    // Post first, count afterwards: a failed write leaves the counters as-is.
    const emit = async (row) => {
        await postChunk(row);
        turnSeq += 1;
        tally.chunksEmitted += 1;
    };
    for await (const msg of messages) {
        switch (msg.type) {
            case 'assistant': {
                for (const block of msg.message?.content ?? []) {
                    const draft = blockToChunkDraft(block, {
                        worldId,
                        sessionId,
                        messageId: turnId,
                        seq: turnSeq,
                        createdAt: clock().toISOString(),
                    });
                    if (draft !== null) {
                        await emit(draft);
                    }
                }
                break;
            }
            case 'result': {
                tally.lastError = msg.is_error === true;
                await emit({
                    world_id: worldId,
                    session_id: sessionId,
                    message_id: turnId,
                    seq: turnSeq,
                    role: 'system',
                    kind: 'result',
                    chunk: JSON.stringify({
                        subtype: msg.subtype ?? null,
                        is_error: tally.lastError,
                    }),
                    created_at: clock().toISOString(),
                });
                tally.turnsObserved += 1;
                // Turn boundary: the next turn gets its own id and its
                // sequence starts over. The loop keeps reading.
                turnId = allocateMessageId();
                turnSeq = 0;
                break;
            }
            default:
                // 'system' and 'stream_event' are skipped.
                break;
        }
    }
    return tally;
}
204
/**
 * Translate one assistant content block into a chunk row.
 *
 * - `text` block: row with role 'assistant', kind 'text'; returns `null`
 *   when the block's `text` is falsy (empty or missing).
 * - `tool_use` block: row with role 'tool', kind 'tool-call', whose chunk
 *   is the JSON of `{ tool_use_id, name, input }`.
 * - Anything else: row with role 'assistant', kind 'text', whose chunk is
 *   the JSON of the whole block, so unknown block types are not dropped.
 *
 * @param {object} block - Content block with a `type` discriminator.
 * @param {{worldId: string, sessionId: string, messageId: string, seq: number, createdAt: string}} ctx
 * @returns {object|null} The chunk row, or `null` for an empty text block.
 */
function blockToChunkDraft(block, ctx) {
    // Shared row shape; key order is kept stable for serialisation.
    const makeRow = (role, kind, chunk) => ({
        world_id: ctx.worldId,
        session_id: ctx.sessionId,
        message_id: ctx.messageId,
        seq: ctx.seq,
        role,
        kind,
        chunk,
        created_at: ctx.createdAt,
    });
    switch (block.type) {
        case 'text': {
            const { text } = block;
            return text ? makeRow('assistant', 'text', text) : null;
        }
        case 'tool_use': {
            const payload = JSON.stringify({
                tool_use_id: block.id,
                name: block.name,
                input: block.input,
            });
            return makeRow('tool', 'tool-call', payload);
        }
        default:
            return makeRow('assistant', 'text', JSON.stringify(block));
    }
}
251
/**
 * Production transport for `postChunk`: HTTP POST to host-cp's
 * `/v1/chunks` endpoint with bearer auth. Returns a function that
 * matches `StreamSdkToChunksInput['postChunk']`.
 *
 * Trailing slashes on `opts.sidecarUrl` are trimmed before the path is
 * appended, so `http://host:3112/` and `http://host:3112` both resolve
 * to `http://host:3112/v1/chunks` (instead of `...//v1/chunks`).
 *
 * @param {object} opts
 * @param {string} opts.sidecarUrl - Base URL of the host-cp sidecar.
 * @param {string} opts.bearer - Token sent as `Authorization: Bearer …`.
 * @returns {(row: object) => Promise<void>} Poster that resolves on a 2xx
 *   response and rejects with an `Error` carrying status, status text and
 *   response body otherwise. Network-level `fetch` rejections propagate.
 */
export function makeHostCpChunkPoster(opts) {
    return async (row) => {
        // Read at call time (as before) so later changes to `opts` are seen.
        let base = String(opts.sidecarUrl);
        while (base.endsWith('/')) {
            base = base.slice(0, -1);
        }
        const res = await fetch(`${base}/v1/chunks`, {
            method: 'POST',
            headers: {
                'content-type': 'application/json',
                authorization: `Bearer ${opts.bearer}`,
            },
            body: JSON.stringify(row),
        });
        if (!res.ok) {
            // The body is diagnostic only; never let a failed read mask
            // the HTTP error itself.
            const body = await res.text().catch(() => '<no body>');
            throw new Error(`host-cp /v1/chunks POST failed: ${res.status} ${res.statusText} — ${body}`);
        }
    };
}
276
+ //# sourceMappingURL=agent-sdk-to-chunks.js.map
@@ -0,0 +1,348 @@
1
+ /**
2
+ * agent-stream-launch.ts — Phase B B6 PID-1 supervisor (with PID-file
3
+ * spawn guard added 2026-05-18 per olam-driver-runner-fix Phase A1
4
+ * finding + Phase C4 mitigation).
5
+ *
6
+ * Runs as PID 1 inside the devbox container. Forks driver + codex
7
+ * children; forwards SIGTERM with 25s drain grace; surfaces child
8
+ * exits to stderr (visible via docker logs).
9
+ *
10
+ * Spawn guard (Phase C4, 2026-05-18):
11
+ * - PID-file at `/tmp/olam-supervisor-${WORLD_ID}.pid`. On startup:
12
+ * atomic `open(path, 'wx')` create-if-not-exists. On EEXIST: read
13
+ * stored PID + check `process.kill(pid, 0)`. If alive: log + exit 0
14
+ * (no-op spawn; existing supervisor stays in charge). If dead:
15
+ * unlink + retry the atomic create.
16
+ * - On SIGTERM: unlink the PID-file before draining children.
17
+ * - Closes the gap originally documented in this file (line ~9 pre-
18
+ * C4) where "host-cp serialization will prevent double-spawn"
19
+ * turned out unreliable across host-cp restarts. Verified
20
+ * reproducible per `olam-frost-oak-9854-devbox` showing 7
21
+ * supervisors over a ~7-hour window.
22
+ *
23
+ * Surviving demo-cut simplifications:
24
+ * - NO cgroup memory polling / warning chunks.
25
+ * - NO health-probe endpoint.
26
+ * - NO selective bearer env scrubbing for lookouts.
27
+ * - Codex/driver auto-restart with simple exp-backoff cap 30s.
28
+ * - Driver crash surfaces via stderr only.
29
+ *
30
+ * Source: docs/design/olam-plan-chat-agent-runtime.md `lifecycle` +
31
+ * `supervision` + `sigterm-drain` sections + olam-driver-runner-fix
32
+ * Phase A1 + C4.
33
+ */
34
+ import { fork } from 'node:child_process';
35
+ import { createHash } from 'node:crypto';
36
+ import { existsSync, openSync, readFileSync, statSync, unlinkSync, writeFileSync } from 'node:fs';
37
+ import { dirname, join, resolve } from 'node:path';
38
+ import { fileURLToPath } from 'node:url';
39
// How long drain() waits after SIGTERM before escalating to SIGKILL.
const DRAIN_GRACE_MS = 25_000;
// Restart delay indexed by consecutive-failure count (clamped to last entry).
const BACKOFF_SCHEDULE_MS = [1_000, 2_000, 4_000, 8_000, 16_000, 30_000];
// Fallback delay should the schedule lookup ever come back undefined.
const BACKOFF_CAP_MS = 30_000;
// Sliding window used both for the disable threshold and for deciding that
// a child ran long enough to count as healthy again.
const FAILURE_WINDOW_MS = 60_000;
// This many failures inside FAILURE_WINDOW_MS disables the child for good.
const FAILURE_DISABLE_THRESHOLD = 5;
/**
 * Spawn all configured children + install SIGTERM/SIGINT/uncaughtException/
 * unhandledRejection handlers. Returns a handle for graceful shutdown.
 *
 * The supervisor process never exits on its own — it stays up until
 * drain() is called (typically via signal handler).
 *
 * Restart policy for a child that exits abnormally (non-zero code or a
 * signal) while not draining:
 *  - the failure timestamp is recorded; FAILURE_DISABLE_THRESHOLD failures
 *    within FAILURE_WINDOW_MS disable the child permanently;
 *  - otherwise it is re-forked after BACKOFF_SCHEDULE_MS[attempt], where
 *    `attempt` counts consecutive failures. The counter is reset only when
 *    the child that just died had stayed up for at least FAILURE_WINDOW_MS,
 *    so rapid crash loops actually climb the schedule.
 * A clean exit (code 0, no signal) is never restarted.
 *
 * @param {object} opts
 * @param {Array<{personaKey: string, modulePath: string}>} opts.children
 * @param {object} opts.childEnv - Merged over `process.env` for each child.
 * @param {Function} [opts.forkImpl] - Defaults to `child_process.fork`.
 * @param {object} [opts.processRef] - Event source for signals; defaults
 *   to the global `process`.
 * @returns {{drain: (exitCode?: number) => Promise<void>, draining: () => boolean, activeChildCount: () => number}}
 */
export function runSupervisor(opts) {
    const { children: childConfigs, childEnv, forkImpl = fork, processRef = process, } = opts;
    const states = new Map();
    for (const cfg of childConfigs) {
        states.set(cfg.personaKey, { child: null, attempt: 0, failures: [], disabled: false });
    }
    let isDraining = false;
    let pendingExitCode = 0;
    let drainResolve = null;
    const drainPromise = new Promise((resolve) => {
        drainResolve = resolve;
    });
    function spawnChild(cfg) {
        const state = states.get(cfg.personaKey);
        if (!state || state.disabled || isDraining)
            return;
        const env = { ...process.env, ...childEnv };
        // eslint-disable-next-line no-console
        console.error(`[supervisor] forking ${cfg.personaKey} from ${cfg.modulePath}`);
        const child = forkImpl(cfg.modulePath, [], {
            env,
            stdio: ['ignore', 'inherit', 'inherit', 'ipc'],
        });
        const startedAt = Date.now();
        state.child = child;
        child.on('exit', (code, signal) => {
            state.child = null;
            // eslint-disable-next-line no-console
            console.error(`[supervisor] ${cfg.personaKey} exited code=${code} signal=${signal}`);
            if (isDraining)
                return;
            if (code === 0 && signal === null) {
                // Clean exit — don't restart (rare for long-lived runners).
                return;
            }
            // Track failure timestamp; if 5+ within 60s, disable.
            const now = Date.now();
            state.failures = state.failures.filter((t) => now - t < FAILURE_WINDOW_MS);
            state.failures.push(now);
            if (state.failures.length >= FAILURE_DISABLE_THRESHOLD) {
                state.disabled = true;
                // eslint-disable-next-line no-console
                console.error(`[supervisor] ${cfg.personaKey} disabled (${state.failures.length} failures in ${FAILURE_WINDOW_MS}ms)`);
                return;
            }
            // A child that survived a whole failure window was healthy:
            // start the backoff schedule over for it.
            if (now - startedAt >= FAILURE_WINDOW_MS) {
                state.attempt = 0;
            }
            // Schedule restart with exp-backoff. `attempt` is deliberately
            // NOT decremented when the timer fires — doing so pinned every
            // restart to the first schedule entry.
            const delay = BACKOFF_SCHEDULE_MS[Math.min(state.attempt, BACKOFF_SCHEDULE_MS.length - 1)] ?? BACKOFF_CAP_MS;
            state.attempt += 1;
            setTimeout(() => {
                if (isDraining || state.disabled)
                    return;
                spawnChild(cfg);
            }, delay);
        });
    }
    function activeChildren() {
        const out = [];
        for (const state of states.values()) {
            if (state.child)
                out.push(state.child);
        }
        return out;
    }
    /**
     * SIGTERM every live child, wait up to DRAIN_GRACE_MS for them to exit,
     * then SIGKILL whatever is left. Idempotent: a second call returns the
     * promise that settles when the first drain finishes.
     */
    async function drain(exitCode = 0) {
        if (isDraining) {
            return drainPromise;
        }
        isDraining = true;
        pendingExitCode = exitCode;
        // eslint-disable-next-line no-console
        console.error(`[supervisor] draining children (exit code ${exitCode})`);
        for (const child of activeChildren()) {
            try {
                child.kill('SIGTERM');
            }
            catch {
                // ignore — best-effort signal delivery
            }
        }
        const deadline = Date.now() + DRAIN_GRACE_MS;
        while (activeChildren().length > 0 && Date.now() < deadline) {
            await new Promise((r) => setTimeout(r, 250));
        }
        for (const child of activeChildren()) {
            try {
                child.kill('SIGKILL');
            }
            catch {
                // ignore — best-effort signal delivery
            }
        }
        // eslint-disable-next-line no-console
        console.error(`[supervisor] drain complete; exiting code ${pendingExitCode}`);
        if (drainResolve)
            drainResolve();
        // Caller decides whether to actually call process.exit() — keeps the
        // function testable + lets the main() entry point handle it.
    }
    processRef.on('SIGTERM', () => {
        void drain(pendingExitCode);
    });
    processRef.on('SIGINT', () => {
        void drain(pendingExitCode);
    });
    processRef.on('uncaughtException', (err) => {
        // eslint-disable-next-line no-console
        console.error('[supervisor] uncaughtException — draining with exit code 1:', err);
        pendingExitCode = 1;
        void drain(1);
    });
    processRef.on('unhandledRejection', (reason) => {
        // eslint-disable-next-line no-console
        console.error('[supervisor] unhandledRejection — draining with exit code 1:', reason);
        pendingExitCode = 1;
        void drain(1);
    });
    // Spawn initial children.
    for (const cfg of childConfigs) {
        spawnChild(cfg);
    }
    return {
        drain,
        draining: () => isDraining,
        activeChildCount: () => activeChildren().length,
    };
}
177
/**
 * Phase C4 spawn guard. Atomic create-if-not-exists on a PID file at
 * `/tmp/olam-supervisor-${worldId}.pid`.
 *
 * Outcomes:
 *  - `'acquired'`       — the file did not exist (or held unparseable
 *                         content that was removed) and now holds our PID.
 *  - `'existing-alive'` — the file names another live process; the caller
 *                         should no-op exit. Nothing is modified.
 *  - `'stale-cleared'`  — the file named a dead process, or our own PID
 *                         (see below); it was replaced with our PID.
 *
 * `process.kill(pid, 0)` is a liveness probe (no signal sent; throws if
 * the PID doesn't exist or we don't own it). EPERM is treated as alive
 * (process exists but we can't signal it — still don't double-spawn).
 *
 * A stored PID equal to `process.pid` is always stale: we have not taken
 * the lock yet, so the file was left by an earlier process that had the
 * same PID (the normal case when the supervisor is PID 1 and the
 * container restarts with its filesystem intact). Probing it would find
 * ourselves alive and wrongly report `'existing-alive'`.
 *
 * The file is created via `writeFileSync(path, …, { flag: 'wx' })`, which
 * is exclusive-create and closes its descriptor, so no fd is leaked.
 *
 * Exported for unit testing; main() invokes at startup.
 *
 * @param {string} worldId - Interpolated into the PID-file name.
 * @returns {{outcome: 'acquired', pidPath: string}
 *   | {outcome: 'existing-alive', existingPid: number, pidPath: string}
 *   | {outcome: 'stale-cleared', stalePid: number, pidPath: string}}
 * @throws {Error} If the file cannot be created after stale cleanup, or on
 *   any filesystem error other than EEXIST (create) / ENOENT (unlink).
 */
export function acquireSupervisorLock(worldId) {
    const pidPath = `/tmp/olam-supervisor-${worldId}.pid`;
    // Exclusive create; true when we now own the file, false on EEXIST.
    const tryCreate = () => {
        try {
            writeFileSync(pidPath, String(process.pid), { flag: 'wx' });
            return true;
        }
        catch (err) {
            if (err.code === 'EEXIST')
                return false;
            throw err;
        }
    };
    // Remove the stale file; losing a race to another remover is fine.
    const removeStale = () => {
        try {
            unlinkSync(pidPath);
        }
        catch (err) {
            if (err.code !== 'ENOENT')
                throw err;
        }
    };
    // First attempt: atomic create.
    if (tryCreate())
        return { outcome: 'acquired', pidPath };
    // EEXIST path: read existing PID + liveness check.
    let existingPid;
    try {
        existingPid = Number.parseInt(readFileSync(pidPath, 'utf8').trim(), 10);
    }
    catch {
        // Read failed; assume stale + try unlink.
        existingPid = NaN;
    }
    if (!Number.isFinite(existingPid) || existingPid <= 0) {
        removeStale();
        if (tryCreate())
            return { outcome: 'acquired', pidPath };
        throw new Error(`spawn-guard: failed to acquire ${pidPath} after stale cleanup`);
    }
    // Liveness probe — skipped for our own PID, which is stale by definition.
    if (existingPid !== process.pid) {
        try {
            process.kill(existingPid, 0);
            // No throw → process exists.
            return { outcome: 'existing-alive', existingPid, pidPath };
        }
        catch (err) {
            if (err.code === 'EPERM') {
                // Process exists but we can't signal it; still don't double-spawn.
                return { outcome: 'existing-alive', existingPid, pidPath };
            }
            // ESRCH or similar: stale — fall through to cleanup.
        }
    }
    removeStale();
    if (tryCreate())
        return { outcome: 'stale-cleared', stalePid: existingPid, pidPath };
    throw new Error(`spawn-guard: failed to acquire ${pidPath} after stale-PID cleanup`);
}
246
/**
 * Phase A2 — write one startup line identifying the bundle file this
 * supervisor was launched from, as space-separated key=value pairs.
 *
 * Success line:
 *   `[supervisor] bundle bundle_entry=<path> bundle_sha256=<hex> bundle_mtime=<ISO>`
 * Degraded line (the file could not be read or stat'ed):
 *   `[supervisor] bundle bundle_entry=<path> bundle_sha256=unknown bundle_mtime=unknown bundle_error=<code>`
 * where `<code>` is the caught error's `code`, or `unknown` if it has none.
 *
 * Filesystem failures are reported through the degraded line rather than
 * thrown, so startup continues even when the probe fails.
 *
 * @param {object} opts
 * @param {string} opts.entryPath - File to hash and stat.
 * @param {(msg: string) => void} [opts.logger] - Sink for the line;
 *   defaults to `console.error`.
 * @returns {void}
 */
export function logBundleStartup(opts) {
    const { entryPath } = opts;
    // eslint-disable-next-line no-console
    const report = opts.logger ?? ((msg) => console.error(msg));
    const prefix = `[supervisor] bundle bundle_entry=${entryPath}`;
    try {
        const contents = readFileSync(entryPath);
        const digest = createHash('sha256').update(contents).digest('hex');
        const modifiedAt = statSync(entryPath).mtime.toISOString();
        report(`${prefix} bundle_sha256=${digest} bundle_mtime=${modifiedAt}`);
    }
    catch (err) {
        const reason = err.code ?? 'unknown';
        report(`${prefix} bundle_sha256=unknown bundle_mtime=unknown bundle_error=${reason}`);
    }
}
274
/**
 * Process entry point for the supervisor.
 *
 * Steps, in order:
 *  1. Log the bundle identity for `process.argv[1]` (when present).
 *  2. Exit 1 if any of HOST_CP_URL / HOST_CP_BEARER / WORLD_ID /
 *     SESSION_ID is missing from the environment.
 *  3. Take the per-world PID-file lock; exit 0 if another supervisor
 *     already holds it.
 *  4. Register PID-file cleanup on exit / SIGTERM / SIGINT.
 *  5. Start the `driver` and `codex` children via `runSupervisor`, passing
 *     only the required env keys as `childEnv`.
 *  6. Poll until the supervisor is draining and has no live children, then
 *     exit — with code 1 if an uncaughtException or unhandledRejection was
 *     seen (matching what the supervisor logs for that drain), else 0.
 *
 * @returns {Promise<void>} Never resolves in practice; the process exits.
 */
export async function main() {
    // Phase A2 — emit bundle identity before anything else so it lands in
    // `docker logs` even when env validation fails below.
    if (process.argv[1]) {
        logBundleStartup({ entryPath: process.argv[1] });
    }
    const required = ['HOST_CP_URL', 'HOST_CP_BEARER', 'WORLD_ID', 'SESSION_ID'];
    for (const key of required) {
        if (!process.env[key]) {
            // eslint-disable-next-line no-console
            console.error(`[supervisor] missing required env: ${key}`);
            process.exit(1);
        }
    }
    // Phase C4 spawn guard. Closes the zombie-supervisor gap surfaced by
    // olam-driver-runner-fix Phase A1 finding.
    const worldId = process.env.WORLD_ID;
    const guard = acquireSupervisorLock(worldId);
    if (guard.outcome === 'existing-alive') {
        // eslint-disable-next-line no-console
        console.error(`[supervisor] another supervisor is already running for world=${worldId} (PID ${guard.existingPid}); exiting cleanly`);
        process.exit(0);
    }
    if (guard.outcome === 'stale-cleared') {
        // eslint-disable-next-line no-console
        console.error(`[supervisor] cleared stale PID-file (was ${guard.stalePid}); acquired ${guard.pidPath}`);
    }
    // Unlink the PID-file on any exit (clean + signal-induced).
    const cleanupPidFile = () => {
        try {
            if (existsSync(guard.pidPath))
                unlinkSync(guard.pidPath);
        }
        catch {
            // swallow — best-effort cleanup
        }
    };
    process.on('exit', cleanupPidFile);
    process.on('SIGTERM', cleanupPidFile);
    process.on('SIGINT', cleanupPidFile);
    // The supervisor handle does not expose the exit code it drained with,
    // so remember here whether the drain was caused by a fatal error. The
    // supervisor installs its own handlers for the same events; these
    // listeners only record the fact.
    let exitCode = 0;
    const markFatal = () => {
        exitCode = 1;
    };
    process.on('uncaughtException', markFatal);
    process.on('unhandledRejection', markFatal);
    const childEnv = {};
    for (const k of required) {
        childEnv[k] = process.env[k];
    }
    const baseDir = process.env.AGENT_STREAM_DIR ?? dirname(fileURLToPath(import.meta.url));
    const children = [
        { personaKey: 'driver', modulePath: resolve(join(baseDir, 'driver-runner.js')) },
        { personaKey: 'codex', modulePath: resolve(join(baseDir, 'codex-runner.js')) },
    ];
    const handle = runSupervisor({ children, childEnv });
    // Keep the process alive until drain completes; then exit.
    // Drain is triggered by SIGTERM / SIGINT / uncaughtException /
    // unhandledRejection.
    await new Promise((resolve) => {
        const check = setInterval(() => {
            if (handle.draining() && handle.activeChildCount() === 0) {
                clearInterval(check);
                resolve();
            }
        }, 500);
    });
    // Signal-initiated drains exit 0; error-initiated drains exit 1.
    process.exit(exitCode);
}
338
// Run main() only when this module is the script Node was started with
// (compiled .js or source .ts), not when it is imported by tests or by
// another module.
if (typeof process !== 'undefined') {
    const entryScript = process.argv[1];
    const launchedDirectly = Boolean(entryScript) &&
        ['agent-stream-launch.js', 'agent-stream-launch.ts'].some((suffix) => entryScript.endsWith(suffix));
    if (launchedDirectly) {
        main().catch((err) => {
            // eslint-disable-next-line no-console
            console.error('[supervisor] fatal:', err);
            process.exit(1);
        });
    }
}
348
+ //# sourceMappingURL=agent-stream-launch.js.map