@pleri/olam-cli 0.1.148 → 0.1.151
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent-stream/agent-sdk-to-chunks.js +276 -0
- package/dist/agent-stream/agent-stream-launch.js +348 -0
- package/dist/agent-stream/chunks-subscriber-transport.js +262 -0
- package/dist/agent-stream/codex-runner.js +188 -0
- package/dist/agent-stream/driver-runner.js +347 -0
- package/dist/agent-stream/operator-subscription.js +179 -0
- package/dist/commands/create.d.ts.map +1 -1
- package/dist/commands/create.js +39 -0
- package/dist/commands/create.js.map +1 -1
- package/dist/commands/doctor.d.ts +23 -0
- package/dist/commands/doctor.d.ts.map +1 -1
- package/dist/commands/doctor.js +77 -3
- package/dist/commands/doctor.js.map +1 -1
- package/dist/commands/init.d.ts +46 -0
- package/dist/commands/init.d.ts.map +1 -1
- package/dist/commands/init.js +90 -0
- package/dist/commands/init.js.map +1 -1
- package/dist/commands/kg-build.d.ts +23 -0
- package/dist/commands/kg-build.d.ts.map +1 -1
- package/dist/commands/kg-build.js +104 -2
- package/dist/commands/kg-build.js.map +1 -1
- package/dist/commands/restart.d.ts +18 -0
- package/dist/commands/restart.d.ts.map +1 -0
- package/dist/commands/restart.js +113 -0
- package/dist/commands/restart.js.map +1 -0
- package/dist/commands/setup-linux-gate.d.ts +26 -0
- package/dist/commands/setup-linux-gate.d.ts.map +1 -0
- package/dist/commands/setup-linux-gate.js +42 -0
- package/dist/commands/setup-linux-gate.js.map +1 -0
- package/dist/commands/setup-metrics.d.ts +26 -0
- package/dist/commands/setup-metrics.d.ts.map +1 -0
- package/dist/commands/setup-metrics.js +57 -0
- package/dist/commands/setup-metrics.js.map +1 -0
- package/dist/commands/setup-phase-5a-skill-source.d.ts +68 -0
- package/dist/commands/setup-phase-5a-skill-source.d.ts.map +1 -0
- package/dist/commands/setup-phase-5a-skill-source.js +196 -0
- package/dist/commands/setup-phase-5a-skill-source.js.map +1 -0
- package/dist/commands/setup-phase-5b-project-sweep.d.ts +38 -0
- package/dist/commands/setup-phase-5b-project-sweep.d.ts.map +1 -0
- package/dist/commands/setup-phase-5b-project-sweep.js +175 -0
- package/dist/commands/setup-phase-5b-project-sweep.js.map +1 -0
- package/dist/commands/setup.d.ts +19 -0
- package/dist/commands/setup.d.ts.map +1 -1
- package/dist/commands/setup.js +22 -0
- package/dist/commands/setup.js.map +1 -1
- package/dist/commands/skills-10x.d.ts +23 -0
- package/dist/commands/skills-10x.d.ts.map +1 -0
- package/dist/commands/skills-10x.js +308 -0
- package/dist/commands/skills-10x.js.map +1 -0
- package/dist/image-digests.json +7 -7
- package/dist/index.js +17878 -15826
- package/dist/index.js.map +1 -1
- package/dist/lib/build-if-stale.d.ts +33 -0
- package/dist/lib/build-if-stale.d.ts.map +1 -0
- package/dist/lib/build-if-stale.js +156 -0
- package/dist/lib/build-if-stale.js.map +1 -0
- package/dist/lib/bundle-freshness.d.ts +57 -0
- package/dist/lib/bundle-freshness.d.ts.map +1 -0
- package/dist/lib/bundle-freshness.js +223 -0
- package/dist/lib/bundle-freshness.js.map +1 -0
- package/dist/lib/bundle-source.d.ts +52 -0
- package/dist/lib/bundle-source.d.ts.map +1 -0
- package/dist/lib/bundle-source.js +83 -0
- package/dist/lib/bundle-source.js.map +1 -0
- package/dist/lib/manifest-refresh.d.ts +34 -0
- package/dist/lib/manifest-refresh.d.ts.map +1 -1
- package/dist/lib/manifest-refresh.js +66 -0
- package/dist/lib/manifest-refresh.js.map +1 -1
- package/dist/lib/upgrade-kubernetes.d.ts +17 -1
- package/dist/lib/upgrade-kubernetes.d.ts.map +1 -1
- package/dist/lib/upgrade-kubernetes.js +125 -1
- package/dist/lib/upgrade-kubernetes.js.map +1 -1
- package/dist/mcp-server.js +84 -58
- package/host-cp/compose.yaml +6 -0
- package/host-cp/k8s/manifests/30-configmap.yaml +6 -0
- package/host-cp/k8s/manifests/50-deployment.yaml +46 -9
- package/host-cp/k8s/manifests/auth-service/50-deployment.yaml +7 -4
- package/host-cp/k8s/manifests/kg-service/50-deployment.yaml +1 -1
- package/host-cp/k8s/manifests/mcp-auth-service/50-deployment.yaml +7 -4
- package/host-cp/k8s/manifests/memory-service/50-deployment.yaml +6 -1
- package/host-cp/src/agent-runtime-trigger.mjs +7 -5
- package/host-cp/src/plan-chat-secret.mjs +13 -2
- package/host-cp/src/plan-chat-service.mjs +94 -12
- package/host-cp/src/server.mjs +19 -7
- package/host-cp/src/upgrade-spawner.mjs +10 -5
- package/package.json +4 -2
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* agent-sdk-to-chunks.ts — Claude Agent SDK event-stream → chunks substrate.
|
|
3
|
+
*
|
|
4
|
+
* The load-bearing seam for Phase C: the agent runtime is
|
|
5
|
+
* `@anthropic-ai/claude-agent-sdk` consuming the operator's LOCAL Claude
|
|
6
|
+
* Code subscription (D15). Each event the SDK emits from `query({...})`
|
|
7
|
+
* becomes a chunk row written via host-cp's `POST /v1/chunks` bearer
|
|
8
|
+
* endpoint. The sidecar server-injects `actor_id` + `actor_type` from
|
|
9
|
+
* the bearer principal (T3 mitigation; verified in Phase B B1/PB2).
|
|
10
|
+
*
|
|
11
|
+
* Design notes:
|
|
12
|
+
*
|
|
13
|
+
* - The adapter accepts an `AsyncIterable<SdkLikeMessage>` (NOT the
|
|
14
|
+
* full SDKMessage union) so it's testable with mock iterables
|
|
15
|
+
* without booting the real SDK. The Claude Agent SDK's `query()`
|
|
16
|
+
* returns an iterable matching this shape; the test harness
|
|
17
|
+
* supplies synthetic iterables.
|
|
18
|
+
*
|
|
19
|
+
* - `postChunk` is injected so the adapter is HTTP-transport-agnostic.
|
|
20
|
+
* Production wires it to `fetch(host-cp/v1/chunks, { bearer, body })`.
|
|
21
|
+
* Tests pass a recording mock that captures the chunk rows.
|
|
22
|
+
*
|
|
23
|
+
* - SDK message handling (the load-bearing mapping):
|
|
24
|
+
* - 'assistant' (SDKAssistantMessage) → extract `message.content`
|
|
25
|
+
* array; emit one chunk per content block.
|
|
26
|
+
* - text → kind='text', chunk=<text>
|
|
27
|
+
* - tool_use → kind='tool-call', chunk=<tool_name + input JSON>
|
|
28
|
+
* - other content blocks → kind='text' fallback (preserve text repr)
|
|
29
|
+
* - 'result' (SDKResultMessage) → end of turn; flush + return.
|
|
30
|
+
* - 'system' → skip (auth, session-state, hooks — not user-visible).
|
|
31
|
+
* - 'stream_event' (SDKPartialAssistantMessage) → SKIPPED for v1.
|
|
32
|
+
* Phase C C1 ships at the assistant-message granularity (not
|
|
33
|
+
* per-token streaming) because: (a) assistant messages already
|
|
34
|
+
* arrive incrementally per content block; (b) per-token writes
|
|
35
|
+
* would 10-100× the chunk count and may violate the K3 budget
|
|
36
|
+
* under sustained load; (c) operator UX renders at the message
|
|
37
|
+
* level via assistant-ui's `Thread` + `MultiAuthorMessage`
|
|
38
|
+
* (Phase C C0). Per-token streaming is a deliberate C+1
|
|
39
|
+
* follow-up if the demo arc needs sub-second token visibility.
|
|
40
|
+
*
|
|
41
|
+
* - Sequence numbering: caller provides `messageId` (unique per turn)
|
|
42
|
+
* and `seqStart` (typically 0). The adapter increments `seq` per
|
|
43
|
+
* emitted chunk; (messageId, seq) is the substrate's primary key
|
|
44
|
+
* per Phase A A1's @olam/chunks schema. Atomic at the adapter
|
|
45
|
+
* scope (single async loop; no concurrent writes per (messageId)).
|
|
46
|
+
*
|
|
47
|
+
* - `audit:auth-callers` posture: the SDK uses the operator's local
|
|
48
|
+
* Claude Code subscription (D15); it does NOT hit
|
|
49
|
+
* `fetch('api.anthropic.com')` from this adapter's call path. The
|
|
50
|
+
* adapter ITSELF only calls `postChunk` (a local HTTP POST to
|
|
51
|
+
* host-cp on host.docker.internal:3112). CLAUDE.md gets a one-
|
|
52
|
+
* paragraph clarification per D14 same-commit discipline.
|
|
53
|
+
*
|
|
54
|
+
* Source: docs/plans/olam-plan-chat-chunks-substrate/phase-c-tasks.md (C1)
|
|
55
|
+
*/
|
|
56
|
+
/**
 * Consume a single-turn SDK message stream and persist it as chunk rows.
 *
 * Each content block of every `'assistant'` message is converted with
 * `blockToChunkDraft` and handed to `postChunk` (blocks that convert to
 * `null` are skipped and consume no `seq`). The first `'result'` message
 * produces one terminal `kind: 'result'` row and stops consumption. Every
 * other message type is ignored.
 *
 * Rejections from `postChunk` are not caught here; they propagate to the
 * caller.
 *
 * @param {object} input
 * @param {AsyncIterable<object>} input.messages SDK-like messages, discriminated by `type`.
 * @param {string} input.worldId
 * @param {string} input.sessionId
 * @param {string} input.messageId Written on every row emitted by this call.
 * @param {(row: object) => Promise<void>} input.postChunk Row writer.
 * @param {number} [input.seqStart=0] First `seq` value to use.
 * @param {() => Date} [input.now] Clock; defaults to `new Date()`.
 * @returns {Promise<{chunksEmitted: number, endSeq: number, resultObserved: boolean, hadError: boolean}>}
 */
export async function streamSdkToChunks(input) {
    const { messages, worldId, sessionId, messageId, postChunk } = input;
    const clock = input.now ?? (() => new Date());
    let nextSeq = input.seqStart ?? 0;
    let written = 0;
    let sawResult = false;
    let failed = false;
    // Counters advance only after the write resolved, so a rejected write
    // leaves `seq` pointing at the row that failed.
    const write = async (row) => {
        await postChunk(row);
        nextSeq += 1;
        written += 1;
    };
    for await (const msg of messages) {
        if (msg.type === 'result') {
            sawResult = true;
            failed = msg.is_error === true;
            // Explicit terminal row: consumers can render completion (or
            // error) without inferring it from the absence of events.
            await write({
                world_id: worldId,
                session_id: sessionId,
                message_id: messageId,
                seq: nextSeq,
                role: 'system',
                kind: 'result',
                chunk: JSON.stringify({
                    subtype: msg.subtype ?? null,
                    is_error: failed,
                }),
                created_at: clock().toISOString(),
            });
            break;
        }
        if (msg.type !== 'assistant') {
            // 'system' and 'stream_event' messages are intentionally skipped.
            continue;
        }
        for (const block of msg.message?.content ?? []) {
            const draft = blockToChunkDraft(block, {
                worldId,
                sessionId,
                messageId,
                seq: nextSeq,
                createdAt: clock().toISOString(),
            });
            if (draft !== null) {
                await write(draft);
            }
        }
    }
    return {
        chunksEmitted: written,
        endSeq: nextSeq,
        resultObserved: sawResult,
        hadError: failed,
    };
}
|
|
128
|
+
/**
 * Consume a long-lived, multi-turn SDK message stream and persist it as
 * chunk rows, using a separate message id for each turn.
 *
 * Compared with `streamSdkToChunks`:
 *   - a `'result'` message does not stop consumption; the loop runs until
 *     the iterable ends;
 *   - a message id is taken from `allocateMessageId()` once up front and
 *     again after every `'result'`, and `seq` restarts at 0 each time;
 *   - there is no `seqStart` option.
 *
 * `'assistant'` content blocks go through `blockToChunkDraft`; message types
 * other than `'assistant'` and `'result'` are ignored. Rejections from
 * `postChunk` propagate to the caller.
 *
 * @param {object} input
 * @param {AsyncIterable<object>} input.messages SDK-like messages, discriminated by `type`.
 * @param {string} input.worldId
 * @param {string} input.sessionId
 * @param {() => string} input.allocateMessageId Supplies the id for each turn.
 * @param {(row: object) => Promise<void>} input.postChunk Row writer.
 * @param {() => Date} [input.now] Clock; defaults to `new Date()`.
 * @returns {Promise<{turnsObserved: number, chunksEmitted: number, lastError: boolean}>}
 *   `lastError` reflects the most recent `'result'` message only.
 */
export async function streamMultiTurnSdkToChunks(input) {
    const { messages, worldId, sessionId, allocateMessageId, postChunk } = input;
    const clock = input.now ?? (() => new Date());
    let currentMessageId = allocateMessageId();
    let nextSeq = 0;
    let turns = 0;
    let written = 0;
    let mostRecentFailed = false;
    // Counters advance only after the write resolved.
    const write = async (row) => {
        await postChunk(row);
        nextSeq += 1;
        written += 1;
    };
    for await (const msg of messages) {
        if (msg.type === 'assistant') {
            for (const block of msg.message?.content ?? []) {
                const draft = blockToChunkDraft(block, {
                    worldId,
                    sessionId,
                    messageId: currentMessageId,
                    seq: nextSeq,
                    createdAt: clock().toISOString(),
                });
                if (draft !== null) {
                    await write(draft);
                }
            }
        }
        else if (msg.type === 'result') {
            mostRecentFailed = msg.is_error === true;
            await write({
                world_id: worldId,
                session_id: sessionId,
                message_id: currentMessageId,
                seq: nextSeq,
                role: 'system',
                kind: 'result',
                chunk: JSON.stringify({
                    subtype: msg.subtype ?? null,
                    is_error: mostRecentFailed,
                }),
                created_at: clock().toISOString(),
            });
            turns += 1;
            // Turn boundary: the next turn gets its own id and its rows
            // are numbered from 0 again.
            currentMessageId = allocateMessageId();
            nextSeq = 0;
        }
        // Anything else ('system', 'stream_event', ...) is skipped.
    }
    return { turnsObserved: turns, chunksEmitted: written, lastError: mostRecentFailed };
}
|
|
204
|
+
/**
 * Convert one assistant content block into a chunk row.
 *
 * - `text` blocks become `role: 'assistant', kind: 'text'` rows carrying the
 *   text itself; a block whose `text` is falsy (missing or empty) yields
 *   `null`.
 * - `tool_use` blocks become `role: 'tool', kind: 'tool-call'` rows whose
 *   `chunk` is a JSON string of `{ tool_use_id, name, input }`.
 * - Any other block type is kept as a `kind: 'text'` row holding the block's
 *   JSON representation, so nothing is dropped.
 *
 * @param {object} block Content block, discriminated by `type`.
 * @param {{worldId: string, sessionId: string, messageId: string, seq: number, createdAt: string}} ctx
 * @returns {object | null} The row to post, or `null` when there is nothing to emit.
 */
function blockToChunkDraft(block, ctx) {
    // All three row shapes share the same envelope; only role/kind/chunk vary.
    const toRow = (role, kind, chunk) => ({
        world_id: ctx.worldId,
        session_id: ctx.sessionId,
        message_id: ctx.messageId,
        seq: ctx.seq,
        role,
        kind,
        chunk,
        created_at: ctx.createdAt,
    });
    switch (block.type) {
        case 'text':
            return block.text ? toRow('assistant', 'text', block.text) : null;
        case 'tool_use':
            return toRow('tool', 'tool-call', JSON.stringify({
                tool_use_id: block.id,
                name: block.name,
                input: block.input,
            }));
        default:
            // Unknown block types are preserved verbatim as JSON text.
            return toRow('assistant', 'text', JSON.stringify(block));
    }
}
|
|
251
|
+
/**
 * Production transport for `postChunk`: HTTP POST to host-cp's
 * `/v1/chunks` endpoint with bearer auth. Returns a function that
 * matches `StreamSdkToChunksInput['postChunk']`.
 *
 * Devbox container calls this with `host.docker.internal:3112` as the
 * sidecar address; bearer is read from `~/.olam/plan-chat-secret`
 * (matches the Phase B B3 SPA bearer-channel pattern, per OQ8 lean (a)).
 *
 * @param {object} opts
 * @param {string} opts.sidecarUrl Base URL of host-cp. Trailing slashes are
 *   tolerated and stripped so the request path is always `/v1/chunks`.
 * @param {string} opts.bearer Token sent as `Authorization: Bearer <token>`.
 * @returns {(row: object) => Promise<void>} Poster that resolves on a 2xx
 *   response and rejects with an `Error` carrying the status line and
 *   response body otherwise.
 */
export function makeHostCpChunkPoster(opts) {
    return async (row) => {
        // Read per call (as before) so late changes to `opts` are honoured.
        let baseUrl = `${opts.sidecarUrl}`;
        // "http://host:3112/" + "/v1/chunks" would yield a double slash,
        // which strict routers treat as a different path.
        while (baseUrl.endsWith('/')) {
            baseUrl = baseUrl.slice(0, -1);
        }
        const res = await fetch(`${baseUrl}/v1/chunks`, {
            method: 'POST',
            headers: {
                'content-type': 'application/json',
                authorization: `Bearer ${opts.bearer}`,
            },
            body: JSON.stringify(row),
        });
        if (!res.ok) {
            // Reading the body is best-effort; the status line alone is
            // still enough to diagnose the failure.
            const body = await res.text().catch(() => '<no body>');
            throw new Error(`host-cp /v1/chunks POST failed: ${res.status} ${res.statusText} — ${body}`);
        }
    };
}
|
|
276
|
+
//# sourceMappingURL=agent-sdk-to-chunks.js.map
|
|
@@ -0,0 +1,348 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* agent-stream-launch.ts — Phase B B6 PID-1 supervisor (with PID-file
|
|
3
|
+
* spawn guard added 2026-05-18 per olam-driver-runner-fix Phase A1
|
|
4
|
+
* finding + Phase C4 mitigation).
|
|
5
|
+
*
|
|
6
|
+
* Runs as PID 1 inside the devbox container. Forks driver + codex
|
|
7
|
+
* children; forwards SIGTERM with 25s drain grace; surfaces child
|
|
8
|
+
* exits to stderr (visible via docker logs).
|
|
9
|
+
*
|
|
10
|
+
* Spawn guard (Phase C4, 2026-05-18):
|
|
11
|
+
* - PID-file at `/tmp/olam-supervisor-${WORLD_ID}.pid`. On startup:
|
|
12
|
+
* atomic `open(path, 'wx')` create-if-not-exists. On EEXIST: read
|
|
13
|
+
* stored PID + check `process.kill(pid, 0)`. If alive: log + exit 0
|
|
14
|
+
* (no-op spawn; existing supervisor stays in charge). If dead:
|
|
15
|
+
* unlink + retry the atomic create.
|
|
16
|
+
* - On SIGTERM: unlink the PID-file before draining children.
|
|
17
|
+
* - Closes the gap originally documented in this file (line ~9 pre-
|
|
18
|
+
* C4) where "host-cp serialization will prevent double-spawn"
|
|
19
|
+
* turned out unreliable across host-cp restarts. Verified
|
|
20
|
+
* reproducible per `olam-frost-oak-9854-devbox` showing 7
|
|
21
|
+
* supervisors over a ~7-hour window.
|
|
22
|
+
*
|
|
23
|
+
* Surviving demo-cut simplifications:
|
|
24
|
+
* - NO cgroup memory polling / warning chunks.
|
|
25
|
+
* - NO health-probe endpoint.
|
|
26
|
+
* - NO selective bearer env scrubbing for lookouts.
|
|
27
|
+
* - Codex/driver auto-restart with simple exp-backoff cap 30s.
|
|
28
|
+
* - Driver crash surfaces via stderr only.
|
|
29
|
+
*
|
|
30
|
+
* Source: docs/design/olam-plan-chat-agent-runtime.md `lifecycle` +
|
|
31
|
+
* `supervision` + `sigterm-drain` sections + olam-driver-runner-fix
|
|
32
|
+
* Phase A1 + C4.
|
|
33
|
+
*/
|
|
34
|
+
import { fork } from 'node:child_process';
|
|
35
|
+
import { createHash } from 'node:crypto';
|
|
36
|
+
import { existsSync, openSync, readFileSync, statSync, unlinkSync, writeFileSync } from 'node:fs';
|
|
37
|
+
import { dirname, join, resolve } from 'node:path';
|
|
38
|
+
import { fileURLToPath } from 'node:url';
|
|
39
|
+
// Supervisor timing knobs. All durations are in milliseconds.
// How long drain waits after SIGTERM before sending SIGKILL.
const DRAIN_GRACE_MS = 25 * 1000;
// Restart delays, indexed by a child's current attempt counter.
const BACKOFF_SCHEDULE_MS = [1, 2, 4, 8, 16, 30].map((seconds) => seconds * 1000);
// Fallback delay used if the schedule lookup yields nothing.
const BACKOFF_CAP_MS = 30 * 1000;
// A child is disabled once it fails FAILURE_DISABLE_THRESHOLD times within
// FAILURE_WINDOW_MS.
const FAILURE_WINDOW_MS = 60 * 1000;
const FAILURE_DISABLE_THRESHOLD = 5;
|
|
44
|
+
/**
 * Spawn all configured children + install SIGTERM/SIGINT/uncaughtException/
 * unhandledRejection handlers. Returns a handle for graceful shutdown.
 *
 * The supervisor never exits on its own: `drain()` only signals children
 * and resolves; calling `process.exit()` is left to the caller.
 *
 * Restart policy for a child that exits abnormally (non-zero code or a
 * signal) while not draining:
 *   - failures are counted in a sliding FAILURE_WINDOW_MS window; reaching
 *     FAILURE_DISABLE_THRESHOLD disables the child permanently;
 *   - otherwise it is re-forked after BACKOFF_SCHEDULE_MS[attempt]. The
 *     attempt counter climbs with each consecutive failure and returns to 0
 *     once a failure arrives with no other failure in the window.
 * A clean exit (code 0, no signal) is never restarted.
 *
 * @param {object} opts
 * @param {Array<{personaKey: string, modulePath: string}>} opts.children
 * @param {Record<string, string>} opts.childEnv Variables layered on top of
 *   the supervisor's own environment for every child.
 * @param {Function} [opts.forkImpl] `child_process.fork` replacement (tests).
 * @param {object} [opts.processRef] `process` replacement (tests). Signal and
 *   error handlers are attached to it and its `env` seeds the child
 *   environment; when it has no `env`, the real `process.env` is used.
 * @returns {{drain: (exitCode?: number) => Promise<void>, draining: () => boolean, activeChildCount: () => number}}
 */
export function runSupervisor(opts) {
    const { children: childConfigs, childEnv, forkImpl = fork, processRef = process, } = opts;
    const states = new Map();
    for (const cfg of childConfigs) {
        states.set(cfg.personaKey, { child: null, attempt: 0, failures: [], disabled: false });
    }
    let isDraining = false;
    let pendingExitCode = 0;
    let drainResolve = null;
    const drainPromise = new Promise((resolve) => {
        drainResolve = resolve;
    });
    function spawnChild(cfg) {
        const state = states.get(cfg.personaKey);
        if (!state || state.disabled || isDraining)
            return;
        // Use the injected process reference consistently (it is what the
        // handlers below are attached to); fall back to the real environment
        // for mocks that do not carry `env`.
        const env = { ...(processRef.env ?? process.env), ...childEnv };
        // eslint-disable-next-line no-console
        console.error(`[supervisor] forking ${cfg.personaKey} from ${cfg.modulePath}`);
        const child = forkImpl(cfg.modulePath, [], {
            env,
            stdio: ['ignore', 'inherit', 'inherit', 'ipc'],
        });
        state.child = child;
        child.on('exit', (code, signal) => {
            state.child = null;
            // eslint-disable-next-line no-console
            console.error(`[supervisor] ${cfg.personaKey} exited code=${code} signal=${signal}`);
            if (isDraining)
                return;
            if (code === 0 && signal === null) {
                // Clean exit — don't restart (rare for long-lived runners).
                return;
            }
            // Track failure timestamp; if 5+ within 60s, disable.
            const now = Date.now();
            state.failures = state.failures.filter((t) => now - t < FAILURE_WINDOW_MS);
            if (state.failures.length === 0) {
                // No other failure inside the window: this is the start of a
                // new failure streak, so begin the backoff ladder again.
                state.attempt = 0;
            }
            state.failures.push(now);
            if (state.failures.length >= FAILURE_DISABLE_THRESHOLD) {
                state.disabled = true;
                // eslint-disable-next-line no-console
                console.error(`[supervisor] ${cfg.personaKey} disabled (${state.failures.length} failures in ${FAILURE_WINDOW_MS}ms)`);
                return;
            }
            // Schedule restart with exp-backoff. The attempt counter is kept
            // across the restart: decrementing it again before respawn (as
            // was done previously) pinned every delay to the first slot.
            const delay = BACKOFF_SCHEDULE_MS[Math.min(state.attempt, BACKOFF_SCHEDULE_MS.length - 1)] ?? BACKOFF_CAP_MS;
            state.attempt += 1;
            setTimeout(() => {
                if (isDraining || state.disabled)
                    return;
                spawnChild(cfg);
            }, delay);
        });
    }
    function activeChildren() {
        const out = [];
        for (const state of states.values()) {
            if (state.child)
                out.push(state.child);
        }
        return out;
    }
    async function drain(exitCode = 0) {
        if (isDraining) {
            // Second and later callers share the first drain's completion.
            return drainPromise;
        }
        isDraining = true;
        pendingExitCode = exitCode;
        // eslint-disable-next-line no-console
        console.error(`[supervisor] draining children (exit code ${exitCode})`);
        for (const child of activeChildren()) {
            try {
                child.kill('SIGTERM');
            }
            catch {
                // ignore — child may already be gone
            }
        }
        // Poll until every child has reported 'exit' or the grace period ends.
        const deadline = Date.now() + DRAIN_GRACE_MS;
        while (activeChildren().length > 0 && Date.now() < deadline) {
            await new Promise((r) => setTimeout(r, 250));
        }
        // Anything still running after the grace period is force-killed.
        for (const child of activeChildren()) {
            try {
                child.kill('SIGKILL');
            }
            catch {
                // ignore — child may already be gone
            }
        }
        // eslint-disable-next-line no-console
        console.error(`[supervisor] drain complete; exiting code ${pendingExitCode}`);
        if (drainResolve)
            drainResolve();
        // Caller decides whether to actually call process.exit() — keeps the
        // function testable + lets the main() entry point handle it.
    }
    processRef.on('SIGTERM', () => {
        void drain(pendingExitCode);
    });
    processRef.on('SIGINT', () => {
        void drain(pendingExitCode);
    });
    processRef.on('uncaughtException', (err) => {
        // eslint-disable-next-line no-console
        console.error('[supervisor] uncaughtException — draining with exit code 1:', err);
        pendingExitCode = 1;
        void drain(1);
    });
    processRef.on('unhandledRejection', (reason) => {
        // eslint-disable-next-line no-console
        console.error('[supervisor] unhandledRejection — draining with exit code 1:', reason);
        pendingExitCode = 1;
        void drain(1);
    });
    // Spawn initial children.
    for (const cfg of childConfigs) {
        spawnChild(cfg);
    }
    return {
        drain,
        draining: () => isDraining,
        activeChildCount: () => activeChildren().length,
    };
}
|
|
177
|
+
/**
 * Phase C4 spawn guard. Atomic create-if-not-exists on a PID file at
 * `/tmp/olam-supervisor-${worldId}.pid`.
 *
 * Outcomes:
 *   - `'acquired'`: the file did not exist (or held no usable PID and was
 *     replaced) and now contains this process's PID.
 *   - `'existing-alive'`: the file names a different process that is still
 *     running; the caller should no-op exit.
 *   - `'stale-cleared'`: the file named a dead process, or this process's
 *     own PID, and was replaced.
 *
 * `process.kill(pid, 0)` is a liveness probe (no signal sent; throws if
 * the PID doesn't exist or we may not signal it). EPERM is treated as alive
 * (process exists but we can't signal it — still don't double-spawn).
 *
 * A stored PID equal to `process.pid` is always stale: no other live
 * process can hold our PID, so the file was left by an earlier process that
 * had the same PID (e.g. a PID-1 supervisor in a restarted container).
 * Probing it would test our own liveness and wrongly report
 * `'existing-alive'`.
 *
 * NOTE(review): create and write are not one atomic step, so a concurrent
 * starter may briefly see an empty file and treat it as stale — confirm
 * whether launches can overlap that closely.
 *
 * Exported for unit testing; main() invokes at startup.
 *
 * @param {string} worldId Identifier embedded in the PID-file name.
 * @returns {{outcome: 'acquired', pidPath: string}
 *   | {outcome: 'existing-alive', existingPid: number, pidPath: string}
 *   | {outcome: 'stale-cleared', stalePid: number, pidPath: string}}
 * @throws {Error} When the file cannot be created after stale cleanup, or on
 *   any filesystem error other than EEXIST during creation.
 */
export function acquireSupervisorLock(worldId) {
    const pidPath = `/tmp/olam-supervisor-${worldId}.pid`;
    const tryAcquire = () => {
        try {
            // Flag 'wx' fails with EEXIST if the file is already there.
            // Passing the path (not a descriptor) lets writeFileSync open,
            // write and close in one call, so no descriptor is leaked.
            writeFileSync(pidPath, String(process.pid), { flag: 'wx' });
            return { outcome: 'acquired', pidPath };
        }
        catch (err) {
            if (err.code === 'EEXIST')
                return null;
            throw err;
        }
    };
    // Remove a stale file and create our own; `null` if someone beat us to it.
    const replaceStale = () => {
        if (existsSync(pidPath))
            unlinkSync(pidPath);
        return tryAcquire();
    };
    // First attempt: atomic create.
    const first = tryAcquire();
    if (first)
        return first;
    // EEXIST path: read existing PID + liveness check.
    let existingPid;
    try {
        existingPid = Number.parseInt(readFileSync(pidPath, 'utf8').trim(), 10);
    }
    catch {
        // Read failed; assume stale + try unlink.
        existingPid = NaN;
    }
    if (!Number.isFinite(existingPid) || existingPid <= 0) {
        const retry = replaceStale();
        if (retry)
            return retry;
        throw new Error(`spawn-guard: failed to acquire ${pidPath} after stale cleanup`);
    }
    if (existingPid !== process.pid) {
        // Liveness probe (skipped for our own PID, see above).
        try {
            process.kill(existingPid, 0);
            // No throw → process exists.
            return { outcome: 'existing-alive', existingPid, pidPath };
        }
        catch (err) {
            if (err.code === 'EPERM') {
                // Process exists but we can't signal it; still don't double-spawn.
                return { outcome: 'existing-alive', existingPid, pidPath };
            }
            // ESRCH or similar: stale — fall through to cleanup.
        }
    }
    if (replaceStale())
        return { outcome: 'stale-cleared', stalePid: existingPid, pidPath };
    throw new Error(`spawn-guard: failed to acquire ${pidPath} after stale-PID cleanup`);
}
|
|
246
|
+
/**
 * Phase A2 — log one startup line identifying the bundle file this
 * supervisor was launched from.
 *
 * Success line:
 *   `[supervisor] bundle bundle_entry=<path> bundle_sha256=<hex> bundle_mtime=<ISO>`
 * where the hash is the SHA-256 of the file's bytes and the mtime comes from
 * `statSync`.
 *
 * If anything in that sequence throws (e.g. the file cannot be read), a
 * degraded line is logged instead, with `bundle_sha256=unknown
 * bundle_mtime=unknown` plus `bundle_error=<err.code or 'unknown'>`. The
 * function itself does not rethrow, so startup can proceed.
 *
 * @param {object} opts
 * @param {string} opts.entryPath File to fingerprint.
 * @param {(msg: string) => void} [opts.logger] Sink for the line; defaults to
 *   `console.error`.
 * @returns {void}
 */
export function logBundleStartup(opts) {
    const { entryPath } = opts;
    // eslint-disable-next-line no-console
    const emit = opts.logger ?? ((msg) => console.error(msg));
    try {
        const digest = createHash('sha256').update(readFileSync(entryPath)).digest('hex');
        const { mtime } = statSync(entryPath);
        emit(`[supervisor] bundle bundle_entry=${entryPath} bundle_sha256=${digest} bundle_mtime=${mtime.toISOString()}`);
    }
    catch (err) {
        const reason = err.code ?? 'unknown';
        emit(`[supervisor] bundle bundle_entry=${entryPath} bundle_sha256=unknown bundle_mtime=unknown bundle_error=${reason}`);
    }
}
|
|
274
|
+
/**
 * Supervisor entry point.
 *
 * Steps, in order:
 *   1. Log the bundle identity (`logBundleStartup`) so it is emitted even
 *      when env validation fails.
 *   2. Require HOST_CP_URL, HOST_CP_BEARER, WORLD_ID and SESSION_ID; exit 1
 *      on the first one missing.
 *   3. Take the per-world PID-file lock; exit 0 if another supervisor
 *      already holds it.
 *   4. Register best-effort PID-file removal on exit / SIGTERM / SIGINT.
 *   5. Start the driver + codex children via `runSupervisor`, then wait
 *      until a drain has started and no children remain.
 *   6. Exit with 1 if an uncaughtException / unhandledRejection was seen,
 *      otherwise 0.
 *
 * @returns {Promise<void>} Does not resolve in practice: every path ends in
 *   `process.exit()`.
 */
export async function main() {
    // Phase A2 — emit bundle identity before anything else so it lands in
    // `docker logs` even when env validation fails below.
    if (process.argv[1]) {
        logBundleStartup({ entryPath: process.argv[1] });
    }
    const required = ['HOST_CP_URL', 'HOST_CP_BEARER', 'WORLD_ID', 'SESSION_ID'];
    for (const key of required) {
        if (!process.env[key]) {
            // eslint-disable-next-line no-console
            console.error(`[supervisor] missing required env: ${key}`);
            process.exit(1);
        }
    }
    // Phase C4 spawn guard. Closes the zombie-supervisor gap surfaced by
    // olam-driver-runner-fix Phase A1 finding.
    const worldId = process.env.WORLD_ID;
    const guard = acquireSupervisorLock(worldId);
    if (guard.outcome === 'existing-alive') {
        // eslint-disable-next-line no-console
        console.error(`[supervisor] another supervisor is already running for world=${worldId} (PID ${guard.existingPid}); exiting cleanly`);
        process.exit(0);
    }
    if (guard.outcome === 'stale-cleared') {
        // eslint-disable-next-line no-console
        console.error(`[supervisor] cleared stale PID-file (was ${guard.stalePid}); acquired ${guard.pidPath}`);
    }
    // Unlink the PID-file on any exit (clean + signal-induced).
    const cleanupPidFile = () => {
        try {
            if (existsSync(guard.pidPath))
                unlinkSync(guard.pidPath);
        }
        catch {
            // swallow — best-effort cleanup
        }
    };
    process.on('exit', cleanupPidFile);
    process.on('SIGTERM', cleanupPidFile);
    process.on('SIGINT', cleanupPidFile);
    // The supervisor handle does not expose the exit code it drained with,
    // so record crash-triggered drains here. Without this the process
    // reported success (exit 0) even after an uncaught error.
    let exitCode = 0;
    const markCrashed = () => {
        exitCode = 1;
    };
    process.on('uncaughtException', markCrashed);
    process.on('unhandledRejection', markCrashed);
    // Only the validated variables are layered explicitly onto the children.
    const childEnv = {};
    for (const k of required) {
        childEnv[k] = process.env[k];
    }
    const baseDir = process.env.AGENT_STREAM_DIR ?? dirname(fileURLToPath(import.meta.url));
    const children = [
        { personaKey: 'driver', modulePath: resolve(join(baseDir, 'driver-runner.js')) },
        { personaKey: 'codex', modulePath: resolve(join(baseDir, 'codex-runner.js')) },
    ];
    const handle = runSupervisor({ children, childEnv });
    // Keep the process alive until drain completes; then exit.
    // Drain is triggered by SIGTERM / SIGINT / uncaughtException /
    // unhandledRejection.
    await new Promise((resolve) => {
        const check = setInterval(() => {
            if (handle.draining() && handle.activeChildCount() === 0) {
                clearInterval(check);
                resolve();
            }
        }, 500);
    });
    process.exit(exitCode);
}
|
|
338
|
+
// Auto-run main() only when this file is the script Node was started with
// (compiled .js or source .ts); importing the module has no side effects.
const launchedScript = typeof process !== 'undefined' ? process.argv[1] : undefined;
const isSupervisorEntry = Boolean(launchedScript) &&
    ['agent-stream-launch.js', 'agent-stream-launch.ts'].some((suffix) => launchedScript.endsWith(suffix));
if (isSupervisorEntry) {
    main().catch((err) => {
        // eslint-disable-next-line no-console
        console.error('[supervisor] fatal:', err);
        process.exit(1);
    });
}
|
|
348
|
+
//# sourceMappingURL=agent-stream-launch.js.map
|