@vellumai/credential-executor 0.8.5 → 0.8.7-dev.202606052118.34cd356

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Dockerfile CHANGED
@@ -33,6 +33,9 @@ WORKDIR /app/credential-executor
33
33
 
34
34
  RUN apt-get update && apt-get install -y \
35
35
  ca-certificates \
36
+ e2fsprogs \
37
+ mount \
38
+ util-linux \
36
39
  && rm -rf /var/lib/apt/lists/*
37
40
 
38
41
  # Copy bun binary from builder
@@ -56,6 +59,13 @@ RUN mkdir -p /ces-data && chown ces:ces /ces-data
56
59
  # Pre-create /ces-security for credential key storage (keys.enc, store.key)
57
60
  RUN mkdir -p /ces-security && chown ces:ces /ces-security
58
61
 
62
+ COPY packages/block-volume-bootstrap/scripts/*.sh /usr/local/bin/
63
+ RUN chmod +x \
64
+ /usr/local/bin/vellum-block-volume-common.sh \
65
+ /usr/local/bin/vellum-block-volume-init.sh \
66
+ /usr/local/bin/vellum-block-volume-mount.sh \
67
+ /usr/local/bin/vellum-block-volume-resize.sh
68
+
59
69
  USER ces
60
70
 
61
71
  EXPOSE 8090
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@vellumai/credential-executor",
3
- "version": "0.8.5",
3
+ "version": "0.8.7-dev.202606052118.34cd356",
4
4
  "license": "MIT",
5
5
  "type": "module",
6
6
  "exports": {
@@ -9,11 +9,76 @@
9
9
  import { describe, expect, test } from "bun:test";
10
10
 
11
11
  import {
12
+ applyManagedCredentialRefs,
12
13
  buildLazyGetters,
13
14
  type ApiKeyRef,
14
15
  type AssistantIdRef,
15
16
  } from "../managed-lazy-getters.js";
16
17
 
18
+ // ---------------------------------------------------------------------------
19
+ // applyManagedCredentialRefs — fail-closed overwrite across sessions
20
+ // ---------------------------------------------------------------------------
21
+
22
+ describe("applyManagedCredentialRefs", () => {
23
+ test("overwrites both refs with the provided values", () => {
24
+ const apiKeyRef: ApiKeyRef = { current: "old-key" };
25
+ const assistantIdRef: AssistantIdRef = { current: "ast_old" };
26
+
27
+ applyManagedCredentialRefs(apiKeyRef, assistantIdRef, "new-key", "ast_new");
28
+
29
+ expect(apiKeyRef.current).toBe("new-key");
30
+ expect(assistantIdRef.current).toBe("ast_new");
31
+ });
32
+
33
+ test("clears a stale assistant ID when the new value is omitted", () => {
34
+ // A new session (or an API-key-only update) that does not carry an
35
+ // assistant ID must not inherit the previous session's ID.
36
+ const apiKeyRef: ApiKeyRef = { current: "prev-key" };
37
+ const assistantIdRef: AssistantIdRef = { current: "ast_prev" };
38
+
39
+ applyManagedCredentialRefs(
40
+ apiKeyRef,
41
+ assistantIdRef,
42
+ "rotated-key",
43
+ undefined,
44
+ );
45
+
46
+ expect(apiKeyRef.current).toBe("rotated-key");
47
+ expect(assistantIdRef.current).toBe("");
48
+ });
49
+
50
+ test("clears a stale API key when the new value is omitted", () => {
51
+ const apiKeyRef: ApiKeyRef = { current: "prev-key" };
52
+ const assistantIdRef: AssistantIdRef = { current: "ast_prev" };
53
+
54
+ applyManagedCredentialRefs(apiKeyRef, assistantIdRef, undefined, undefined);
55
+
56
+ expect(apiKeyRef.current).toBe("");
57
+ expect(assistantIdRef.current).toBe("");
58
+ });
59
+
60
+ test("lazy getters fail closed after a reconnect that omits the ID", () => {
61
+ const apiKeyRef: ApiKeyRef = { current: "session1-key" };
62
+ const assistantIdRef: AssistantIdRef = { current: "ast_session1" };
63
+ const { getManagedMaterializerOptions } = buildLazyGetters({
64
+ platformBaseUrl: "https://api.vellum.ai",
65
+ assistantIdRef,
66
+ apiKeyRef,
67
+ });
68
+
69
+ // A reconnecting session provides a key but no assistant ID.
70
+ applyManagedCredentialRefs(
71
+ apiKeyRef,
72
+ assistantIdRef,
73
+ "session2-key",
74
+ undefined,
75
+ );
76
+
77
+ // Materialization must not proceed with the prior session's assistant ID.
78
+ expect(getManagedMaterializerOptions()).toBeUndefined();
79
+ });
80
+ });
81
+
17
82
  // ---------------------------------------------------------------------------
18
83
  // Before API key arrives
19
84
  // ---------------------------------------------------------------------------
@@ -0,0 +1,244 @@
1
+ /**
2
+ * Managed CES reconnection test (real entrypoint subprocess).
3
+ *
4
+ * Spawns the actual `managed-main.ts` entrypoint and verifies that the CES
5
+ * sidecar survives the assistant disconnecting and accepts a reconnection,
6
+ * rather than shutting down when the RPC stream ends.
7
+ *
8
+ * This guards the core invariant that CES runs independently of whether the
9
+ * assistant is actively connected: the assistant container can crash and be
10
+ * restarted (Kubernetes restarts containers, not the whole pod), and the
11
+ * restarted assistant must be able to reconnect to a still-running CES.
12
+ */
13
+
14
+ import { afterEach, describe, expect, test } from "bun:test";
15
+ import { createConnection, createServer, type Socket } from "node:net";
16
+ import { existsSync, mkdirSync, mkdtempSync, rmSync } from "node:fs";
17
+ import { tmpdir } from "node:os";
18
+ import { join, resolve } from "node:path";
19
+
20
+ import {
21
+ CES_PROTOCOL_VERSION,
22
+ type HandshakeAck,
23
+ } from "@vellumai/service-contracts/credential-rpc";
24
+
25
+ import type { Subprocess } from "bun";
26
+
27
+ // ---------------------------------------------------------------------------
28
+ // Helpers
29
+ // ---------------------------------------------------------------------------
30
+
31
+ /**
+ * Resolve after roughly `ms` milliseconds, using a single `setTimeout`.
+ *
+ * @param ms - Delay before the returned promise resolves, in milliseconds.
+ * @returns A promise that resolves with no value once the timer fires.
+ */
32
+ function delay(ms: number): Promise<void> {
33
+ return new Promise((r) => setTimeout(r, ms));
34
+ }
35
+
36
/**
 * Ask the OS for a TCP port that is free right now.
 *
 * Binds a throwaway server to port 0 so the kernel assigns an unused port,
 * reads that port back, and closes the server before resolving. The port is
 * only known to have been free at probe time; another process could claim it
 * before the caller binds it.
 *
 * @returns The port number the kernel assigned to the probe server.
 * @throws Rejects if the probe server emits an error, or if no port number
 *   can be read back from the bound address.
 */
function pickFreePort(): Promise<number> {
  // The executor argument is named `resolvePort` so it does not shadow the
  // `resolve` helper this file imports from `node:path`.
  return new Promise((resolvePort, reject) => {
    const srv = createServer();
    // Attach the error handler before listening so a bind failure always
    // reaches `reject`.
    srv.on("error", reject);
    srv.listen(0, () => {
      const address = srv.address();
      // `address()` may be a string (pipe) or null; only an AddressInfo
      // object carries a port.
      const port = typeof address === "object" && address ? address.port : 0;
      srv.close(() =>
        port ? resolvePort(port) : reject(new Error("no port")),
      );
    });
  });
}
48
+
49
/**
 * Poll `GET /healthz` on loopback until it answers with an OK status.
 *
 * Connection failures and non-OK responses are treated as "not up yet" and
 * retried every 100 ms. Each probe is aborted once the overall deadline is
 * reached, so a request that hangs cannot keep this helper waiting past
 * `timeoutMs`.
 *
 * @param port - TCP port the health server is expected to listen on.
 * @param timeoutMs - Total time budget for all probes, in milliseconds.
 * @throws Error if no OK response is seen before the deadline.
 */
async function waitForHealth(port: number, timeoutMs = 15_000): Promise<void> {
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    // Bound this probe by whatever remains of the overall budget.
    const probe = new AbortController();
    const probeTimer = setTimeout(
      () => probe.abort(),
      Math.max(1, deadline - Date.now()),
    );
    try {
      const resp = await fetch(`http://127.0.0.1:${port}/healthz`, {
        signal: probe.signal,
      });
      if (resp.ok) return;
    } catch {
      // not up yet (connection refused, or the probe was aborted)
    } finally {
      clearTimeout(probeTimer);
    }
    await delay(100);
  }
  throw new Error(`CES health endpoint did not come up within ${timeoutMs}ms`);
}
63
+
64
+ /**
+ * Fetch `/readyz` on loopback and report whether its JSON body carries
+ * `rpcConnected: true`.
+ *
+ * The HTTP status is not inspected: the body is parsed whatever status the
+ * endpoint answers with. A missing or non-`true` field yields `false`.
+ *
+ * @param port - TCP port the health server listens on.
+ * @returns `true` only when the parsed body has `rpcConnected === true`.
+ * @throws Rejects if the request fails or the body is not valid JSON.
+ */
65
+ async function readyzRpcConnected(port: number): Promise<boolean> {
66
+ const resp = await fetch(`http://127.0.0.1:${port}/readyz`);
67
+ const body = (await resp.json()) as { rpcConnected?: boolean };
68
+ return body.rpcConnected === true;
69
+ }
70
+
71
+ /**
+ * Wait until something exists at `socketPath`, checking every 50 ms.
+ *
+ * Only the presence of the path is checked (`existsSync`); this does not
+ * prove the socket is accepting connections yet.
+ *
+ * @param socketPath - Filesystem path of the bootstrap socket.
+ * @param timeoutMs - How long to keep polling, in milliseconds.
+ * @throws Error if the path has not appeared before the deadline.
+ */
72
+ async function waitForSocket(socketPath: string, timeoutMs = 10_000): Promise<void> {
73
+ const deadline = Date.now() + timeoutMs;
74
+ while (Date.now() < deadline) {
75
+ if (existsSync(socketPath)) return;
76
+ await delay(50);
77
+ }
78
+ throw new Error(`Bootstrap socket did not appear within ${timeoutMs}ms`);
79
+ }
80
+
81
+ /**
+ * Connect to the bootstrap socket, retrying past the listen/connect race.
+ *
+ * Every attempt opens a fresh socket. When an attempt fails, that socket is
+ * destroyed and the attempt counter is incremented; the connect is retried
+ * after a fixed `baseDelayMs` pause (no backoff) as long as fewer than
+ * `maxRetries` attempts have failed and the error code is `ENOENT` or
+ * `ECONNREFUSED`. Any other error, or running out of attempts, rejects with
+ * the last error.
+ *
+ * On success all "error" listeners are removed from the socket before it is
+ * returned, so the caller is responsible for attaching its own.
+ *
+ * @param socketPath - Filesystem path of the Unix socket to connect to.
+ * @param options - `maxRetries` caps the number of failed attempts (default
+ *   40); `baseDelayMs` is the pause between attempts (default 25).
+ * @returns The connected socket.
+ */
82
+ function connectToSocket(
83
+ socketPath: string,
84
+ { maxRetries = 40, baseDelayMs = 25 } = {},
85
+ ): Promise<Socket> {
86
+ return new Promise((resolveConn, reject) => {
87
+ let attempt = 0;
88
+ const tryConnect = () => {
89
+ const sock = createConnection(socketPath, () => {
90
+ sock.removeAllListeners("error");
91
+ resolveConn(sock);
92
+ });
93
+ sock.on("error", (err: NodeJS.ErrnoException) => {
94
+ sock.destroy();
95
+ attempt++;
96
+ if (
97
+ attempt < maxRetries &&
98
+ (err.code === "ENOENT" || err.code === "ECONNREFUSED")
99
+ ) {
100
+ setTimeout(tryConnect, baseDelayMs);
101
+ } else {
102
+ reject(err);
103
+ }
104
+ });
105
+ };
106
+ tryConnect();
107
+ });
108
+ }
109
+
110
/**
 * Send a `handshake_request` on `sock` and resolve with the first
 * newline-terminated JSON line received in reply.
 *
 * Only the listeners installed here are removed when the exchange settles;
 * listeners owned by the caller are left untouched. The "error" listener is
 * intentionally left attached after settling so that a later socket error is
 * still handled (as a no-op on the already-settled promise) instead of
 * surfacing as an unhandled "error" event.
 *
 * @param sock - Connected bootstrap socket.
 * @param sessionId - Session identifier sent in the handshake request.
 * @returns The parsed ack. The shape is asserted, not validated.
 * @throws Rejects on socket error, on a reply that is not valid JSON, or when
 *   no complete line arrives within 5 seconds.
 */
function handshake(sock: Socket, sessionId: string): Promise<HandshakeAck> {
  return new Promise((resolveAck, reject) => {
    let buffer = "";

    // Stop the timeout and detach only our own data listener.
    const stopListening = (): void => {
      clearTimeout(timer);
      sock.off("data", onData);
    };

    const onData = (chunk: Buffer): void => {
      buffer += chunk.toString("utf-8");
      const idx = buffer.indexOf("\n");
      if (idx === -1) return; // ack line not complete yet
      const line = buffer.slice(0, idx).trim();
      stopListening();
      try {
        resolveAck(JSON.parse(line) as HandshakeAck);
      } catch (err) {
        reject(err as Error);
      }
    };

    const onError = (err: Error): void => {
      stopListening();
      reject(err);
    };

    const timer = setTimeout(() => {
      stopListening();
      reject(new Error("Timed out waiting for handshake ack"));
    }, 5_000);

    sock.on("data", onData);
    sock.on("error", onError);

    sock.write(
      JSON.stringify({
        type: "handshake_request",
        protocolVersion: CES_PROTOCOL_VERSION,
        sessionId,
      }) + "\n",
    );
  });
}
146
+
147
+ // ---------------------------------------------------------------------------
148
+ // Test
149
+ // ---------------------------------------------------------------------------
150
+
151
/** Scratch directory created by the current test; removed in `afterEach`. */
let tmpDir: string | undefined;
/** CES subprocess spawned by the current test; stopped in `afterEach`. */
let proc: Subprocess | undefined;

/**
 * Per-test teardown: stop the CES subprocess and delete the scratch directory.
 *
 * The child gets SIGTERM and a 3 second grace period. If it is still running
 * after that it is sent SIGKILL and awaited, so a child that ignores SIGTERM
 * cannot outlive the test run.
 */
afterEach(async () => {
  if (proc) {
    const child = proc;
    proc = undefined;

    child.kill("SIGTERM");

    let graceTimer: ReturnType<typeof setTimeout> | undefined;
    const exitedInTime = await Promise.race([
      child.exited.then(() => true),
      new Promise<boolean>((resolveGrace) => {
        graceTimer = setTimeout(() => resolveGrace(false), 3_000);
      }),
    ]);
    // Do not leave the grace timer pending once the race is decided.
    clearTimeout(graceTimer);

    if (!exitedInTime) {
      child.kill("SIGKILL");
      await child.exited;
    }
  }
  if (tmpDir) {
    try {
      rmSync(tmpDir, { recursive: true, force: true });
    } catch {
      /* ok */
    }
    tmpDir = undefined;
  }
});
169
+
170
describe("managed CES reconnection (real entrypoint)", () => {
  /**
   * Poll /readyz until `rpcConnected` equals `expected`.
   *
   * @returns `true` once the value matches, `false` if `timeoutMs` elapses
   *   first.
   */
  async function rpcConnectedBecomes(
    port: number,
    expected: boolean,
    timeoutMs = 5_000,
  ): Promise<boolean> {
    const deadline = Date.now() + timeoutMs;
    while (Date.now() < deadline) {
      if ((await readyzRpcConnected(port)) === expected) return true;
      await delay(100);
    }
    return false;
  }

  test("survives an assistant disconnect and accepts a reconnection", async () => {
    tmpDir = mkdtempSync(join(tmpdir(), "ces-reconnect-"));
    const dataDir = join(tmpDir, "ces-data");
    const socketDir = join(tmpDir, "bootstrap");
    const socketPath = join(socketDir, "ces.sock");
    const assistantDataMount = join(tmpDir, "assistant-data-ro");
    mkdirSync(dataDir, { recursive: true });
    mkdirSync(socketDir, { recursive: true });
    mkdirSync(join(assistantDataMount, ".vellum"), { recursive: true });

    const healthPort = await pickFreePort();
    const managedMain = resolve(__dirname, "..", "managed-main.ts");

    const child = Bun.spawn({
      cmd: [process.execPath, managedMain],
      env: {
        ...process.env,
        CES_MODE: "managed",
        CES_DATA_DIR: dataDir,
        CES_BOOTSTRAP_SOCKET: socketPath,
        CES_HEALTH_PORT: String(healthPort),
        CES_ASSISTANT_DATA_MOUNT: assistantDataMount,
      },
      stdout: "ignore",
      stderr: "ignore",
    });
    // Hand the child to the shared teardown hook.
    proc = child;

    // Every socket opened below is destroyed in `finally`, so a failed
    // assertion does not leave a connection open.
    const openSockets: Socket[] = [];
    try {
      // Sidecar comes up and binds the bootstrap socket.
      await waitForHealth(healthPort);
      await waitForSocket(socketPath);
      expect(await readyzRpcConnected(healthPort)).toBe(false);

      // First assistant session connects and completes a handshake.
      const first = await connectToSocket(socketPath);
      openSockets.push(first);
      const ack1 = await handshake(first, "session-1");
      expect(ack1.accepted).toBe(true);

      // Poll rather than sleep a fixed interval: /readyz may need a moment
      // to flip, and a fixed pause is either too short on a slow machine or
      // wasted time on a fast one.
      expect(await rpcConnectedBecomes(healthPort, true)).toBe(true);

      // Simulate the assistant pod crashing: drop the connection hard.
      first.destroy();

      // CES must NOT exit. It should stay healthy, flip rpcConnected back to
      // false, and re-bind the bootstrap socket to await a reconnection.
      await waitForSocket(socketPath);
      // `exitCode` stays null while the child is still running, so this
      // fails if CES exited on its own after the disconnect.
      expect(child.exitCode).toBeNull();
      const resp = await fetch(`http://127.0.0.1:${healthPort}/healthz`);
      expect(resp.ok).toBe(true);

      // Wait for the ended session's rpcConnected flag to clear before
      // reconnecting.
      expect(await rpcConnectedBecomes(healthPort, false)).toBe(true);

      // The restarted assistant reconnects and handshakes successfully.
      const second = await connectToSocket(socketPath);
      openSockets.push(second);
      const ack2 = await handshake(second, "session-2");
      expect(ack2.accepted).toBe(true);
      expect(ack2.sessionId).toBe("session-2");

      expect(await rpcConnectedBecomes(healthPort, true)).toBe(true);
    } finally {
      for (const sock of openSockets) sock.destroy();
    }
  }, 30_000);
});
@@ -32,6 +32,27 @@ export interface AssistantIdRef {
32
32
  current: string;
33
33
  }
34
34
 
35
+ /**
36
+ * Overwrite the session-scoped managed credential refs.
37
+ *
38
+ * The managed handler registry is long-lived — it persists across assistant
39
+ * reconnects — so every handshake and every `update_managed_credential` must
40
+ * fully overwrite these refs, including clearing them when a value is absent.
41
+ * Otherwise a new or reprovisioned session could keep materializing platform
42
+ * credentials with the previous session's API key or assistant ID. Absent
43
+ * values are stored as "" (fail closed). With an empty assistant ID the lazy
44
+ * getters return no options at all. With an empty API key `getAssistantApiKey`
+ * falls back to the env key, so options are still produced when an env key
+ * and an assistant ID are both available.
+ *
+ * @param apiKeyRef - Mutable holder for the session API key; always written.
+ * @param assistantIdRef - Mutable holder for the assistant ID; always written.
+ * @param apiKey - New API key, or `undefined` to reset the ref to "".
+ * @param assistantId - New assistant ID, or `undefined` to reset the ref to "".
45
+ */
46
+ export function applyManagedCredentialRefs(
47
+ apiKeyRef: ApiKeyRef,
48
+ assistantIdRef: AssistantIdRef,
49
+ apiKey: string | undefined,
50
+ assistantId: string | undefined,
51
+ ): void {
52
+ apiKeyRef.current = apiKey ?? "";
53
+ assistantIdRef.current = assistantId ?? "";
54
+ }
55
+
35
56
  export interface LazyGetterOptions {
36
57
  platformBaseUrl: string;
37
58
  assistantIdRef: AssistantIdRef;
@@ -55,10 +76,11 @@ export interface LazyGetters {
55
76
  export function buildLazyGetters(opts: LazyGetterOptions): LazyGetters {
56
77
  const { platformBaseUrl, assistantIdRef, apiKeyRef, envApiKey } = opts;
57
78
 
58
- const getAssistantApiKey = (): string =>
59
- apiKeyRef.current || envApiKey || "";
79
+ const getAssistantApiKey = (): string => apiKeyRef.current || envApiKey || "";
60
80
 
61
- const getManagedSubjectOptions = (): ManagedSubjectResolverOptions | undefined => {
81
+ const getManagedSubjectOptions = ():
82
+ | ManagedSubjectResolverOptions
83
+ | undefined => {
62
84
  const key = getAssistantApiKey();
63
85
  const id = assistantIdRef.current;
64
86
  return platformBaseUrl && key && id
@@ -66,7 +88,9 @@ export function buildLazyGetters(opts: LazyGetterOptions): LazyGetters {
66
88
  : undefined;
67
89
  };
68
90
 
69
- const getManagedMaterializerOptions = (): ManagedMaterializerOptions | undefined => {
91
+ const getManagedMaterializerOptions = ():
92
+ | ManagedMaterializerOptions
93
+ | undefined => {
70
94
  const key = getAssistantApiKey();
71
95
  const id = assistantIdRef.current;
72
96
  return platformBaseUrl && key && id
@@ -6,11 +6,15 @@
6
6
  *
7
7
  * 1. Ensures the CES-private data directories exist.
8
8
  * 2. Binds a bootstrap Unix socket on the shared bootstrap volume.
9
- * 3. Accepts exactly **one** assistant runtime connection.
9
+ * 3. Accepts a single assistant runtime connection.
10
10
  * 4. Unlinks the socket path immediately after the connection is accepted,
11
- * preventing any second process from connecting.
11
+ * preventing any second process from connecting while the session is live.
12
12
  * 5. Serves RPC on the accepted stream only.
13
- * 6. Simultaneously serves health probes (`/healthz`, `/readyz`) on a
13
+ * 6. When that session ends (the assistant disconnects or its container is
14
+ * restarted), re-binds the socket and awaits a reconnection. CES is a
15
+ * long-lived sidecar — it outlives any single assistant session and only
16
+ * shuts down on SIGTERM/SIGINT. At most one connection is ever active.
17
+ * 7. Simultaneously serves health probes (`/healthz`, `/readyz`) on a
14
18
  * dedicated HTTP port for Kubernetes liveness/readiness checks.
15
19
  *
16
20
  * The managed entrypoint never opens a generic TCP or HTTP command API.
@@ -52,6 +56,7 @@ import {
52
56
  registerCommandExecutionHandler,
53
57
  registerManageSecureCommandToolHandler,
54
58
  type RpcHandlerRegistry,
59
+ type ServeEndReason,
55
60
  type SessionIdRef,
56
61
  } from "./server.js";
57
62
  import {
@@ -67,6 +72,7 @@ import {
67
72
  parseHandle,
68
73
  } from "@vellumai/service-contracts/credential-rpc";
69
74
  import {
75
+ applyManagedCredentialRefs,
70
76
  buildLazyGetters,
71
77
  type ApiKeyRef,
72
78
  type AssistantIdRef,
@@ -118,7 +124,7 @@ function buildHandlers(
118
124
  apiKeyRef: ApiKeyRef,
119
125
  assistantIdRef: AssistantIdRef,
120
126
  secureKeyBackend: SecureKeyBackend,
121
- ): RpcHandlerRegistry {
127
+ ): { handlers: RpcHandlerRegistry; temporaryGrantStore: TemporaryGrantStore } {
122
128
  // -- Grant stores ----------------------------------------------------------
123
129
  const persistentGrantStore = new PersistentGrantStore(
124
130
  getCesGrantsDir("managed"),
@@ -140,15 +146,13 @@ function buildHandlers(
140
146
  // though handlers are built before the handshake completes.
141
147
  const platformBaseUrl = process.env["VELLUM_PLATFORM_URL"] ?? "";
142
148
 
143
- const {
144
- getManagedSubjectOptions,
145
- getManagedMaterializerOptions,
146
- } = buildLazyGetters({
147
- platformBaseUrl,
148
- assistantIdRef,
149
- apiKeyRef,
150
- envApiKey: process.env["ASSISTANT_API_KEY"] || "",
151
- });
149
+ const { getManagedSubjectOptions, getManagedMaterializerOptions } =
150
+ buildLazyGetters({
151
+ platformBaseUrl,
152
+ assistantIdRef,
153
+ apiKeyRef,
154
+ envApiKey: process.env["ASSISTANT_API_KEY"] || "",
155
+ });
152
156
 
153
157
  if (!platformBaseUrl) {
154
158
  log.warn(
@@ -189,7 +193,10 @@ function buildHandlers(
189
193
  };
190
194
 
191
195
  const localSubjectDepsStub: LocalSubjectResolverDeps = {
192
- metadataStore: { getById: () => undefined, list: () => [] } as unknown as LocalSubjectResolverDeps["metadataStore"],
196
+ metadataStore: {
197
+ getById: () => undefined,
198
+ list: () => [],
199
+ } as unknown as LocalSubjectResolverDeps["metadataStore"],
193
200
  oauthConnections: { getById: () => undefined },
194
201
  };
195
202
 
@@ -408,7 +415,7 @@ function buildHandlers(
408
415
  return { results };
409
416
  }) as (typeof handlers)[string];
410
417
 
411
- return handlers;
418
+ return { handlers, temporaryGrantStore };
412
419
  }
413
420
 
414
421
  // ---------------------------------------------------------------------------
@@ -476,13 +483,18 @@ function startHealthServer(
476
483
  }
477
484
 
478
485
  // ---------------------------------------------------------------------------
479
- // Bootstrap socket server (accepts exactly one connection)
486
+ // Bootstrap socket server (accepts one connection at a time)
480
487
  // ---------------------------------------------------------------------------
481
488
 
482
489
  /**
483
- * Listen on a Unix socket, accept exactly one connection, unlink the
484
- * socket path, and return readable/writable streams for the accepted
485
- * connection.
490
+ * Listen on a Unix socket, accept one connection, unlink the socket path,
491
+ * and return readable/writable streams for the accepted connection.
492
+ *
493
+ * The socket is unlinked while a connection is active so no second process
494
+ * can connect concurrently (only one assistant ever talks to CES at a time).
495
+ * When that session ends, the caller re-invokes this function to re-bind the
496
+ * socket and accept the assistant's reconnection — CES outlives any single
497
+ * assistant session (see `main()`).
486
498
  */
487
499
  function acceptOneConnection(
488
500
  socketPath: string,
@@ -515,16 +527,17 @@ function acceptOneConnection(
515
527
  return;
516
528
  }
517
529
 
518
- signal.addEventListener(
519
- "abort",
520
- () => {
521
- cleanup();
522
- reject(new Error("Aborted while waiting for connection"));
523
- },
524
- { once: true },
525
- );
530
+ // Remove this listener once the promise settles. Because CES re-binds
531
+ // the socket after each session ends, a long-lived AbortSignal would
532
+ // otherwise accumulate one dangling listener per reconnection.
533
+ const onAbort = () => {
534
+ cleanup();
535
+ reject(new Error("Aborted while waiting for connection"));
536
+ };
537
+ signal.addEventListener("abort", onAbort, { once: true });
526
538
 
527
539
  netServer.on("error", (err) => {
540
+ signal.removeEventListener("abort", onAbort);
528
541
  cleanup();
529
542
  reject(err);
530
543
  });
@@ -534,8 +547,10 @@ function acceptOneConnection(
534
547
  });
535
548
 
536
549
  netServer.on("connection", (socket: Socket) => {
537
- // Accept exactly one connection, then close the listener and
538
- // unlink the socket path so no other process can connect.
550
+ // Accept the connection, then close the listener and unlink the
551
+ // socket path so no other process can connect while this session
552
+ // is active.
553
+ signal.removeEventListener("abort", onAbort);
539
554
  log.info("Assistant connected via bootstrap socket");
540
555
  netServer.close();
541
556
  try {
@@ -543,7 +558,7 @@ function acceptOneConnection(
543
558
  } catch {
544
559
  // Already unlinked
545
560
  }
546
- log.info("Bootstrap socket unlinked (single-connection enforced)");
561
+ log.info("Bootstrap socket unlinked (single active connection enforced)");
547
562
 
548
563
  const readable = new Readable({
549
564
  read() {
@@ -648,76 +663,157 @@ async function main(): Promise<void> {
648
663
  startHealthServer(healthPort, controller.signal, credentialDeps);
649
664
  log.info(`Health server listening on port ${healthPort}`);
650
665
 
651
- // Wait for exactly one assistant connection on the bootstrap socket
652
- const socketPath = getBootstrapSocketPath();
653
- log.info(`Waiting for assistant connection on ${socketPath}...`);
654
-
655
- let connection: Awaited<ReturnType<typeof acceptOneConnection>>;
656
- try {
657
- connection = await acceptOneConnection(socketPath, controller.signal);
658
- } catch (err) {
659
- if (controller.signal.aborted) {
660
- log.info("Shutdown before assistant connected.");
661
- return;
662
- }
663
- throw err;
664
- }
665
-
666
- rpcConnected = true;
667
-
668
- // Build the handler registry with all available RPC implementations.
669
- // Use mutable refs so the handshake-provided session ID and API key
670
- // are available to handlers at call time (after the handshake completes).
666
+ // Build the handler registry once, up front, and reuse it across every
667
+ // assistant session. All CES state lives behind these handlers — file-backed
668
+ // grant/audit stores plus the in-memory temporary-grant store and the
669
+ // secure-command tool registry — and must be process-scoped so it survives
670
+ // an assistant reconnection. In particular, the tool registry mirrors the
671
+ // persistent toolstore on disk; rebuilding it per session would let a later
672
+ // `unregister` miss a tool registered in an earlier session and orphan its
673
+ // bundle.
674
+ //
675
+ // The in-memory temporary-grant store is the exception: `allow_once` /
676
+ // `allow_10m` grants are keyed by proposal hash only (not session), so they
677
+ // would otherwise leak ephemeral approvals across sessions. It is cleared at
678
+ // the end of every session below so a reconnecting assistant must re-prompt.
679
+ //
680
+ // The mutable refs carry the handshake-provided session ID, API key, and
681
+ // assistant ID; handlers read them at call time, so updating the refs when
682
+ // each session's handshake completes is all that's needed per connection.
671
683
  const sessionIdRef: SessionIdRef = { current: `ces-managed-${Date.now()}` };
672
684
  const apiKeyRef: ApiKeyRef = { current: "" };
673
685
  const assistantIdRef: AssistantIdRef = { current: "" };
674
- const handlers = buildHandlers(
686
+ const { handlers, temporaryGrantStore } = buildHandlers(
675
687
  sessionIdRef,
676
688
  apiKeyRef,
677
689
  assistantIdRef,
678
690
  secureKeyBackend,
679
691
  );
680
692
 
693
+ // Serve loop. CES is a long-lived sidecar that must outlive any single
694
+ // assistant session: the assistant container can crash and be restarted
695
+ // independently of the CES container (Kubernetes restarts containers, not
696
+ // the whole pod), so when the RPC stream ends we re-bind the bootstrap
697
+ // socket and wait for the assistant to reconnect rather than tearing the
698
+ // sidecar down. The loop only exits on a shutdown signal (SIGTERM/SIGINT),
699
+ // which aborts the controller.
681
700
  const rpcLog = getLogger("rpc");
682
- const server = new CesRpcServer({
683
- input: connection.readable,
684
- output: connection.writable,
685
- handlers,
686
- logger: {
687
- log: (msg: string, ...args: unknown[]) => rpcLog.info({ args }, msg),
688
- warn: (msg: string, ...args: unknown[]) => rpcLog.warn({ args }, msg),
689
- error: (msg: string, ...args: unknown[]) => rpcLog.error({ args }, msg),
690
- },
691
- signal: controller.signal,
692
- onHandshakeComplete: (hsSessionId, hsApiKey, hsAssistantId) => {
693
- sessionIdRef.current = hsSessionId;
694
- if (hsApiKey) {
695
- apiKeyRef.current = hsApiKey;
696
- log.info("Received assistant API key via handshake");
697
- }
698
- if (hsAssistantId) {
699
- assistantIdRef.current = hsAssistantId;
700
- log.info("Received assistant ID via handshake");
701
- }
702
- },
703
- onApiKeyUpdate: (newKey, newAssistantId) => {
704
- apiKeyRef.current = newKey;
705
- log.info("Assistant API key updated via RPC");
706
- if (newAssistantId) {
707
- assistantIdRef.current = newAssistantId;
708
- log.info("Assistant ID updated via RPC");
701
+ const socketPath = getBootstrapSocketPath();
702
+
703
+ while (!controller.signal.aborted) {
704
+ log.info(`Waiting for assistant connection on ${socketPath}...`);
705
+
706
+ let connection: Awaited<ReturnType<typeof acceptOneConnection>>;
707
+ try {
708
+ connection = await acceptOneConnection(socketPath, controller.signal);
709
+ } catch (err) {
710
+ if (controller.signal.aborted) {
711
+ log.info("Shutdown before assistant connected.");
712
+ return;
709
713
  }
710
- },
711
- });
714
+ throw err;
715
+ }
712
716
 
713
- const endReason = await server.serve();
717
+ rpcConnected = true;
714
718
 
715
- rpcConnected = false;
716
- log.warn(
717
- { reason: endReason, uptime: process.uptime(), pid: process.pid },
718
- "RPC session ended — shutting down",
719
- );
720
- controller.abort("rpc_session_ended");
719
+ const server = new CesRpcServer({
720
+ input: connection.readable,
721
+ output: connection.writable,
722
+ handlers,
723
+ logger: {
724
+ log: (msg: string, ...args: unknown[]) => rpcLog.info({ args }, msg),
725
+ warn: (msg: string, ...args: unknown[]) => rpcLog.warn({ args }, msg),
726
+ error: (msg: string, ...args: unknown[]) => rpcLog.error({ args }, msg),
727
+ },
728
+ signal: controller.signal,
729
+ onHandshakeComplete: (hsSessionId, hsApiKey, hsAssistantId) => {
730
+ sessionIdRef.current = hsSessionId;
731
+ // Overwrite the credential refs on every handshake. The handler
732
+ // registry persists across reconnects, so a new session that omits
733
+ // the API key / assistant ID must fail closed (falling back to the
734
+ // env key, or no key) rather than reusing the previous session's
735
+ // credentials.
736
+ applyManagedCredentialRefs(
737
+ apiKeyRef,
738
+ assistantIdRef,
739
+ hsApiKey,
740
+ hsAssistantId,
741
+ );
742
+ if (hsApiKey) {
743
+ log.info("Received assistant API key via handshake");
744
+ }
745
+ if (hsAssistantId) {
746
+ log.info("Received assistant ID via handshake");
747
+ }
748
+ },
749
+ onApiKeyUpdate: (newKey, newAssistantId) => {
750
+ // Overwrite both refs on every credential update, for the same
751
+ // fail-closed reason as the handshake: the assistant sources the
752
+ // assistant ID from the same place it sources the key, so an update
753
+ // that omits the ID means it has none — CES must clear the stale ID
754
+ // rather than keep materializing for the previous session's assistant.
755
+ applyManagedCredentialRefs(
756
+ apiKeyRef,
757
+ assistantIdRef,
758
+ newKey,
759
+ newAssistantId,
760
+ );
761
+ log.info("Assistant API key updated via RPC");
762
+ if (newAssistantId) {
763
+ log.info("Assistant ID updated via RPC");
764
+ }
765
+ },
766
+ });
767
+
768
+ // `serve()` resolves on a clean stream end or signal abort, and rejects
769
+ // when the transport stream errors — which is precisely what a hard
770
+ // disconnect (connection reset when the assistant container crashes)
771
+ // looks like. Both cases must keep the sidecar up; only a shutdown
772
+ // signal should tear it down. So treat a serve() rejection the same as
773
+ // a session end and fall through to await reconnection.
774
+ let endReason: ServeEndReason | "transport_error";
775
+ try {
776
+ endReason = await server.serve();
777
+ } catch (err) {
778
+ server.close();
779
+ endReason = "transport_error";
780
+ log.warn(
781
+ { err, uptime: process.uptime(), pid: process.pid },
782
+ "RPC transport errored — treating as session end",
783
+ );
784
+ }
785
+
786
+ rpcConnected = false;
787
+
788
+ // Drop all ephemeral approvals when the session ends. `allow_once` /
789
+ // `allow_10m` grants are keyed by proposal hash only, so reusing the
790
+ // store across a reconnect would let a pre-disconnect approval be
791
+ // consumed by a later session without re-prompting. Clearing here
792
+ // restores the prior behavior, where the process exited on stream end
793
+ // and these grants never survived.
794
+ temporaryGrantStore.clear();
795
+
796
+ // A signal-driven end means the process is shutting down; exit the loop.
797
+ // Any other end reason (the assistant disconnected, its stream closed,
798
+ // or the transport errored) means we keep the sidecar up and await a
799
+ // reconnection.
800
+ if (
801
+ controller.signal.aborted ||
802
+ endReason === "signal_aborted" ||
803
+ endReason === "signal_aborted_before_start"
804
+ ) {
805
+ log.info(
806
+ { reason: endReason, uptime: process.uptime(), pid: process.pid },
807
+ "RPC session ended due to shutdown — exiting serve loop",
808
+ );
809
+ break;
810
+ }
811
+
812
+ log.warn(
813
+ { reason: endReason, uptime: process.uptime(), pid: process.pid },
814
+ "RPC session ended (assistant disconnected) — awaiting reconnection",
815
+ );
816
+ }
721
817
  }
722
818
 
723
819
  main().catch((err) => {