@vellumai/credential-executor 0.8.4 → 0.8.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,11 +6,15 @@
6
6
  *
7
7
  * 1. Ensures the CES-private data directories exist.
8
8
  * 2. Binds a bootstrap Unix socket on the shared bootstrap volume.
9
- * 3. Accepts exactly **one** assistant runtime connection.
9
+ * 3. Accepts a single assistant runtime connection.
10
10
  * 4. Unlinks the socket path immediately after the connection is accepted,
11
- * preventing any second process from connecting.
11
+ * preventing any second process from connecting while the session is live.
12
12
  * 5. Serves RPC on the accepted stream only.
13
- * 6. Simultaneously serves health probes (`/healthz`, `/readyz`) on a
13
+ * 6. When that session ends (the assistant disconnects or its container is
14
+ * restarted), re-binds the socket and awaits a reconnection. CES is a
15
+ * long-lived sidecar — it outlives any single assistant session and only
16
+ * shuts down on SIGTERM/SIGINT. At most one connection is ever active.
17
+ * 7. Simultaneously serves health probes (`/healthz`, `/readyz`) on a
14
18
  * dedicated HTTP port for Kubernetes liveness/readiness checks.
15
19
  *
16
20
  * The managed entrypoint never opens a generic TCP or HTTP command API.
@@ -22,7 +26,10 @@ import { createServer as createNetServer, type Socket } from "node:net";
22
26
  import { dirname, join } from "node:path";
23
27
  import { Readable, Writable } from "node:stream";
24
28
 
25
- import { CES_PROTOCOL_VERSION, CesRpcMethod } from "@vellumai/service-contracts/credential-rpc";
29
+ import {
30
+ CES_PROTOCOL_VERSION,
31
+ CesRpcMethod,
32
+ } from "@vellumai/service-contracts/credential-rpc";
26
33
 
27
34
  import { AuditStore } from "./audit/store.js";
28
35
  import { PersistentGrantStore } from "./grants/persistent-store.js";
@@ -49,19 +56,36 @@ import {
49
56
  registerCommandExecutionHandler,
50
57
  registerManageSecureCommandToolHandler,
51
58
  type RpcHandlerRegistry,
59
+ type ServeEndReason,
52
60
  type SessionIdRef,
53
61
  } from "./server.js";
54
- import { deleteBundleFromToolstore, publishBundle } from "./toolstore/publish.js";
62
+ import {
63
+ deleteBundleFromToolstore,
64
+ publishBundle,
65
+ } from "./toolstore/publish.js";
55
66
  import { validateSourceUrl } from "./toolstore/manifest.js";
56
67
  import { buildCesEgressHooks } from "./commands/egress-hooks.js";
57
68
  import { resolveManagedSubject } from "./subjects/managed.js";
58
69
  import { materializeManagedToken } from "./materializers/managed-platform.js";
59
- import { HandleType, parseHandle } from "@vellumai/service-contracts/credential-rpc";
60
- import { buildLazyGetters, type ApiKeyRef, type AssistantIdRef } from "./managed-lazy-getters.js";
70
+ import {
71
+ HandleType,
72
+ parseHandle,
73
+ } from "@vellumai/service-contracts/credential-rpc";
74
+ import {
75
+ applyManagedCredentialRefs,
76
+ buildLazyGetters,
77
+ type ApiKeyRef,
78
+ type AssistantIdRef,
79
+ } from "./managed-lazy-getters.js";
61
80
  import { MANAGED_LOCAL_STATIC_REJECTION_ERROR } from "./managed-errors.js";
62
81
  import type { SecureKeyBackend } from "@vellumai/credential-storage";
63
82
  import { createLocalSecureKeyBackend } from "./materializers/local-secure-key-backend.js";
64
- import { handleCredentialRoute, type CredentialRouteDeps } from "./http/credential-routes.js";
83
+ import type { LocalMaterialiser } from "./materializers/local.js";
84
+ import type { LocalSubjectResolverDeps } from "./subjects/local.js";
85
+ import {
86
+ handleCredentialRoute,
87
+ type CredentialRouteDeps,
88
+ } from "./http/credential-routes.js";
65
89
  import { handleLogExportRoute } from "./http/log-export-routes.js";
66
90
  import { CES_MIGRATIONS } from "./migrations/registry.js";
67
91
  import { runCesMigrations } from "./migrations/runner.js";
@@ -95,7 +119,12 @@ function ensureDataDirs(): void {
95
119
  // Build RPC handler registry (managed mode)
96
120
  // ---------------------------------------------------------------------------
97
121
 
98
- function buildHandlers(sessionIdRef: SessionIdRef, apiKeyRef: ApiKeyRef, assistantIdRef: AssistantIdRef, secureKeyBackend: SecureKeyBackend): RpcHandlerRegistry {
122
+ function buildHandlers(
123
+ sessionIdRef: SessionIdRef,
124
+ apiKeyRef: ApiKeyRef,
125
+ assistantIdRef: AssistantIdRef,
126
+ secureKeyBackend: SecureKeyBackend,
127
+ ): { handlers: RpcHandlerRegistry; temporaryGrantStore: TemporaryGrantStore } {
99
128
  // -- Grant stores ----------------------------------------------------------
100
129
  const persistentGrantStore = new PersistentGrantStore(
101
130
  getCesGrantsDir("managed"),
@@ -117,7 +146,7 @@ function buildHandlers(sessionIdRef: SessionIdRef, apiKeyRef: ApiKeyRef, assista
117
146
  // though handlers are built before the handshake completes.
118
147
  const platformBaseUrl = process.env["VELLUM_PLATFORM_URL"] ?? "";
119
148
 
120
- const { getAssistantApiKey, getManagedSubjectOptions, getManagedMaterializerOptions } =
149
+ const { getManagedSubjectOptions, getManagedMaterializerOptions } =
121
150
  buildLazyGetters({
122
151
  platformBaseUrl,
123
152
  assistantIdRef,
@@ -135,11 +164,13 @@ function buildHandlers(sessionIdRef: SessionIdRef, apiKeyRef: ApiKeyRef, assista
135
164
  // -- Workspace root for command execution cwd ------------------------------
136
165
  // Use VELLUM_WORKSPACE_DIR when set, otherwise fall back to the legacy
137
166
  // path derived from the assistant data mount.
138
- const defaultWorkspaceDir = process.env["VELLUM_WORKSPACE_DIR"] ?? (() => {
139
- const assistantDataMount =
140
- process.env["CES_ASSISTANT_DATA_MOUNT"] ?? "/assistant-data-ro";
141
- return join(join(assistantDataMount, ".vellum"), "workspace");
142
- })();
167
+ const defaultWorkspaceDir =
168
+ process.env["VELLUM_WORKSPACE_DIR"] ??
169
+ (() => {
170
+ const assistantDataMount =
171
+ process.env["CES_ASSISTANT_DATA_MOUNT"] ?? "/assistant-data-ro";
172
+ return join(join(assistantDataMount, ".vellum"), "workspace");
173
+ })();
143
174
 
144
175
  // -- Build handler registry ------------------------------------------------
145
176
  // NOTE: local_static credential handles are NOT supported in managed mode.
@@ -161,8 +192,11 @@ function buildHandlers(sessionIdRef: SessionIdRef, apiKeyRef: ApiKeyRef, assista
161
192
  }),
162
193
  };
163
194
 
164
- const localSubjectDepsStub = {
165
- metadataStore: { getById: () => undefined, list: () => [] } as any,
195
+ const localSubjectDepsStub: LocalSubjectResolverDeps = {
196
+ metadataStore: {
197
+ getById: () => undefined,
198
+ list: () => [],
199
+ } as unknown as LocalSubjectResolverDeps["metadataStore"],
166
200
  oauthConnections: { getById: () => undefined },
167
201
  };
168
202
 
@@ -171,10 +205,14 @@ function buildHandlers(sessionIdRef: SessionIdRef, apiKeyRef: ApiKeyRef, assista
171
205
  const httpDeps = {
172
206
  persistentGrantStore,
173
207
  temporaryGrantStore,
174
- localMaterialiser: localMaterialiserStub as any,
208
+ localMaterialiser: localMaterialiserStub as unknown as LocalMaterialiser,
175
209
  localSubjectDeps: localSubjectDepsStub,
176
- get managedSubjectOptions() { return getManagedSubjectOptions(); },
177
- get managedMaterializerOptions() { return getManagedMaterializerOptions(); },
210
+ get managedSubjectOptions() {
211
+ return getManagedSubjectOptions();
212
+ },
213
+ get managedMaterializerOptions() {
214
+ return getManagedMaterializerOptions();
215
+ },
178
216
  auditStore,
179
217
  sessionId: sessionIdRef,
180
218
  };
@@ -215,10 +253,7 @@ function buildHandlers(sessionIdRef: SessionIdRef, apiKeyRef: ApiKeyRef, assista
215
253
  };
216
254
  }
217
255
 
218
- const subjectResult = await resolveManagedSubject(
219
- handle,
220
- subOpts,
221
- );
256
+ const subjectResult = await resolveManagedSubject(handle, subOpts);
222
257
  if (!subjectResult.ok) {
223
258
  return { ok: false as const, error: subjectResult.error.message };
224
259
  }
@@ -241,7 +276,8 @@ function buildHandlers(sessionIdRef: SessionIdRef, apiKeyRef: ApiKeyRef, assista
241
276
  default:
242
277
  return {
243
278
  ok: false as const,
244
- error: `Handle type "${parseResult.handle.type}" is not supported in managed mode. ` +
279
+ error:
280
+ `Handle type "${parseResult.handle.type}" is not supported in managed mode. ` +
245
281
  `Supported types: platform_oauth.`,
246
282
  };
247
283
  }
@@ -255,7 +291,15 @@ function buildHandlers(sessionIdRef: SessionIdRef, apiKeyRef: ApiKeyRef, assista
255
291
  });
256
292
 
257
293
  // Register manage_secure_command_tool handler
258
- const toolRegistry = new Map<string, { toolName: string; credentialHandle: string; description: string; bundleDigest: string }>();
294
+ const toolRegistry = new Map<
295
+ string,
296
+ {
297
+ toolName: string;
298
+ credentialHandle: string;
299
+ description: string;
300
+ bundleDigest: string;
301
+ }
302
+ >();
259
303
 
260
304
  registerManageSecureCommandToolHandler(handlers, {
261
305
  downloadBundle: async (sourceUrl: string) => {
@@ -264,13 +308,17 @@ function buildHandlers(sessionIdRef: SessionIdRef, apiKeyRef: ApiKeyRef, assista
264
308
  throw new Error(urlError);
265
309
  }
266
310
  const MAX_BUNDLE_SIZE = 100 * 1024 * 1024; // 100 MB
267
- const resp = await fetch(sourceUrl, { signal: AbortSignal.timeout(60_000) });
311
+ const resp = await fetch(sourceUrl, {
312
+ signal: AbortSignal.timeout(60_000),
313
+ });
268
314
  if (!resp.ok) {
269
315
  throw new Error(`HTTP ${resp.status}: ${resp.statusText}`);
270
316
  }
271
317
  const contentLength = resp.headers.get("content-length");
272
318
  if (contentLength && parseInt(contentLength, 10) > MAX_BUNDLE_SIZE) {
273
- throw new Error(`Bundle too large: ${contentLength} bytes (max ${MAX_BUNDLE_SIZE})`);
319
+ throw new Error(
320
+ `Bundle too large: ${contentLength} bytes (max ${MAX_BUNDLE_SIZE})`,
321
+ );
274
322
  }
275
323
  // Stream the body and enforce the size limit on actual bytes received,
276
324
  // since Content-Length can be absent (chunked encoding) or lie.
@@ -283,18 +331,23 @@ function buildHandlers(sessionIdRef: SessionIdRef, apiKeyRef: ApiKeyRef, assista
283
331
  for await (const chunk of body) {
284
332
  totalBytes += chunk.byteLength;
285
333
  if (totalBytes > MAX_BUNDLE_SIZE) {
286
- throw new Error(`Bundle too large: received >${MAX_BUNDLE_SIZE} bytes (max ${MAX_BUNDLE_SIZE})`);
334
+ throw new Error(
335
+ `Bundle too large: received >${MAX_BUNDLE_SIZE} bytes (max ${MAX_BUNDLE_SIZE})`,
336
+ );
287
337
  }
288
338
  chunks.push(chunk);
289
339
  }
290
340
  return Buffer.concat(chunks);
291
341
  },
292
- publishBundle: (request) => publishBundle({ ...request, cesMode: "managed" }),
342
+ publishBundle: (request) =>
343
+ publishBundle({ ...request, cesMode: "managed" }),
293
344
  unregisterTool: (toolName: string) => {
294
345
  const entry = toolRegistry.get(toolName);
295
346
  const removed = toolRegistry.delete(toolName);
296
347
  if (removed && entry?.bundleDigest) {
297
- const stillInUse = Array.from(toolRegistry.values()).some(t => t.bundleDigest === entry.bundleDigest);
348
+ const stillInUse = Array.from(toolRegistry.values()).some(
349
+ (t) => t.bundleDigest === entry.bundleDigest,
350
+ );
298
351
  if (!stillInUse) {
299
352
  deleteBundleFromToolstore(entry.bundleDigest, "managed");
300
353
  }
@@ -310,52 +363,59 @@ function buildHandlers(sessionIdRef: SessionIdRef, apiKeyRef: ApiKeyRef, assista
310
363
  handlers[CesRpcMethod.RecordGrant] = createRecordGrantHandler({
311
364
  persistentGrantStore,
312
365
  temporaryGrantStore,
313
- }) as typeof handlers[string];
366
+ }) as (typeof handlers)[string];
314
367
 
315
368
  handlers[CesRpcMethod.ListGrants] = createListGrantsHandler({
316
369
  persistentGrantStore,
317
- }) as typeof handlers[string];
370
+ }) as (typeof handlers)[string];
318
371
 
319
372
  handlers[CesRpcMethod.RevokeGrant] = createRevokeGrantHandler({
320
373
  persistentGrantStore,
321
- }) as typeof handlers[string];
374
+ }) as (typeof handlers)[string];
322
375
 
323
376
  // Register audit record handler
324
377
  handlers[CesRpcMethod.ListAuditRecords] = createListAuditRecordsHandler({
325
378
  auditStore,
326
- }) as typeof handlers[string];
379
+ }) as (typeof handlers)[string];
327
380
 
328
381
  // Register credential CRUD handlers
329
382
  handlers[CesRpcMethod.GetCredential] = (async (req: { account: string }) => {
330
383
  const value = await secureKeyBackend.get(req.account);
331
384
  return { found: value !== undefined, value };
332
- }) as typeof handlers[string];
385
+ }) as (typeof handlers)[string];
333
386
 
334
- handlers[CesRpcMethod.SetCredential] = (async (req: { account: string; value: string }) => {
387
+ handlers[CesRpcMethod.SetCredential] = (async (req: {
388
+ account: string;
389
+ value: string;
390
+ }) => {
335
391
  const ok = await secureKeyBackend.set(req.account, req.value);
336
392
  return { ok };
337
- }) as typeof handlers[string];
393
+ }) as (typeof handlers)[string];
338
394
 
339
- handlers[CesRpcMethod.DeleteCredential] = (async (req: { account: string }) => {
395
+ handlers[CesRpcMethod.DeleteCredential] = (async (req: {
396
+ account: string;
397
+ }) => {
340
398
  const result = await secureKeyBackend.delete(req.account);
341
399
  return { result };
342
- }) as typeof handlers[string];
400
+ }) as (typeof handlers)[string];
343
401
 
344
402
  handlers[CesRpcMethod.ListCredentials] = (async () => {
345
403
  const accounts = await secureKeyBackend.list();
346
404
  return { accounts };
347
- }) as typeof handlers[string];
405
+ }) as (typeof handlers)[string];
348
406
 
349
- handlers[CesRpcMethod.BulkSetCredentials] = (async (req: { credentials: Array<{ account: string; value: string }> }) => {
407
+ handlers[CesRpcMethod.BulkSetCredentials] = (async (req: {
408
+ credentials: Array<{ account: string; value: string }>;
409
+ }) => {
350
410
  const results = [];
351
411
  for (const { account, value } of req.credentials) {
352
412
  const ok = await secureKeyBackend.set(account, value);
353
413
  results.push({ account, ok });
354
414
  }
355
415
  return { results };
356
- }) as typeof handlers[string];
416
+ }) as (typeof handlers)[string];
357
417
 
358
- return handlers;
418
+ return { handlers, temporaryGrantStore };
359
419
  }
360
420
 
361
421
  // ---------------------------------------------------------------------------
@@ -374,10 +434,9 @@ function startHealthServer(
374
434
  async fetch(req) {
375
435
  const url = new URL(req.url);
376
436
  if (url.pathname === "/healthz") {
377
- return new Response(
378
- JSON.stringify({ status: "ok" }),
379
- { headers: { "Content-Type": "application/json" } },
380
- );
437
+ return new Response(JSON.stringify({ status: "ok" }), {
438
+ headers: { "Content-Type": "application/json" },
439
+ });
381
440
  }
382
441
  if (url.pathname === "/readyz") {
383
442
  // Always return 200 — pod readiness must not depend on whether the
@@ -386,44 +445,56 @@ function startHealthServer(
386
445
  // scheduling during dark-launch. The sidecar can't do useful work
387
446
  // without a connection anyway, so readiness is purely about the
388
447
  // process being up and able to accept a future connection.
389
- return new Response(
390
- JSON.stringify({ status: "ok", rpcConnected }),
391
- {
392
- status: 200,
393
- headers: { "Content-Type": "application/json" },
394
- },
395
- );
448
+ return new Response(JSON.stringify({ status: "ok", rpcConnected }), {
449
+ status: 200,
450
+ headers: { "Content-Type": "application/json" },
451
+ });
396
452
  }
397
453
 
398
454
  // Credential CRUD routes (only if service token is configured)
399
455
  if (credentialDeps) {
400
- const credentialResponse = await handleCredentialRoute(req, credentialDeps);
456
+ const credentialResponse = await handleCredentialRoute(
457
+ req,
458
+ credentialDeps,
459
+ );
401
460
  if (credentialResponse) return credentialResponse;
402
461
  }
403
462
 
404
463
  // Log export route
405
- const logExportResponse = await handleLogExportRoute(req, getCesLogDir("managed"));
464
+ const logExportResponse = await handleLogExportRoute(
465
+ req,
466
+ getCesLogDir("managed"),
467
+ );
406
468
  if (logExportResponse) return logExportResponse;
407
469
 
408
470
  return new Response("Not Found", { status: 404 });
409
471
  },
410
472
  });
411
473
 
412
- signal.addEventListener("abort", () => {
413
- server.stop(true);
414
- }, { once: true });
474
+ signal.addEventListener(
475
+ "abort",
476
+ () => {
477
+ server.stop(true);
478
+ },
479
+ { once: true },
480
+ );
415
481
 
416
482
  return server;
417
483
  }
418
484
 
419
485
  // ---------------------------------------------------------------------------
420
- // Bootstrap socket server (accepts exactly one connection)
486
+ // Bootstrap socket server (accepts one connection at a time)
421
487
  // ---------------------------------------------------------------------------
422
488
 
423
489
  /**
424
- * Listen on a Unix socket, accept exactly one connection, unlink the
425
- * socket path, and return readable/writable streams for the accepted
426
- * connection.
490
+ * Listen on a Unix socket, accept one connection, unlink the socket path,
491
+ * and return readable/writable streams for the accepted connection.
492
+ *
493
+ * The socket is unlinked while a connection is active so no second process
494
+ * can connect concurrently (only one assistant ever talks to CES at a time).
495
+ * When that session ends, the caller re-invokes this function to re-bind the
496
+ * socket and accept the assistant's reconnection — CES outlives any single
497
+ * assistant session (see `main()`).
427
498
  */
428
499
  function acceptOneConnection(
429
500
  socketPath: string,
@@ -456,12 +527,17 @@ function acceptOneConnection(
456
527
  return;
457
528
  }
458
529
 
459
- signal.addEventListener("abort", () => {
530
+ // Remove this listener once the promise settles. Because CES re-binds
531
+ // the socket after each session ends, a long-lived AbortSignal would
532
+ // otherwise accumulate one dangling listener per reconnection.
533
+ const onAbort = () => {
460
534
  cleanup();
461
535
  reject(new Error("Aborted while waiting for connection"));
462
- }, { once: true });
536
+ };
537
+ signal.addEventListener("abort", onAbort, { once: true });
463
538
 
464
539
  netServer.on("error", (err) => {
540
+ signal.removeEventListener("abort", onAbort);
465
541
  cleanup();
466
542
  reject(err);
467
543
  });
@@ -471,8 +547,10 @@ function acceptOneConnection(
471
547
  });
472
548
 
473
549
  netServer.on("connection", (socket: Socket) => {
474
- // Accept exactly one connection, then close the listener and
475
- // unlink the socket path so no other process can connect.
550
+ // Accept the connection, then close the listener and unlink the
551
+ // socket path so no other process can connect while this session
552
+ // is active.
553
+ signal.removeEventListener("abort", onAbort);
476
554
  log.info("Assistant connected via bootstrap socket");
477
555
  netServer.close();
478
556
  try {
@@ -480,7 +558,7 @@ function acceptOneConnection(
480
558
  } catch {
481
559
  // Already unlinked
482
560
  }
483
- log.info("Bootstrap socket unlinked (single-connection enforced)");
561
+ log.info("Bootstrap socket unlinked (single active connection enforced)");
484
562
 
485
563
  const readable = new Readable({
486
564
  read() {
@@ -529,13 +607,22 @@ async function main(): Promise<void> {
529
607
 
530
608
  const controller = new AbortController();
531
609
 
532
- // Graceful shutdown
533
- const shutdown = () => {
534
- log.info("Shutting down...");
535
- controller.abort();
536
- };
537
- process.on("SIGTERM", shutdown);
538
- process.on("SIGINT", shutdown);
610
+ // Graceful shutdown — pass the signal as the abort reason so consumers
611
+ // of controller.signal can inspect signal.reason for triage.
612
+ process.on("SIGTERM", () => {
613
+ log.warn(
614
+ { signal: "SIGTERM", pid: process.pid, uptime: process.uptime() },
615
+ "Received SIGTERM — shutting down",
616
+ );
617
+ controller.abort("SIGTERM");
618
+ });
619
+ process.on("SIGINT", () => {
620
+ log.warn(
621
+ { signal: "SIGINT", pid: process.pid, uptime: process.uptime() },
622
+ "Received SIGINT — shutting down",
623
+ );
624
+ controller.abort("SIGINT");
625
+ });
539
626
 
540
627
  // Create the secure key backend unconditionally — it's needed by both
541
628
  // HTTP credential routes (when CES_SERVICE_TOKEN is set) and RPC
@@ -546,7 +633,11 @@ async function main(): Promise<void> {
546
633
  const secureKeyBackend = createLocalSecureKeyBackend(vellumRoot);
547
634
 
548
635
  // Run one-time credential store migrations before accepting connections.
549
- await runCesMigrations(getCesDataRoot("managed"), secureKeyBackend, CES_MIGRATIONS);
636
+ await runCesMigrations(
637
+ getCesDataRoot("managed"),
638
+ secureKeyBackend,
639
+ CES_MIGRATIONS,
640
+ );
550
641
  log.info("CES managed startup: migrations complete");
551
642
 
552
643
  // Set up credential CRUD routes if a service token is configured.
@@ -565,73 +656,164 @@ async function main(): Promise<void> {
565
656
  );
566
657
  }
567
658
 
568
- // Start health server on dedicated port
659
+ // Start health server on dedicated port. The returned handle isn't
660
+ // needed because the server lifetime is bound to controller.signal,
661
+ // which fires on shutdown and triggers Bun.serve's stop().
569
662
  const healthPort = getHealthPort();
570
- const healthServer = startHealthServer(healthPort, controller.signal, credentialDeps);
663
+ startHealthServer(healthPort, controller.signal, credentialDeps);
571
664
  log.info(`Health server listening on port ${healthPort}`);
572
665
 
573
- // Wait for exactly one assistant connection on the bootstrap socket
574
- const socketPath = getBootstrapSocketPath();
575
- log.info(`Waiting for assistant connection on ${socketPath}...`);
576
-
577
- let connection: Awaited<ReturnType<typeof acceptOneConnection>>;
578
- try {
579
- connection = await acceptOneConnection(socketPath, controller.signal);
580
- } catch (err) {
581
- if (controller.signal.aborted) {
582
- log.info("Shutdown before assistant connected.");
583
- return;
584
- }
585
- throw err;
586
- }
587
-
588
- rpcConnected = true;
589
-
590
- // Build the handler registry with all available RPC implementations.
591
- // Use mutable refs so the handshake-provided session ID and API key
592
- // are available to handlers at call time (after the handshake completes).
666
+ // Build the handler registry once, up front, and reuse it across every
667
+ // assistant session. All CES state lives behind these handlers — file-backed
668
+ // grant/audit stores plus the in-memory temporary-grant store and the
669
+ // secure-command tool registry — and must be process-scoped so it survives
670
+ // an assistant reconnection. In particular, the tool registry mirrors the
671
+ // persistent toolstore on disk; rebuilding it per session would let a later
672
+ // `unregister` miss a tool registered in an earlier session and orphan its
673
+ // bundle.
674
+ //
675
+ // The in-memory temporary-grant store is the exception: `allow_once` /
676
+ // `allow_10m` grants are keyed by proposal hash only (not session), so they
677
+ // would otherwise leak ephemeral approvals across sessions. It is cleared at
678
+ // the end of every session below so a reconnecting assistant must re-prompt.
679
+ //
680
+ // The mutable refs carry the handshake-provided session ID, API key, and
681
+ // assistant ID; handlers read them at call time, so updating the refs when
682
+ // each session's handshake completes is all that's needed per connection.
593
683
  const sessionIdRef: SessionIdRef = { current: `ces-managed-${Date.now()}` };
594
684
  const apiKeyRef: ApiKeyRef = { current: "" };
595
685
  const assistantIdRef: AssistantIdRef = { current: "" };
596
- const handlers = buildHandlers(sessionIdRef, apiKeyRef, assistantIdRef, secureKeyBackend);
686
+ const { handlers, temporaryGrantStore } = buildHandlers(
687
+ sessionIdRef,
688
+ apiKeyRef,
689
+ assistantIdRef,
690
+ secureKeyBackend,
691
+ );
597
692
 
693
+ // Serve loop. CES is a long-lived sidecar that must outlive any single
694
+ // assistant session: the assistant container can crash and be restarted
695
+ // independently of the CES container (Kubernetes restarts containers, not
696
+ // the whole pod), so when the RPC stream ends we re-bind the bootstrap
697
+ // socket and wait for the assistant to reconnect rather than tearing the
698
+ // sidecar down. The loop only exits on a shutdown signal (SIGTERM/SIGINT),
699
+ // which aborts the controller.
598
700
  const rpcLog = getLogger("rpc");
599
- const server = new CesRpcServer({
600
- input: connection.readable,
601
- output: connection.writable,
602
- handlers,
603
- logger: {
604
- log: (msg: string, ...args: unknown[]) => rpcLog.info({ args }, msg),
605
- warn: (msg: string, ...args: unknown[]) => rpcLog.warn({ args }, msg),
606
- error: (msg: string, ...args: unknown[]) => rpcLog.error({ args }, msg),
607
- },
608
- signal: controller.signal,
609
- onHandshakeComplete: (hsSessionId, hsApiKey, hsAssistantId) => {
610
- sessionIdRef.current = hsSessionId;
611
- if (hsApiKey) {
612
- apiKeyRef.current = hsApiKey;
613
- log.info("Received assistant API key via handshake");
614
- }
615
- if (hsAssistantId) {
616
- assistantIdRef.current = hsAssistantId;
617
- log.info("Received assistant ID via handshake");
618
- }
619
- },
620
- onApiKeyUpdate: (newKey, newAssistantId) => {
621
- apiKeyRef.current = newKey;
622
- log.info("Assistant API key updated via RPC");
623
- if (newAssistantId) {
624
- assistantIdRef.current = newAssistantId;
625
- log.info("Assistant ID updated via RPC");
701
+ const socketPath = getBootstrapSocketPath();
702
+
703
+ while (!controller.signal.aborted) {
704
+ log.info(`Waiting for assistant connection on ${socketPath}...`);
705
+
706
+ let connection: Awaited<ReturnType<typeof acceptOneConnection>>;
707
+ try {
708
+ connection = await acceptOneConnection(socketPath, controller.signal);
709
+ } catch (err) {
710
+ if (controller.signal.aborted) {
711
+ log.info("Shutdown before assistant connected.");
712
+ return;
626
713
  }
627
- },
628
- });
714
+ throw err;
715
+ }
716
+
717
+ rpcConnected = true;
629
718
 
630
- await server.serve();
719
+ const server = new CesRpcServer({
720
+ input: connection.readable,
721
+ output: connection.writable,
722
+ handlers,
723
+ logger: {
724
+ log: (msg: string, ...args: unknown[]) => rpcLog.info({ args }, msg),
725
+ warn: (msg: string, ...args: unknown[]) => rpcLog.warn({ args }, msg),
726
+ error: (msg: string, ...args: unknown[]) => rpcLog.error({ args }, msg),
727
+ },
728
+ signal: controller.signal,
729
+ onHandshakeComplete: (hsSessionId, hsApiKey, hsAssistantId) => {
730
+ sessionIdRef.current = hsSessionId;
731
+ // Overwrite the credential refs on every handshake. The handler
732
+ // registry persists across reconnects, so a new session that omits
733
+ // the API key / assistant ID must fail closed (falling back to the
734
+ // env key, or no key) rather than reusing the previous session's
735
+ // credentials.
736
+ applyManagedCredentialRefs(
737
+ apiKeyRef,
738
+ assistantIdRef,
739
+ hsApiKey,
740
+ hsAssistantId,
741
+ );
742
+ if (hsApiKey) {
743
+ log.info("Received assistant API key via handshake");
744
+ }
745
+ if (hsAssistantId) {
746
+ log.info("Received assistant ID via handshake");
747
+ }
748
+ },
749
+ onApiKeyUpdate: (newKey, newAssistantId) => {
750
+ // Overwrite both refs on every credential update, for the same
751
+ // fail-closed reason as the handshake: the assistant sources the
752
+ // assistant ID from the same place it sources the key, so an update
753
+ // that omits the ID means it has none — CES must clear the stale ID
754
+ // rather than keep materializing for the previous session's assistant.
755
+ applyManagedCredentialRefs(
756
+ apiKeyRef,
757
+ assistantIdRef,
758
+ newKey,
759
+ newAssistantId,
760
+ );
761
+ log.info("Assistant API key updated via RPC");
762
+ if (newAssistantId) {
763
+ log.info("Assistant ID updated via RPC");
764
+ }
765
+ },
766
+ });
631
767
 
632
- rpcConnected = false;
633
- log.info("RPC session ended. Shutting down...");
634
- controller.abort();
768
+ // `serve()` resolves on a clean stream end or signal abort, and rejects
769
+ // when the transport stream errors — which is precisely what a hard
770
+ // disconnect (connection reset when the assistant container crashes)
771
+ // looks like. Both cases must keep the sidecar up; only a shutdown
772
+ // signal should tear it down. So treat a serve() rejection the same as
773
+ // a session end and fall through to await reconnection.
774
+ let endReason: ServeEndReason | "transport_error";
775
+ try {
776
+ endReason = await server.serve();
777
+ } catch (err) {
778
+ server.close();
779
+ endReason = "transport_error";
780
+ log.warn(
781
+ { err, uptime: process.uptime(), pid: process.pid },
782
+ "RPC transport errored — treating as session end",
783
+ );
784
+ }
785
+
786
+ rpcConnected = false;
787
+
788
+ // Drop all ephemeral approvals when the session ends. `allow_once` /
789
+ // `allow_10m` grants are keyed by proposal hash only, so reusing the
790
+ // store across a reconnect would let a pre-disconnect approval be
791
+ // consumed by a later session without re-prompting. Clearing here
792
+ // restores the prior behavior, where the process exited on stream end
793
+ // and these grants never survived.
794
+ temporaryGrantStore.clear();
795
+
796
+ // A signal-driven end means the process is shutting down; exit the loop.
797
+ // Any other end reason (the assistant disconnected, its stream closed,
798
+ // or the transport errored) means we keep the sidecar up and await a
799
+ // reconnection.
800
+ if (
801
+ controller.signal.aborted ||
802
+ endReason === "signal_aborted" ||
803
+ endReason === "signal_aborted_before_start"
804
+ ) {
805
+ log.info(
806
+ { reason: endReason, uptime: process.uptime(), pid: process.pid },
807
+ "RPC session ended due to shutdown — exiting serve loop",
808
+ );
809
+ break;
810
+ }
811
+
812
+ log.warn(
813
+ { reason: endReason, uptime: process.uptime(), pid: process.pid },
814
+ "RPC session ended (assistant disconnected) — awaiting reconnection",
815
+ );
816
+ }
635
817
  }
636
818
 
637
819
  main().catch((err) => {