@vellumai/credential-executor 0.8.4 → 0.8.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bun.lock +279 -0
- package/eslint.config.mjs +23 -0
- package/knip.json +9 -0
- package/package.json +7 -2
- package/src/__tests__/ces-migrations-runner.test.ts +3 -3
- package/src/__tests__/command-executor.test.ts +5 -12
- package/src/__tests__/command-workspace.test.ts +0 -1
- package/src/__tests__/http-executor.test.ts +2 -7
- package/src/__tests__/managed-integration.test.ts +9 -10
- package/src/__tests__/managed-lazy-getters.test.ts +65 -0
- package/src/__tests__/managed-materializers.test.ts +1 -3
- package/src/__tests__/managed-reconnect.test.ts +244 -0
- package/src/__tests__/toolstore.test.ts +1 -1
- package/src/__tests__/transport.test.ts +8 -8
- package/src/managed-lazy-getters.ts +28 -4
- package/src/managed-main.ts +315 -133
- package/src/server.ts +11 -6
package/src/managed-main.ts
CHANGED
|
@@ -6,11 +6,15 @@
|
|
|
6
6
|
*
|
|
7
7
|
* 1. Ensures the CES-private data directories exist.
|
|
8
8
|
* 2. Binds a bootstrap Unix socket on the shared bootstrap volume.
|
|
9
|
-
* 3. Accepts
|
|
9
|
+
* 3. Accepts a single assistant runtime connection.
|
|
10
10
|
* 4. Unlinks the socket path immediately after the connection is accepted,
|
|
11
|
-
* preventing any second process from connecting.
|
|
11
|
+
* preventing any second process from connecting while the session is live.
|
|
12
12
|
* 5. Serves RPC on the accepted stream only.
|
|
13
|
-
* 6.
|
|
13
|
+
* 6. When that session ends (the assistant disconnects or its container is
|
|
14
|
+
* restarted), re-binds the socket and awaits a reconnection. CES is a
|
|
15
|
+
* long-lived sidecar — it outlives any single assistant session and only
|
|
16
|
+
* shuts down on SIGTERM/SIGINT. At most one connection is ever active.
|
|
17
|
+
* 7. Simultaneously serves health probes (`/healthz`, `/readyz`) on a
|
|
14
18
|
* dedicated HTTP port for Kubernetes liveness/readiness checks.
|
|
15
19
|
*
|
|
16
20
|
* The managed entrypoint never opens a generic TCP or HTTP command API.
|
|
@@ -22,7 +26,10 @@ import { createServer as createNetServer, type Socket } from "node:net";
|
|
|
22
26
|
import { dirname, join } from "node:path";
|
|
23
27
|
import { Readable, Writable } from "node:stream";
|
|
24
28
|
|
|
25
|
-
import {
|
|
29
|
+
import {
|
|
30
|
+
CES_PROTOCOL_VERSION,
|
|
31
|
+
CesRpcMethod,
|
|
32
|
+
} from "@vellumai/service-contracts/credential-rpc";
|
|
26
33
|
|
|
27
34
|
import { AuditStore } from "./audit/store.js";
|
|
28
35
|
import { PersistentGrantStore } from "./grants/persistent-store.js";
|
|
@@ -49,19 +56,36 @@ import {
|
|
|
49
56
|
registerCommandExecutionHandler,
|
|
50
57
|
registerManageSecureCommandToolHandler,
|
|
51
58
|
type RpcHandlerRegistry,
|
|
59
|
+
type ServeEndReason,
|
|
52
60
|
type SessionIdRef,
|
|
53
61
|
} from "./server.js";
|
|
54
|
-
import {
|
|
62
|
+
import {
|
|
63
|
+
deleteBundleFromToolstore,
|
|
64
|
+
publishBundle,
|
|
65
|
+
} from "./toolstore/publish.js";
|
|
55
66
|
import { validateSourceUrl } from "./toolstore/manifest.js";
|
|
56
67
|
import { buildCesEgressHooks } from "./commands/egress-hooks.js";
|
|
57
68
|
import { resolveManagedSubject } from "./subjects/managed.js";
|
|
58
69
|
import { materializeManagedToken } from "./materializers/managed-platform.js";
|
|
59
|
-
import {
|
|
60
|
-
|
|
70
|
+
import {
|
|
71
|
+
HandleType,
|
|
72
|
+
parseHandle,
|
|
73
|
+
} from "@vellumai/service-contracts/credential-rpc";
|
|
74
|
+
import {
|
|
75
|
+
applyManagedCredentialRefs,
|
|
76
|
+
buildLazyGetters,
|
|
77
|
+
type ApiKeyRef,
|
|
78
|
+
type AssistantIdRef,
|
|
79
|
+
} from "./managed-lazy-getters.js";
|
|
61
80
|
import { MANAGED_LOCAL_STATIC_REJECTION_ERROR } from "./managed-errors.js";
|
|
62
81
|
import type { SecureKeyBackend } from "@vellumai/credential-storage";
|
|
63
82
|
import { createLocalSecureKeyBackend } from "./materializers/local-secure-key-backend.js";
|
|
64
|
-
import {
|
|
83
|
+
import type { LocalMaterialiser } from "./materializers/local.js";
|
|
84
|
+
import type { LocalSubjectResolverDeps } from "./subjects/local.js";
|
|
85
|
+
import {
|
|
86
|
+
handleCredentialRoute,
|
|
87
|
+
type CredentialRouteDeps,
|
|
88
|
+
} from "./http/credential-routes.js";
|
|
65
89
|
import { handleLogExportRoute } from "./http/log-export-routes.js";
|
|
66
90
|
import { CES_MIGRATIONS } from "./migrations/registry.js";
|
|
67
91
|
import { runCesMigrations } from "./migrations/runner.js";
|
|
@@ -95,7 +119,12 @@ function ensureDataDirs(): void {
|
|
|
95
119
|
// Build RPC handler registry (managed mode)
|
|
96
120
|
// ---------------------------------------------------------------------------
|
|
97
121
|
|
|
98
|
-
function buildHandlers(
|
|
122
|
+
function buildHandlers(
|
|
123
|
+
sessionIdRef: SessionIdRef,
|
|
124
|
+
apiKeyRef: ApiKeyRef,
|
|
125
|
+
assistantIdRef: AssistantIdRef,
|
|
126
|
+
secureKeyBackend: SecureKeyBackend,
|
|
127
|
+
): { handlers: RpcHandlerRegistry; temporaryGrantStore: TemporaryGrantStore } {
|
|
99
128
|
// -- Grant stores ----------------------------------------------------------
|
|
100
129
|
const persistentGrantStore = new PersistentGrantStore(
|
|
101
130
|
getCesGrantsDir("managed"),
|
|
@@ -117,7 +146,7 @@ function buildHandlers(sessionIdRef: SessionIdRef, apiKeyRef: ApiKeyRef, assista
|
|
|
117
146
|
// though handlers are built before the handshake completes.
|
|
118
147
|
const platformBaseUrl = process.env["VELLUM_PLATFORM_URL"] ?? "";
|
|
119
148
|
|
|
120
|
-
const {
|
|
149
|
+
const { getManagedSubjectOptions, getManagedMaterializerOptions } =
|
|
121
150
|
buildLazyGetters({
|
|
122
151
|
platformBaseUrl,
|
|
123
152
|
assistantIdRef,
|
|
@@ -135,11 +164,13 @@ function buildHandlers(sessionIdRef: SessionIdRef, apiKeyRef: ApiKeyRef, assista
|
|
|
135
164
|
// -- Workspace root for command execution cwd ------------------------------
|
|
136
165
|
// Use VELLUM_WORKSPACE_DIR when set, otherwise fall back to the legacy
|
|
137
166
|
// path derived from the assistant data mount.
|
|
138
|
-
const defaultWorkspaceDir =
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
167
|
+
const defaultWorkspaceDir =
|
|
168
|
+
process.env["VELLUM_WORKSPACE_DIR"] ??
|
|
169
|
+
(() => {
|
|
170
|
+
const assistantDataMount =
|
|
171
|
+
process.env["CES_ASSISTANT_DATA_MOUNT"] ?? "/assistant-data-ro";
|
|
172
|
+
return join(join(assistantDataMount, ".vellum"), "workspace");
|
|
173
|
+
})();
|
|
143
174
|
|
|
144
175
|
// -- Build handler registry ------------------------------------------------
|
|
145
176
|
// NOTE: local_static credential handles are NOT supported in managed mode.
|
|
@@ -161,8 +192,11 @@ function buildHandlers(sessionIdRef: SessionIdRef, apiKeyRef: ApiKeyRef, assista
|
|
|
161
192
|
}),
|
|
162
193
|
};
|
|
163
194
|
|
|
164
|
-
const localSubjectDepsStub = {
|
|
165
|
-
metadataStore: {
|
|
195
|
+
const localSubjectDepsStub: LocalSubjectResolverDeps = {
|
|
196
|
+
metadataStore: {
|
|
197
|
+
getById: () => undefined,
|
|
198
|
+
list: () => [],
|
|
199
|
+
} as unknown as LocalSubjectResolverDeps["metadataStore"],
|
|
166
200
|
oauthConnections: { getById: () => undefined },
|
|
167
201
|
};
|
|
168
202
|
|
|
@@ -171,10 +205,14 @@ function buildHandlers(sessionIdRef: SessionIdRef, apiKeyRef: ApiKeyRef, assista
|
|
|
171
205
|
const httpDeps = {
|
|
172
206
|
persistentGrantStore,
|
|
173
207
|
temporaryGrantStore,
|
|
174
|
-
localMaterialiser: localMaterialiserStub as
|
|
208
|
+
localMaterialiser: localMaterialiserStub as unknown as LocalMaterialiser,
|
|
175
209
|
localSubjectDeps: localSubjectDepsStub,
|
|
176
|
-
get managedSubjectOptions() {
|
|
177
|
-
|
|
210
|
+
get managedSubjectOptions() {
|
|
211
|
+
return getManagedSubjectOptions();
|
|
212
|
+
},
|
|
213
|
+
get managedMaterializerOptions() {
|
|
214
|
+
return getManagedMaterializerOptions();
|
|
215
|
+
},
|
|
178
216
|
auditStore,
|
|
179
217
|
sessionId: sessionIdRef,
|
|
180
218
|
};
|
|
@@ -215,10 +253,7 @@ function buildHandlers(sessionIdRef: SessionIdRef, apiKeyRef: ApiKeyRef, assista
|
|
|
215
253
|
};
|
|
216
254
|
}
|
|
217
255
|
|
|
218
|
-
const subjectResult = await resolveManagedSubject(
|
|
219
|
-
handle,
|
|
220
|
-
subOpts,
|
|
221
|
-
);
|
|
256
|
+
const subjectResult = await resolveManagedSubject(handle, subOpts);
|
|
222
257
|
if (!subjectResult.ok) {
|
|
223
258
|
return { ok: false as const, error: subjectResult.error.message };
|
|
224
259
|
}
|
|
@@ -241,7 +276,8 @@ function buildHandlers(sessionIdRef: SessionIdRef, apiKeyRef: ApiKeyRef, assista
|
|
|
241
276
|
default:
|
|
242
277
|
return {
|
|
243
278
|
ok: false as const,
|
|
244
|
-
error:
|
|
279
|
+
error:
|
|
280
|
+
`Handle type "${parseResult.handle.type}" is not supported in managed mode. ` +
|
|
245
281
|
`Supported types: platform_oauth.`,
|
|
246
282
|
};
|
|
247
283
|
}
|
|
@@ -255,7 +291,15 @@ function buildHandlers(sessionIdRef: SessionIdRef, apiKeyRef: ApiKeyRef, assista
|
|
|
255
291
|
});
|
|
256
292
|
|
|
257
293
|
// Register manage_secure_command_tool handler
|
|
258
|
-
const toolRegistry = new Map<
|
|
294
|
+
const toolRegistry = new Map<
|
|
295
|
+
string,
|
|
296
|
+
{
|
|
297
|
+
toolName: string;
|
|
298
|
+
credentialHandle: string;
|
|
299
|
+
description: string;
|
|
300
|
+
bundleDigest: string;
|
|
301
|
+
}
|
|
302
|
+
>();
|
|
259
303
|
|
|
260
304
|
registerManageSecureCommandToolHandler(handlers, {
|
|
261
305
|
downloadBundle: async (sourceUrl: string) => {
|
|
@@ -264,13 +308,17 @@ function buildHandlers(sessionIdRef: SessionIdRef, apiKeyRef: ApiKeyRef, assista
|
|
|
264
308
|
throw new Error(urlError);
|
|
265
309
|
}
|
|
266
310
|
const MAX_BUNDLE_SIZE = 100 * 1024 * 1024; // 100 MB
|
|
267
|
-
const resp = await fetch(sourceUrl, {
|
|
311
|
+
const resp = await fetch(sourceUrl, {
|
|
312
|
+
signal: AbortSignal.timeout(60_000),
|
|
313
|
+
});
|
|
268
314
|
if (!resp.ok) {
|
|
269
315
|
throw new Error(`HTTP ${resp.status}: ${resp.statusText}`);
|
|
270
316
|
}
|
|
271
317
|
const contentLength = resp.headers.get("content-length");
|
|
272
318
|
if (contentLength && parseInt(contentLength, 10) > MAX_BUNDLE_SIZE) {
|
|
273
|
-
throw new Error(
|
|
319
|
+
throw new Error(
|
|
320
|
+
`Bundle too large: ${contentLength} bytes (max ${MAX_BUNDLE_SIZE})`,
|
|
321
|
+
);
|
|
274
322
|
}
|
|
275
323
|
// Stream the body and enforce the size limit on actual bytes received,
|
|
276
324
|
// since Content-Length can be absent (chunked encoding) or lie.
|
|
@@ -283,18 +331,23 @@ function buildHandlers(sessionIdRef: SessionIdRef, apiKeyRef: ApiKeyRef, assista
|
|
|
283
331
|
for await (const chunk of body) {
|
|
284
332
|
totalBytes += chunk.byteLength;
|
|
285
333
|
if (totalBytes > MAX_BUNDLE_SIZE) {
|
|
286
|
-
throw new Error(
|
|
334
|
+
throw new Error(
|
|
335
|
+
`Bundle too large: received >${MAX_BUNDLE_SIZE} bytes (max ${MAX_BUNDLE_SIZE})`,
|
|
336
|
+
);
|
|
287
337
|
}
|
|
288
338
|
chunks.push(chunk);
|
|
289
339
|
}
|
|
290
340
|
return Buffer.concat(chunks);
|
|
291
341
|
},
|
|
292
|
-
publishBundle: (request) =>
|
|
342
|
+
publishBundle: (request) =>
|
|
343
|
+
publishBundle({ ...request, cesMode: "managed" }),
|
|
293
344
|
unregisterTool: (toolName: string) => {
|
|
294
345
|
const entry = toolRegistry.get(toolName);
|
|
295
346
|
const removed = toolRegistry.delete(toolName);
|
|
296
347
|
if (removed && entry?.bundleDigest) {
|
|
297
|
-
const stillInUse = Array.from(toolRegistry.values()).some(
|
|
348
|
+
const stillInUse = Array.from(toolRegistry.values()).some(
|
|
349
|
+
(t) => t.bundleDigest === entry.bundleDigest,
|
|
350
|
+
);
|
|
298
351
|
if (!stillInUse) {
|
|
299
352
|
deleteBundleFromToolstore(entry.bundleDigest, "managed");
|
|
300
353
|
}
|
|
@@ -310,52 +363,59 @@ function buildHandlers(sessionIdRef: SessionIdRef, apiKeyRef: ApiKeyRef, assista
|
|
|
310
363
|
handlers[CesRpcMethod.RecordGrant] = createRecordGrantHandler({
|
|
311
364
|
persistentGrantStore,
|
|
312
365
|
temporaryGrantStore,
|
|
313
|
-
}) as typeof handlers[string];
|
|
366
|
+
}) as (typeof handlers)[string];
|
|
314
367
|
|
|
315
368
|
handlers[CesRpcMethod.ListGrants] = createListGrantsHandler({
|
|
316
369
|
persistentGrantStore,
|
|
317
|
-
}) as typeof handlers[string];
|
|
370
|
+
}) as (typeof handlers)[string];
|
|
318
371
|
|
|
319
372
|
handlers[CesRpcMethod.RevokeGrant] = createRevokeGrantHandler({
|
|
320
373
|
persistentGrantStore,
|
|
321
|
-
}) as typeof handlers[string];
|
|
374
|
+
}) as (typeof handlers)[string];
|
|
322
375
|
|
|
323
376
|
// Register audit record handler
|
|
324
377
|
handlers[CesRpcMethod.ListAuditRecords] = createListAuditRecordsHandler({
|
|
325
378
|
auditStore,
|
|
326
|
-
}) as typeof handlers[string];
|
|
379
|
+
}) as (typeof handlers)[string];
|
|
327
380
|
|
|
328
381
|
// Register credential CRUD handlers
|
|
329
382
|
handlers[CesRpcMethod.GetCredential] = (async (req: { account: string }) => {
|
|
330
383
|
const value = await secureKeyBackend.get(req.account);
|
|
331
384
|
return { found: value !== undefined, value };
|
|
332
|
-
}) as typeof handlers[string];
|
|
385
|
+
}) as (typeof handlers)[string];
|
|
333
386
|
|
|
334
|
-
handlers[CesRpcMethod.SetCredential] = (async (req: {
|
|
387
|
+
handlers[CesRpcMethod.SetCredential] = (async (req: {
|
|
388
|
+
account: string;
|
|
389
|
+
value: string;
|
|
390
|
+
}) => {
|
|
335
391
|
const ok = await secureKeyBackend.set(req.account, req.value);
|
|
336
392
|
return { ok };
|
|
337
|
-
}) as typeof handlers[string];
|
|
393
|
+
}) as (typeof handlers)[string];
|
|
338
394
|
|
|
339
|
-
handlers[CesRpcMethod.DeleteCredential] = (async (req: {
|
|
395
|
+
handlers[CesRpcMethod.DeleteCredential] = (async (req: {
|
|
396
|
+
account: string;
|
|
397
|
+
}) => {
|
|
340
398
|
const result = await secureKeyBackend.delete(req.account);
|
|
341
399
|
return { result };
|
|
342
|
-
}) as typeof handlers[string];
|
|
400
|
+
}) as (typeof handlers)[string];
|
|
343
401
|
|
|
344
402
|
handlers[CesRpcMethod.ListCredentials] = (async () => {
|
|
345
403
|
const accounts = await secureKeyBackend.list();
|
|
346
404
|
return { accounts };
|
|
347
|
-
}) as typeof handlers[string];
|
|
405
|
+
}) as (typeof handlers)[string];
|
|
348
406
|
|
|
349
|
-
handlers[CesRpcMethod.BulkSetCredentials] = (async (req: {
|
|
407
|
+
handlers[CesRpcMethod.BulkSetCredentials] = (async (req: {
|
|
408
|
+
credentials: Array<{ account: string; value: string }>;
|
|
409
|
+
}) => {
|
|
350
410
|
const results = [];
|
|
351
411
|
for (const { account, value } of req.credentials) {
|
|
352
412
|
const ok = await secureKeyBackend.set(account, value);
|
|
353
413
|
results.push({ account, ok });
|
|
354
414
|
}
|
|
355
415
|
return { results };
|
|
356
|
-
}) as typeof handlers[string];
|
|
416
|
+
}) as (typeof handlers)[string];
|
|
357
417
|
|
|
358
|
-
return handlers;
|
|
418
|
+
return { handlers, temporaryGrantStore };
|
|
359
419
|
}
|
|
360
420
|
|
|
361
421
|
// ---------------------------------------------------------------------------
|
|
@@ -374,10 +434,9 @@ function startHealthServer(
|
|
|
374
434
|
async fetch(req) {
|
|
375
435
|
const url = new URL(req.url);
|
|
376
436
|
if (url.pathname === "/healthz") {
|
|
377
|
-
return new Response(
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
);
|
|
437
|
+
return new Response(JSON.stringify({ status: "ok" }), {
|
|
438
|
+
headers: { "Content-Type": "application/json" },
|
|
439
|
+
});
|
|
381
440
|
}
|
|
382
441
|
if (url.pathname === "/readyz") {
|
|
383
442
|
// Always return 200 — pod readiness must not depend on whether the
|
|
@@ -386,44 +445,56 @@ function startHealthServer(
|
|
|
386
445
|
// scheduling during dark-launch. The sidecar can't do useful work
|
|
387
446
|
// without a connection anyway, so readiness is purely about the
|
|
388
447
|
// process being up and able to accept a future connection.
|
|
389
|
-
return new Response(
|
|
390
|
-
|
|
391
|
-
{
|
|
392
|
-
|
|
393
|
-
headers: { "Content-Type": "application/json" },
|
|
394
|
-
},
|
|
395
|
-
);
|
|
448
|
+
return new Response(JSON.stringify({ status: "ok", rpcConnected }), {
|
|
449
|
+
status: 200,
|
|
450
|
+
headers: { "Content-Type": "application/json" },
|
|
451
|
+
});
|
|
396
452
|
}
|
|
397
453
|
|
|
398
454
|
// Credential CRUD routes (only if service token is configured)
|
|
399
455
|
if (credentialDeps) {
|
|
400
|
-
const credentialResponse = await handleCredentialRoute(
|
|
456
|
+
const credentialResponse = await handleCredentialRoute(
|
|
457
|
+
req,
|
|
458
|
+
credentialDeps,
|
|
459
|
+
);
|
|
401
460
|
if (credentialResponse) return credentialResponse;
|
|
402
461
|
}
|
|
403
462
|
|
|
404
463
|
// Log export route
|
|
405
|
-
const logExportResponse = await handleLogExportRoute(
|
|
464
|
+
const logExportResponse = await handleLogExportRoute(
|
|
465
|
+
req,
|
|
466
|
+
getCesLogDir("managed"),
|
|
467
|
+
);
|
|
406
468
|
if (logExportResponse) return logExportResponse;
|
|
407
469
|
|
|
408
470
|
return new Response("Not Found", { status: 404 });
|
|
409
471
|
},
|
|
410
472
|
});
|
|
411
473
|
|
|
412
|
-
signal.addEventListener(
|
|
413
|
-
|
|
414
|
-
|
|
474
|
+
signal.addEventListener(
|
|
475
|
+
"abort",
|
|
476
|
+
() => {
|
|
477
|
+
server.stop(true);
|
|
478
|
+
},
|
|
479
|
+
{ once: true },
|
|
480
|
+
);
|
|
415
481
|
|
|
416
482
|
return server;
|
|
417
483
|
}
|
|
418
484
|
|
|
419
485
|
// ---------------------------------------------------------------------------
|
|
420
|
-
// Bootstrap socket server (accepts
|
|
486
|
+
// Bootstrap socket server (accepts one connection at a time)
|
|
421
487
|
// ---------------------------------------------------------------------------
|
|
422
488
|
|
|
423
489
|
/**
|
|
424
|
-
* Listen on a Unix socket, accept
|
|
425
|
-
*
|
|
426
|
-
*
|
|
490
|
+
* Listen on a Unix socket, accept one connection, unlink the socket path,
|
|
491
|
+
* and return readable/writable streams for the accepted connection.
|
|
492
|
+
*
|
|
493
|
+
* The socket is unlinked while a connection is active so no second process
|
|
494
|
+
* can connect concurrently (only one assistant ever talks to CES at a time).
|
|
495
|
+
* When that session ends, the caller re-invokes this function to re-bind the
|
|
496
|
+
* socket and accept the assistant's reconnection — CES outlives any single
|
|
497
|
+
* assistant session (see `main()`).
|
|
427
498
|
*/
|
|
428
499
|
function acceptOneConnection(
|
|
429
500
|
socketPath: string,
|
|
@@ -456,12 +527,17 @@ function acceptOneConnection(
|
|
|
456
527
|
return;
|
|
457
528
|
}
|
|
458
529
|
|
|
459
|
-
|
|
530
|
+
// Remove this listener once the promise settles. Because CES re-binds
|
|
531
|
+
// the socket after each session ends, a long-lived AbortSignal would
|
|
532
|
+
// otherwise accumulate one dangling listener per reconnection.
|
|
533
|
+
const onAbort = () => {
|
|
460
534
|
cleanup();
|
|
461
535
|
reject(new Error("Aborted while waiting for connection"));
|
|
462
|
-
}
|
|
536
|
+
};
|
|
537
|
+
signal.addEventListener("abort", onAbort, { once: true });
|
|
463
538
|
|
|
464
539
|
netServer.on("error", (err) => {
|
|
540
|
+
signal.removeEventListener("abort", onAbort);
|
|
465
541
|
cleanup();
|
|
466
542
|
reject(err);
|
|
467
543
|
});
|
|
@@ -471,8 +547,10 @@ function acceptOneConnection(
|
|
|
471
547
|
});
|
|
472
548
|
|
|
473
549
|
netServer.on("connection", (socket: Socket) => {
|
|
474
|
-
// Accept
|
|
475
|
-
//
|
|
550
|
+
// Accept the connection, then close the listener and unlink the
|
|
551
|
+
// socket path so no other process can connect while this session
|
|
552
|
+
// is active.
|
|
553
|
+
signal.removeEventListener("abort", onAbort);
|
|
476
554
|
log.info("Assistant connected via bootstrap socket");
|
|
477
555
|
netServer.close();
|
|
478
556
|
try {
|
|
@@ -480,7 +558,7 @@ function acceptOneConnection(
|
|
|
480
558
|
} catch {
|
|
481
559
|
// Already unlinked
|
|
482
560
|
}
|
|
483
|
-
log.info("Bootstrap socket unlinked (single
|
|
561
|
+
log.info("Bootstrap socket unlinked (single active connection enforced)");
|
|
484
562
|
|
|
485
563
|
const readable = new Readable({
|
|
486
564
|
read() {
|
|
@@ -529,13 +607,22 @@ async function main(): Promise<void> {
|
|
|
529
607
|
|
|
530
608
|
const controller = new AbortController();
|
|
531
609
|
|
|
532
|
-
// Graceful shutdown
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
610
|
+
// Graceful shutdown — pass the signal as the abort reason so consumers
|
|
611
|
+
// of controller.signal can inspect signal.reason for triage.
|
|
612
|
+
process.on("SIGTERM", () => {
|
|
613
|
+
log.warn(
|
|
614
|
+
{ signal: "SIGTERM", pid: process.pid, uptime: process.uptime() },
|
|
615
|
+
"Received SIGTERM — shutting down",
|
|
616
|
+
);
|
|
617
|
+
controller.abort("SIGTERM");
|
|
618
|
+
});
|
|
619
|
+
process.on("SIGINT", () => {
|
|
620
|
+
log.warn(
|
|
621
|
+
{ signal: "SIGINT", pid: process.pid, uptime: process.uptime() },
|
|
622
|
+
"Received SIGINT — shutting down",
|
|
623
|
+
);
|
|
624
|
+
controller.abort("SIGINT");
|
|
625
|
+
});
|
|
539
626
|
|
|
540
627
|
// Create the secure key backend unconditionally — it's needed by both
|
|
541
628
|
// HTTP credential routes (when CES_SERVICE_TOKEN is set) and RPC
|
|
@@ -546,7 +633,11 @@ async function main(): Promise<void> {
|
|
|
546
633
|
const secureKeyBackend = createLocalSecureKeyBackend(vellumRoot);
|
|
547
634
|
|
|
548
635
|
// Run one-time credential store migrations before accepting connections.
|
|
549
|
-
await runCesMigrations(
|
|
636
|
+
await runCesMigrations(
|
|
637
|
+
getCesDataRoot("managed"),
|
|
638
|
+
secureKeyBackend,
|
|
639
|
+
CES_MIGRATIONS,
|
|
640
|
+
);
|
|
550
641
|
log.info("CES managed startup: migrations complete");
|
|
551
642
|
|
|
552
643
|
// Set up credential CRUD routes if a service token is configured.
|
|
@@ -565,73 +656,164 @@ async function main(): Promise<void> {
|
|
|
565
656
|
);
|
|
566
657
|
}
|
|
567
658
|
|
|
568
|
-
// Start health server on dedicated port
|
|
659
|
+
// Start health server on dedicated port. The returned handle isn't
|
|
660
|
+
// needed because the server lifetime is bound to controller.signal,
|
|
661
|
+
// which fires on shutdown and triggers Bun.serve's stop().
|
|
569
662
|
const healthPort = getHealthPort();
|
|
570
|
-
|
|
663
|
+
startHealthServer(healthPort, controller.signal, credentialDeps);
|
|
571
664
|
log.info(`Health server listening on port ${healthPort}`);
|
|
572
665
|
|
|
573
|
-
//
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
// Build the handler registry with all available RPC implementations.
|
|
591
|
-
// Use mutable refs so the handshake-provided session ID and API key
|
|
592
|
-
// are available to handlers at call time (after the handshake completes).
|
|
666
|
+
// Build the handler registry once, up front, and reuse it across every
|
|
667
|
+
// assistant session. All CES state lives behind these handlers — file-backed
|
|
668
|
+
// grant/audit stores plus the in-memory temporary-grant store and the
|
|
669
|
+
// secure-command tool registry — and must be process-scoped so it survives
|
|
670
|
+
// an assistant reconnection. In particular, the tool registry mirrors the
|
|
671
|
+
// persistent toolstore on disk; rebuilding it per session would let a later
|
|
672
|
+
// `unregister` miss a tool registered in an earlier session and orphan its
|
|
673
|
+
// bundle.
|
|
674
|
+
//
|
|
675
|
+
// The in-memory temporary-grant store is the exception: `allow_once` /
|
|
676
|
+
// `allow_10m` grants are keyed by proposal hash only (not session), so they
|
|
677
|
+
// would otherwise leak ephemeral approvals across sessions. It is cleared at
|
|
678
|
+
// the end of every session below so a reconnecting assistant must re-prompt.
|
|
679
|
+
//
|
|
680
|
+
// The mutable refs carry the handshake-provided session ID, API key, and
|
|
681
|
+
// assistant ID; handlers read them at call time, so updating the refs when
|
|
682
|
+
// each session's handshake completes is all that's needed per connection.
|
|
593
683
|
const sessionIdRef: SessionIdRef = { current: `ces-managed-${Date.now()}` };
|
|
594
684
|
const apiKeyRef: ApiKeyRef = { current: "" };
|
|
595
685
|
const assistantIdRef: AssistantIdRef = { current: "" };
|
|
596
|
-
const handlers = buildHandlers(
|
|
686
|
+
const { handlers, temporaryGrantStore } = buildHandlers(
|
|
687
|
+
sessionIdRef,
|
|
688
|
+
apiKeyRef,
|
|
689
|
+
assistantIdRef,
|
|
690
|
+
secureKeyBackend,
|
|
691
|
+
);
|
|
597
692
|
|
|
693
|
+
// Serve loop. CES is a long-lived sidecar that must outlive any single
|
|
694
|
+
// assistant session: the assistant container can crash and be restarted
|
|
695
|
+
// independently of the CES container (Kubernetes restarts containers, not
|
|
696
|
+
// the whole pod), so when the RPC stream ends we re-bind the bootstrap
|
|
697
|
+
// socket and wait for the assistant to reconnect rather than tearing the
|
|
698
|
+
// sidecar down. The loop only exits on a shutdown signal (SIGTERM/SIGINT),
|
|
699
|
+
// which aborts the controller.
|
|
598
700
|
const rpcLog = getLogger("rpc");
|
|
599
|
-
const
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
}
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
if (hsApiKey) {
|
|
612
|
-
apiKeyRef.current = hsApiKey;
|
|
613
|
-
log.info("Received assistant API key via handshake");
|
|
614
|
-
}
|
|
615
|
-
if (hsAssistantId) {
|
|
616
|
-
assistantIdRef.current = hsAssistantId;
|
|
617
|
-
log.info("Received assistant ID via handshake");
|
|
618
|
-
}
|
|
619
|
-
},
|
|
620
|
-
onApiKeyUpdate: (newKey, newAssistantId) => {
|
|
621
|
-
apiKeyRef.current = newKey;
|
|
622
|
-
log.info("Assistant API key updated via RPC");
|
|
623
|
-
if (newAssistantId) {
|
|
624
|
-
assistantIdRef.current = newAssistantId;
|
|
625
|
-
log.info("Assistant ID updated via RPC");
|
|
701
|
+
const socketPath = getBootstrapSocketPath();
|
|
702
|
+
|
|
703
|
+
while (!controller.signal.aborted) {
|
|
704
|
+
log.info(`Waiting for assistant connection on ${socketPath}...`);
|
|
705
|
+
|
|
706
|
+
let connection: Awaited<ReturnType<typeof acceptOneConnection>>;
|
|
707
|
+
try {
|
|
708
|
+
connection = await acceptOneConnection(socketPath, controller.signal);
|
|
709
|
+
} catch (err) {
|
|
710
|
+
if (controller.signal.aborted) {
|
|
711
|
+
log.info("Shutdown before assistant connected.");
|
|
712
|
+
return;
|
|
626
713
|
}
|
|
627
|
-
|
|
628
|
-
|
|
714
|
+
throw err;
|
|
715
|
+
}
|
|
716
|
+
|
|
717
|
+
rpcConnected = true;
|
|
629
718
|
|
|
630
|
-
|
|
719
|
+
const server = new CesRpcServer({
|
|
720
|
+
input: connection.readable,
|
|
721
|
+
output: connection.writable,
|
|
722
|
+
handlers,
|
|
723
|
+
logger: {
|
|
724
|
+
log: (msg: string, ...args: unknown[]) => rpcLog.info({ args }, msg),
|
|
725
|
+
warn: (msg: string, ...args: unknown[]) => rpcLog.warn({ args }, msg),
|
|
726
|
+
error: (msg: string, ...args: unknown[]) => rpcLog.error({ args }, msg),
|
|
727
|
+
},
|
|
728
|
+
signal: controller.signal,
|
|
729
|
+
onHandshakeComplete: (hsSessionId, hsApiKey, hsAssistantId) => {
|
|
730
|
+
sessionIdRef.current = hsSessionId;
|
|
731
|
+
// Overwrite the credential refs on every handshake. The handler
|
|
732
|
+
// registry persists across reconnects, so a new session that omits
|
|
733
|
+
// the API key / assistant ID must fail closed (falling back to the
|
|
734
|
+
// env key, or no key) rather than reusing the previous session's
|
|
735
|
+
// credentials.
|
|
736
|
+
applyManagedCredentialRefs(
|
|
737
|
+
apiKeyRef,
|
|
738
|
+
assistantIdRef,
|
|
739
|
+
hsApiKey,
|
|
740
|
+
hsAssistantId,
|
|
741
|
+
);
|
|
742
|
+
if (hsApiKey) {
|
|
743
|
+
log.info("Received assistant API key via handshake");
|
|
744
|
+
}
|
|
745
|
+
if (hsAssistantId) {
|
|
746
|
+
log.info("Received assistant ID via handshake");
|
|
747
|
+
}
|
|
748
|
+
},
|
|
749
|
+
onApiKeyUpdate: (newKey, newAssistantId) => {
|
|
750
|
+
// Overwrite both refs on every credential update, for the same
|
|
751
|
+
// fail-closed reason as the handshake: the assistant sources the
|
|
752
|
+
// assistant ID from the same place it sources the key, so an update
|
|
753
|
+
// that omits the ID means it has none — CES must clear the stale ID
|
|
754
|
+
// rather than keep materializing for the previous session's assistant.
|
|
755
|
+
applyManagedCredentialRefs(
|
|
756
|
+
apiKeyRef,
|
|
757
|
+
assistantIdRef,
|
|
758
|
+
newKey,
|
|
759
|
+
newAssistantId,
|
|
760
|
+
);
|
|
761
|
+
log.info("Assistant API key updated via RPC");
|
|
762
|
+
if (newAssistantId) {
|
|
763
|
+
log.info("Assistant ID updated via RPC");
|
|
764
|
+
}
|
|
765
|
+
},
|
|
766
|
+
});
|
|
631
767
|
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
768
|
+
// `serve()` resolves on a clean stream end or signal abort, and rejects
|
|
769
|
+
// when the transport stream errors — which is precisely what a hard
|
|
770
|
+
// disconnect (connection reset when the assistant container crashes)
|
|
771
|
+
// looks like. Both cases must keep the sidecar up; only a shutdown
|
|
772
|
+
// signal should tear it down. So treat a serve() rejection the same as
|
|
773
|
+
// a session end and fall through to await reconnection.
|
|
774
|
+
let endReason: ServeEndReason | "transport_error";
|
|
775
|
+
try {
|
|
776
|
+
endReason = await server.serve();
|
|
777
|
+
} catch (err) {
|
|
778
|
+
server.close();
|
|
779
|
+
endReason = "transport_error";
|
|
780
|
+
log.warn(
|
|
781
|
+
{ err, uptime: process.uptime(), pid: process.pid },
|
|
782
|
+
"RPC transport errored — treating as session end",
|
|
783
|
+
);
|
|
784
|
+
}
|
|
785
|
+
|
|
786
|
+
rpcConnected = false;
|
|
787
|
+
|
|
788
|
+
// Drop all ephemeral approvals when the session ends. `allow_once` /
|
|
789
|
+
// `allow_10m` grants are keyed by proposal hash only, so reusing the
|
|
790
|
+
// store across a reconnect would let a pre-disconnect approval be
|
|
791
|
+
// consumed by a later session without re-prompting. Clearing here
|
|
792
|
+
// restores the prior behavior, where the process exited on stream end
|
|
793
|
+
// and these grants never survived.
|
|
794
|
+
temporaryGrantStore.clear();
|
|
795
|
+
|
|
796
|
+
// A signal-driven end means the process is shutting down; exit the loop.
|
|
797
|
+
// Any other end reason (the assistant disconnected, its stream closed,
|
|
798
|
+
// or the transport errored) means we keep the sidecar up and await a
|
|
799
|
+
// reconnection.
|
|
800
|
+
if (
|
|
801
|
+
controller.signal.aborted ||
|
|
802
|
+
endReason === "signal_aborted" ||
|
|
803
|
+
endReason === "signal_aborted_before_start"
|
|
804
|
+
) {
|
|
805
|
+
log.info(
|
|
806
|
+
{ reason: endReason, uptime: process.uptime(), pid: process.pid },
|
|
807
|
+
"RPC session ended due to shutdown — exiting serve loop",
|
|
808
|
+
);
|
|
809
|
+
break;
|
|
810
|
+
}
|
|
811
|
+
|
|
812
|
+
log.warn(
|
|
813
|
+
{ reason: endReason, uptime: process.uptime(), pid: process.pid },
|
|
814
|
+
"RPC session ended (assistant disconnected) — awaiting reconnection",
|
|
815
|
+
);
|
|
816
|
+
}
|
|
635
817
|
}
|
|
636
818
|
|
|
637
819
|
main().catch((err) => {
|