@vellumai/credential-executor 0.8.5 → 0.8.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -9,11 +9,76 @@
|
|
|
9
9
|
import { describe, expect, test } from "bun:test";
|
|
10
10
|
|
|
11
11
|
import {
|
|
12
|
+
applyManagedCredentialRefs,
|
|
12
13
|
buildLazyGetters,
|
|
13
14
|
type ApiKeyRef,
|
|
14
15
|
type AssistantIdRef,
|
|
15
16
|
} from "../managed-lazy-getters.js";
|
|
16
17
|
|
|
18
|
+
// ---------------------------------------------------------------------------
|
|
19
|
+
// applyManagedCredentialRefs — fail-closed overwrite across sessions
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
|
|
22
|
+
describe("applyManagedCredentialRefs", () => {
|
|
23
|
+
test("overwrites both refs with the provided values", () => {
|
|
24
|
+
const apiKeyRef: ApiKeyRef = { current: "old-key" };
|
|
25
|
+
const assistantIdRef: AssistantIdRef = { current: "ast_old" };
|
|
26
|
+
|
|
27
|
+
applyManagedCredentialRefs(apiKeyRef, assistantIdRef, "new-key", "ast_new");
|
|
28
|
+
|
|
29
|
+
expect(apiKeyRef.current).toBe("new-key");
|
|
30
|
+
expect(assistantIdRef.current).toBe("ast_new");
|
|
31
|
+
});
|
|
32
|
+
|
|
33
|
+
test("clears a stale assistant ID when the new value is omitted", () => {
|
|
34
|
+
// A new session (or an API-key-only update) that does not carry an
|
|
35
|
+
// assistant ID must not inherit the previous session's ID.
|
|
36
|
+
const apiKeyRef: ApiKeyRef = { current: "prev-key" };
|
|
37
|
+
const assistantIdRef: AssistantIdRef = { current: "ast_prev" };
|
|
38
|
+
|
|
39
|
+
applyManagedCredentialRefs(
|
|
40
|
+
apiKeyRef,
|
|
41
|
+
assistantIdRef,
|
|
42
|
+
"rotated-key",
|
|
43
|
+
undefined,
|
|
44
|
+
);
|
|
45
|
+
|
|
46
|
+
expect(apiKeyRef.current).toBe("rotated-key");
|
|
47
|
+
expect(assistantIdRef.current).toBe("");
|
|
48
|
+
});
|
|
49
|
+
|
|
50
|
+
test("clears a stale API key when the new value is omitted", () => {
|
|
51
|
+
const apiKeyRef: ApiKeyRef = { current: "prev-key" };
|
|
52
|
+
const assistantIdRef: AssistantIdRef = { current: "ast_prev" };
|
|
53
|
+
|
|
54
|
+
applyManagedCredentialRefs(apiKeyRef, assistantIdRef, undefined, undefined);
|
|
55
|
+
|
|
56
|
+
expect(apiKeyRef.current).toBe("");
|
|
57
|
+
expect(assistantIdRef.current).toBe("");
|
|
58
|
+
});
|
|
59
|
+
|
|
60
|
+
test("lazy getters fail closed after a reconnect that omits the ID", () => {
|
|
61
|
+
const apiKeyRef: ApiKeyRef = { current: "session1-key" };
|
|
62
|
+
const assistantIdRef: AssistantIdRef = { current: "ast_session1" };
|
|
63
|
+
const { getManagedMaterializerOptions } = buildLazyGetters({
|
|
64
|
+
platformBaseUrl: "https://api.vellum.ai",
|
|
65
|
+
assistantIdRef,
|
|
66
|
+
apiKeyRef,
|
|
67
|
+
});
|
|
68
|
+
|
|
69
|
+
// A reconnecting session provides a key but no assistant ID.
|
|
70
|
+
applyManagedCredentialRefs(
|
|
71
|
+
apiKeyRef,
|
|
72
|
+
assistantIdRef,
|
|
73
|
+
"session2-key",
|
|
74
|
+
undefined,
|
|
75
|
+
);
|
|
76
|
+
|
|
77
|
+
// Materialization must not proceed with the prior session's assistant ID.
|
|
78
|
+
expect(getManagedMaterializerOptions()).toBeUndefined();
|
|
79
|
+
});
|
|
80
|
+
});
|
|
81
|
+
|
|
17
82
|
// ---------------------------------------------------------------------------
|
|
18
83
|
// Before API key arrives
|
|
19
84
|
// ---------------------------------------------------------------------------
|
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Managed CES reconnection test (real entrypoint subprocess).
|
|
3
|
+
*
|
|
4
|
+
* Spawns the actual `managed-main.ts` entrypoint and verifies that the CES
|
|
5
|
+
* sidecar survives the assistant disconnecting and accepts a reconnection,
|
|
6
|
+
* rather than shutting down when the RPC stream ends.
|
|
7
|
+
*
|
|
8
|
+
* This guards the core invariant that CES runs independently of whether the
|
|
9
|
+
* assistant is actively connected: the assistant container can crash and be
|
|
10
|
+
* restarted (Kubernetes restarts containers, not the whole pod), and the
|
|
11
|
+
* restarted assistant must be able to reconnect to a still-running CES.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { afterEach, describe, expect, test } from "bun:test";
|
|
15
|
+
import { createConnection, createServer, type Socket } from "node:net";
|
|
16
|
+
import { existsSync, mkdirSync, mkdtempSync, rmSync } from "node:fs";
|
|
17
|
+
import { tmpdir } from "node:os";
|
|
18
|
+
import { join, resolve } from "node:path";
|
|
19
|
+
|
|
20
|
+
import {
|
|
21
|
+
CES_PROTOCOL_VERSION,
|
|
22
|
+
type HandshakeAck,
|
|
23
|
+
} from "@vellumai/service-contracts/credential-rpc";
|
|
24
|
+
|
|
25
|
+
import type { Subprocess } from "bun";
|
|
26
|
+
|
|
27
|
+
// ---------------------------------------------------------------------------
|
|
28
|
+
// Helpers
|
|
29
|
+
// ---------------------------------------------------------------------------
|
|
30
|
+
|
|
31
|
+
/** Sleep for the given number of milliseconds. */
|
|
32
|
+
function delay(ms: number): Promise<void> {
|
|
33
|
+
return new Promise((r) => setTimeout(r, ms));
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
/** Pick a currently-free TCP port by binding to port 0 and reading it back. */
|
|
37
|
+
function pickFreePort(): Promise<number> {
|
|
38
|
+
return new Promise((resolve, reject) => {
|
|
39
|
+
const srv = createServer();
|
|
40
|
+
srv.listen(0, () => {
|
|
41
|
+
const address = srv.address();
|
|
42
|
+
const port = typeof address === "object" && address ? address.port : 0;
|
|
43
|
+
srv.close(() => (port ? resolve(port) : reject(new Error("no port"))));
|
|
44
|
+
});
|
|
45
|
+
srv.on("error", reject);
|
|
46
|
+
});
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
/** Poll the health endpoint until it responds OK or the deadline passes. */
|
|
50
|
+
async function waitForHealth(port: number, timeoutMs = 15_000): Promise<void> {
|
|
51
|
+
const deadline = Date.now() + timeoutMs;
|
|
52
|
+
while (Date.now() < deadline) {
|
|
53
|
+
try {
|
|
54
|
+
const resp = await fetch(`http://127.0.0.1:${port}/healthz`);
|
|
55
|
+
if (resp.ok) return;
|
|
56
|
+
} catch {
|
|
57
|
+
// not up yet
|
|
58
|
+
}
|
|
59
|
+
await delay(100);
|
|
60
|
+
}
|
|
61
|
+
throw new Error(`CES health endpoint did not come up within ${timeoutMs}ms`);
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
/** Read the `rpcConnected` field from /readyz. */
|
|
65
|
+
async function readyzRpcConnected(port: number): Promise<boolean> {
|
|
66
|
+
const resp = await fetch(`http://127.0.0.1:${port}/readyz`);
|
|
67
|
+
const body = (await resp.json()) as { rpcConnected?: boolean };
|
|
68
|
+
return body.rpcConnected === true;
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
/** Wait until the socket path exists (CES has bound the bootstrap socket). */
|
|
72
|
+
async function waitForSocket(socketPath: string, timeoutMs = 10_000): Promise<void> {
|
|
73
|
+
const deadline = Date.now() + timeoutMs;
|
|
74
|
+
while (Date.now() < deadline) {
|
|
75
|
+
if (existsSync(socketPath)) return;
|
|
76
|
+
await delay(50);
|
|
77
|
+
}
|
|
78
|
+
throw new Error(`Bootstrap socket did not appear within ${timeoutMs}ms`);
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
/** Connect to the bootstrap socket, retrying past the listen/connect race. */
|
|
82
|
+
function connectToSocket(
|
|
83
|
+
socketPath: string,
|
|
84
|
+
{ maxRetries = 40, baseDelayMs = 25 } = {},
|
|
85
|
+
): Promise<Socket> {
|
|
86
|
+
return new Promise((resolveConn, reject) => {
|
|
87
|
+
let attempt = 0;
|
|
88
|
+
const tryConnect = () => {
|
|
89
|
+
const sock = createConnection(socketPath, () => {
|
|
90
|
+
sock.removeAllListeners("error");
|
|
91
|
+
resolveConn(sock);
|
|
92
|
+
});
|
|
93
|
+
sock.on("error", (err: NodeJS.ErrnoException) => {
|
|
94
|
+
sock.destroy();
|
|
95
|
+
attempt++;
|
|
96
|
+
if (
|
|
97
|
+
attempt < maxRetries &&
|
|
98
|
+
(err.code === "ENOENT" || err.code === "ECONNREFUSED")
|
|
99
|
+
) {
|
|
100
|
+
setTimeout(tryConnect, baseDelayMs);
|
|
101
|
+
} else {
|
|
102
|
+
reject(err);
|
|
103
|
+
}
|
|
104
|
+
});
|
|
105
|
+
};
|
|
106
|
+
tryConnect();
|
|
107
|
+
});
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
/** Send a handshake request and resolve with the parsed ack. */
|
|
111
|
+
function handshake(sock: Socket, sessionId: string): Promise<HandshakeAck> {
|
|
112
|
+
return new Promise((resolveAck, reject) => {
|
|
113
|
+
let buffer = "";
|
|
114
|
+
const timer = setTimeout(() => {
|
|
115
|
+
sock.removeAllListeners("data");
|
|
116
|
+
reject(new Error("Timed out waiting for handshake ack"));
|
|
117
|
+
}, 5_000);
|
|
118
|
+
|
|
119
|
+
sock.on("data", (chunk: Buffer) => {
|
|
120
|
+
buffer += chunk.toString("utf-8");
|
|
121
|
+
const idx = buffer.indexOf("\n");
|
|
122
|
+
if (idx === -1) return;
|
|
123
|
+
const line = buffer.slice(0, idx).trim();
|
|
124
|
+
clearTimeout(timer);
|
|
125
|
+
sock.removeAllListeners("data");
|
|
126
|
+
try {
|
|
127
|
+
resolveAck(JSON.parse(line) as HandshakeAck);
|
|
128
|
+
} catch (err) {
|
|
129
|
+
reject(err as Error);
|
|
130
|
+
}
|
|
131
|
+
});
|
|
132
|
+
sock.on("error", (err) => {
|
|
133
|
+
clearTimeout(timer);
|
|
134
|
+
reject(err);
|
|
135
|
+
});
|
|
136
|
+
|
|
137
|
+
sock.write(
|
|
138
|
+
JSON.stringify({
|
|
139
|
+
type: "handshake_request",
|
|
140
|
+
protocolVersion: CES_PROTOCOL_VERSION,
|
|
141
|
+
sessionId,
|
|
142
|
+
}) + "\n",
|
|
143
|
+
);
|
|
144
|
+
});
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
// ---------------------------------------------------------------------------
|
|
148
|
+
// Test
|
|
149
|
+
// ---------------------------------------------------------------------------
|
|
150
|
+
|
|
151
|
+
let tmpDir: string | undefined;
|
|
152
|
+
let proc: Subprocess | undefined;
|
|
153
|
+
|
|
154
|
+
afterEach(async () => {
|
|
155
|
+
if (proc) {
|
|
156
|
+
proc.kill("SIGTERM");
|
|
157
|
+
await Promise.race([proc.exited, delay(3_000)]);
|
|
158
|
+
proc = undefined;
|
|
159
|
+
}
|
|
160
|
+
if (tmpDir) {
|
|
161
|
+
try {
|
|
162
|
+
rmSync(tmpDir, { recursive: true, force: true });
|
|
163
|
+
} catch {
|
|
164
|
+
/* ok */
|
|
165
|
+
}
|
|
166
|
+
tmpDir = undefined;
|
|
167
|
+
}
|
|
168
|
+
});
|
|
169
|
+
|
|
170
|
+
describe("managed CES reconnection (real entrypoint)", () => {
|
|
171
|
+
test("survives an assistant disconnect and accepts a reconnection", async () => {
|
|
172
|
+
tmpDir = mkdtempSync(join(tmpdir(), "ces-reconnect-"));
|
|
173
|
+
const dataDir = join(tmpDir, "ces-data");
|
|
174
|
+
const socketDir = join(tmpDir, "bootstrap");
|
|
175
|
+
const socketPath = join(socketDir, "ces.sock");
|
|
176
|
+
const assistantDataMount = join(tmpDir, "assistant-data-ro");
|
|
177
|
+
mkdirSync(dataDir, { recursive: true });
|
|
178
|
+
mkdirSync(socketDir, { recursive: true });
|
|
179
|
+
mkdirSync(join(assistantDataMount, ".vellum"), { recursive: true });
|
|
180
|
+
|
|
181
|
+
const healthPort = await pickFreePort();
|
|
182
|
+
const managedMain = resolve(__dirname, "..", "managed-main.ts");
|
|
183
|
+
|
|
184
|
+
proc = Bun.spawn({
|
|
185
|
+
cmd: [process.execPath, managedMain],
|
|
186
|
+
env: {
|
|
187
|
+
...process.env,
|
|
188
|
+
CES_MODE: "managed",
|
|
189
|
+
CES_DATA_DIR: dataDir,
|
|
190
|
+
CES_BOOTSTRAP_SOCKET: socketPath,
|
|
191
|
+
CES_HEALTH_PORT: String(healthPort),
|
|
192
|
+
CES_ASSISTANT_DATA_MOUNT: assistantDataMount,
|
|
193
|
+
},
|
|
194
|
+
stdout: "ignore",
|
|
195
|
+
stderr: "ignore",
|
|
196
|
+
});
|
|
197
|
+
|
|
198
|
+
// Sidecar comes up and binds the bootstrap socket.
|
|
199
|
+
await waitForHealth(healthPort);
|
|
200
|
+
await waitForSocket(socketPath);
|
|
201
|
+
expect(await readyzRpcConnected(healthPort)).toBe(false);
|
|
202
|
+
|
|
203
|
+
// First assistant session connects and completes a handshake.
|
|
204
|
+
const first = await connectToSocket(socketPath);
|
|
205
|
+
const ack1 = await handshake(first, "session-1");
|
|
206
|
+
expect(ack1.accepted).toBe(true);
|
|
207
|
+
|
|
208
|
+
// Give /readyz a moment to flip, then confirm CES sees the connection.
|
|
209
|
+
await delay(200);
|
|
210
|
+
expect(await readyzRpcConnected(healthPort)).toBe(true);
|
|
211
|
+
|
|
212
|
+
// Simulate the assistant container crashing: drop the connection hard.
|
|
213
|
+
first.destroy();
|
|
214
|
+
|
|
215
|
+
// CES must NOT exit. It should stay healthy, flip rpcConnected back to
|
|
216
|
+
// false, and re-bind the bootstrap socket to await a reconnection.
|
|
217
|
+
await waitForSocket(socketPath);
|
|
218
|
+
expect(proc.killed).toBe(false);
|
|
219
|
+
const resp = await fetch(`http://127.0.0.1:${healthPort}/healthz`);
|
|
220
|
+
expect(resp.ok).toBe(true);
|
|
221
|
+
|
|
222
|
+
// Wait for the first session's rpcConnected flag to clear before reconnecting.
|
|
223
|
+
const cleared = await (async () => {
|
|
224
|
+
const deadline = Date.now() + 5_000;
|
|
225
|
+
while (Date.now() < deadline) {
|
|
226
|
+
if (!(await readyzRpcConnected(healthPort))) return true;
|
|
227
|
+
await delay(100);
|
|
228
|
+
}
|
|
229
|
+
return false;
|
|
230
|
+
})();
|
|
231
|
+
expect(cleared).toBe(true);
|
|
232
|
+
|
|
233
|
+
// The restarted assistant reconnects and handshakes successfully.
|
|
234
|
+
const second = await connectToSocket(socketPath);
|
|
235
|
+
const ack2 = await handshake(second, "session-2");
|
|
236
|
+
expect(ack2.accepted).toBe(true);
|
|
237
|
+
expect(ack2.sessionId).toBe("session-2");
|
|
238
|
+
|
|
239
|
+
await delay(200);
|
|
240
|
+
expect(await readyzRpcConnected(healthPort)).toBe(true);
|
|
241
|
+
|
|
242
|
+
second.destroy();
|
|
243
|
+
}, 30_000);
|
|
244
|
+
});
|
|
@@ -32,6 +32,27 @@ export interface AssistantIdRef {
|
|
|
32
32
|
current: string;
|
|
33
33
|
}
|
|
34
34
|
|
|
35
|
+
/**
|
|
36
|
+
* Overwrite the session-scoped managed credential refs.
|
|
37
|
+
*
|
|
38
|
+
* The managed handler registry is long-lived — it persists across assistant
|
|
39
|
+
* reconnects — so every handshake and every `update_managed_credential` must
|
|
40
|
+
* fully overwrite these refs, including clearing them when a value is absent.
|
|
41
|
+
* Otherwise a new or reprovisioned session could keep materializing platform
|
|
42
|
+
* credentials with the previous session's API key or assistant ID. Absent
|
|
43
|
+
* values fall back to "" (fail closed): without an assistant ID the lazy getters
|
|
44
|
+
* return no options; without an API key `getAssistantApiKey` uses the env key.
|
|
45
|
+
*/
|
|
46
|
+
export function applyManagedCredentialRefs(
|
|
47
|
+
apiKeyRef: ApiKeyRef,
|
|
48
|
+
assistantIdRef: AssistantIdRef,
|
|
49
|
+
apiKey: string | undefined,
|
|
50
|
+
assistantId: string | undefined,
|
|
51
|
+
): void {
|
|
52
|
+
apiKeyRef.current = apiKey ?? "";
|
|
53
|
+
assistantIdRef.current = assistantId ?? "";
|
|
54
|
+
}
|
|
55
|
+
|
|
35
56
|
export interface LazyGetterOptions {
|
|
36
57
|
platformBaseUrl: string;
|
|
37
58
|
assistantIdRef: AssistantIdRef;
|
|
@@ -55,10 +76,11 @@ export interface LazyGetters {
|
|
|
55
76
|
export function buildLazyGetters(opts: LazyGetterOptions): LazyGetters {
|
|
56
77
|
const { platformBaseUrl, assistantIdRef, apiKeyRef, envApiKey } = opts;
|
|
57
78
|
|
|
58
|
-
const getAssistantApiKey = (): string =>
|
|
59
|
-
apiKeyRef.current || envApiKey || "";
|
|
79
|
+
const getAssistantApiKey = (): string => apiKeyRef.current || envApiKey || "";
|
|
60
80
|
|
|
61
|
-
const getManagedSubjectOptions = ():
|
|
81
|
+
const getManagedSubjectOptions = ():
|
|
82
|
+
| ManagedSubjectResolverOptions
|
|
83
|
+
| undefined => {
|
|
62
84
|
const key = getAssistantApiKey();
|
|
63
85
|
const id = assistantIdRef.current;
|
|
64
86
|
return platformBaseUrl && key && id
|
|
@@ -66,7 +88,9 @@ export function buildLazyGetters(opts: LazyGetterOptions): LazyGetters {
|
|
|
66
88
|
: undefined;
|
|
67
89
|
};
|
|
68
90
|
|
|
69
|
-
const getManagedMaterializerOptions = ():
|
|
91
|
+
const getManagedMaterializerOptions = ():
|
|
92
|
+
| ManagedMaterializerOptions
|
|
93
|
+
| undefined => {
|
|
70
94
|
const key = getAssistantApiKey();
|
|
71
95
|
const id = assistantIdRef.current;
|
|
72
96
|
return platformBaseUrl && key && id
|
package/src/managed-main.ts
CHANGED
|
@@ -6,11 +6,15 @@
|
|
|
6
6
|
*
|
|
7
7
|
* 1. Ensures the CES-private data directories exist.
|
|
8
8
|
* 2. Binds a bootstrap Unix socket on the shared bootstrap volume.
|
|
9
|
-
* 3. Accepts
|
|
9
|
+
* 3. Accepts a single assistant runtime connection.
|
|
10
10
|
* 4. Unlinks the socket path immediately after the connection is accepted,
|
|
11
|
-
* preventing any second process from connecting.
|
|
11
|
+
* preventing any second process from connecting while the session is live.
|
|
12
12
|
* 5. Serves RPC on the accepted stream only.
|
|
13
|
-
* 6.
|
|
13
|
+
* 6. When that session ends (the assistant disconnects or its container is
|
|
14
|
+
* restarted), re-binds the socket and awaits a reconnection. CES is a
|
|
15
|
+
* long-lived sidecar — it outlives any single assistant session and only
|
|
16
|
+
* shuts down on SIGTERM/SIGINT. At most one connection is ever active.
|
|
17
|
+
* 7. Simultaneously serves health probes (`/healthz`, `/readyz`) on a
|
|
14
18
|
* dedicated HTTP port for Kubernetes liveness/readiness checks.
|
|
15
19
|
*
|
|
16
20
|
* The managed entrypoint never opens a generic TCP or HTTP command API.
|
|
@@ -52,6 +56,7 @@ import {
|
|
|
52
56
|
registerCommandExecutionHandler,
|
|
53
57
|
registerManageSecureCommandToolHandler,
|
|
54
58
|
type RpcHandlerRegistry,
|
|
59
|
+
type ServeEndReason,
|
|
55
60
|
type SessionIdRef,
|
|
56
61
|
} from "./server.js";
|
|
57
62
|
import {
|
|
@@ -67,6 +72,7 @@ import {
|
|
|
67
72
|
parseHandle,
|
|
68
73
|
} from "@vellumai/service-contracts/credential-rpc";
|
|
69
74
|
import {
|
|
75
|
+
applyManagedCredentialRefs,
|
|
70
76
|
buildLazyGetters,
|
|
71
77
|
type ApiKeyRef,
|
|
72
78
|
type AssistantIdRef,
|
|
@@ -118,7 +124,7 @@ function buildHandlers(
|
|
|
118
124
|
apiKeyRef: ApiKeyRef,
|
|
119
125
|
assistantIdRef: AssistantIdRef,
|
|
120
126
|
secureKeyBackend: SecureKeyBackend,
|
|
121
|
-
): RpcHandlerRegistry {
|
|
127
|
+
): { handlers: RpcHandlerRegistry; temporaryGrantStore: TemporaryGrantStore } {
|
|
122
128
|
// -- Grant stores ----------------------------------------------------------
|
|
123
129
|
const persistentGrantStore = new PersistentGrantStore(
|
|
124
130
|
getCesGrantsDir("managed"),
|
|
@@ -140,15 +146,13 @@ function buildHandlers(
|
|
|
140
146
|
// though handlers are built before the handshake completes.
|
|
141
147
|
const platformBaseUrl = process.env["VELLUM_PLATFORM_URL"] ?? "";
|
|
142
148
|
|
|
143
|
-
const {
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
envApiKey: process.env["ASSISTANT_API_KEY"] || "",
|
|
151
|
-
});
|
|
149
|
+
const { getManagedSubjectOptions, getManagedMaterializerOptions } =
|
|
150
|
+
buildLazyGetters({
|
|
151
|
+
platformBaseUrl,
|
|
152
|
+
assistantIdRef,
|
|
153
|
+
apiKeyRef,
|
|
154
|
+
envApiKey: process.env["ASSISTANT_API_KEY"] || "",
|
|
155
|
+
});
|
|
152
156
|
|
|
153
157
|
if (!platformBaseUrl) {
|
|
154
158
|
log.warn(
|
|
@@ -189,7 +193,10 @@ function buildHandlers(
|
|
|
189
193
|
};
|
|
190
194
|
|
|
191
195
|
const localSubjectDepsStub: LocalSubjectResolverDeps = {
|
|
192
|
-
metadataStore: {
|
|
196
|
+
metadataStore: {
|
|
197
|
+
getById: () => undefined,
|
|
198
|
+
list: () => [],
|
|
199
|
+
} as unknown as LocalSubjectResolverDeps["metadataStore"],
|
|
193
200
|
oauthConnections: { getById: () => undefined },
|
|
194
201
|
};
|
|
195
202
|
|
|
@@ -408,7 +415,7 @@ function buildHandlers(
|
|
|
408
415
|
return { results };
|
|
409
416
|
}) as (typeof handlers)[string];
|
|
410
417
|
|
|
411
|
-
return handlers;
|
|
418
|
+
return { handlers, temporaryGrantStore };
|
|
412
419
|
}
|
|
413
420
|
|
|
414
421
|
// ---------------------------------------------------------------------------
|
|
@@ -476,13 +483,18 @@ function startHealthServer(
|
|
|
476
483
|
}
|
|
477
484
|
|
|
478
485
|
// ---------------------------------------------------------------------------
|
|
479
|
-
// Bootstrap socket server (accepts
|
|
486
|
+
// Bootstrap socket server (accepts one connection at a time)
|
|
480
487
|
// ---------------------------------------------------------------------------
|
|
481
488
|
|
|
482
489
|
/**
|
|
483
|
-
* Listen on a Unix socket, accept
|
|
484
|
-
*
|
|
485
|
-
*
|
|
490
|
+
* Listen on a Unix socket, accept one connection, unlink the socket path,
|
|
491
|
+
* and return readable/writable streams for the accepted connection.
|
|
492
|
+
*
|
|
493
|
+
* The socket is unlinked while a connection is active so no second process
|
|
494
|
+
* can connect concurrently (only one assistant ever talks to CES at a time).
|
|
495
|
+
* When that session ends, the caller re-invokes this function to re-bind the
|
|
496
|
+
* socket and accept the assistant's reconnection — CES outlives any single
|
|
497
|
+
* assistant session (see `main()`).
|
|
486
498
|
*/
|
|
487
499
|
function acceptOneConnection(
|
|
488
500
|
socketPath: string,
|
|
@@ -515,16 +527,17 @@ function acceptOneConnection(
|
|
|
515
527
|
return;
|
|
516
528
|
}
|
|
517
529
|
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
);
|
|
530
|
+
// Remove this listener once the promise settles. Because CES re-binds
|
|
531
|
+
// the socket after each session ends, a long-lived AbortSignal would
|
|
532
|
+
// otherwise accumulate one dangling listener per reconnection.
|
|
533
|
+
const onAbort = () => {
|
|
534
|
+
cleanup();
|
|
535
|
+
reject(new Error("Aborted while waiting for connection"));
|
|
536
|
+
};
|
|
537
|
+
signal.addEventListener("abort", onAbort, { once: true });
|
|
526
538
|
|
|
527
539
|
netServer.on("error", (err) => {
|
|
540
|
+
signal.removeEventListener("abort", onAbort);
|
|
528
541
|
cleanup();
|
|
529
542
|
reject(err);
|
|
530
543
|
});
|
|
@@ -534,8 +547,10 @@ function acceptOneConnection(
|
|
|
534
547
|
});
|
|
535
548
|
|
|
536
549
|
netServer.on("connection", (socket: Socket) => {
|
|
537
|
-
// Accept
|
|
538
|
-
//
|
|
550
|
+
// Accept the connection, then close the listener and unlink the
|
|
551
|
+
// socket path so no other process can connect while this session
|
|
552
|
+
// is active.
|
|
553
|
+
signal.removeEventListener("abort", onAbort);
|
|
539
554
|
log.info("Assistant connected via bootstrap socket");
|
|
540
555
|
netServer.close();
|
|
541
556
|
try {
|
|
@@ -543,7 +558,7 @@ function acceptOneConnection(
|
|
|
543
558
|
} catch {
|
|
544
559
|
// Already unlinked
|
|
545
560
|
}
|
|
546
|
-
log.info("Bootstrap socket unlinked (single
|
|
561
|
+
log.info("Bootstrap socket unlinked (single active connection enforced)");
|
|
547
562
|
|
|
548
563
|
const readable = new Readable({
|
|
549
564
|
read() {
|
|
@@ -648,76 +663,157 @@ async function main(): Promise<void> {
|
|
|
648
663
|
startHealthServer(healthPort, controller.signal, credentialDeps);
|
|
649
664
|
log.info(`Health server listening on port ${healthPort}`);
|
|
650
665
|
|
|
651
|
-
//
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
// Build the handler registry with all available RPC implementations.
|
|
669
|
-
// Use mutable refs so the handshake-provided session ID and API key
|
|
670
|
-
// are available to handlers at call time (after the handshake completes).
|
|
666
|
+
// Build the handler registry once, up front, and reuse it across every
|
|
667
|
+
// assistant session. All CES state lives behind these handlers — file-backed
|
|
668
|
+
// grant/audit stores plus the in-memory temporary-grant store and the
|
|
669
|
+
// secure-command tool registry — and must be process-scoped so it survives
|
|
670
|
+
// an assistant reconnection. In particular, the tool registry mirrors the
|
|
671
|
+
// persistent toolstore on disk; rebuilding it per session would let a later
|
|
672
|
+
// `unregister` miss a tool registered in an earlier session and orphan its
|
|
673
|
+
// bundle.
|
|
674
|
+
//
|
|
675
|
+
// The in-memory temporary-grant store is the exception: `allow_once` /
|
|
676
|
+
// `allow_10m` grants are keyed by proposal hash only (not session), so they
|
|
677
|
+
// would otherwise leak ephemeral approvals across sessions. It is cleared at
|
|
678
|
+
// the end of every session below so a reconnecting assistant must re-prompt.
|
|
679
|
+
//
|
|
680
|
+
// The mutable refs carry the handshake-provided session ID, API key, and
|
|
681
|
+
// assistant ID; handlers read them at call time, so updating the refs when
|
|
682
|
+
// each session's handshake completes is all that's needed per connection.
|
|
671
683
|
const sessionIdRef: SessionIdRef = { current: `ces-managed-${Date.now()}` };
|
|
672
684
|
const apiKeyRef: ApiKeyRef = { current: "" };
|
|
673
685
|
const assistantIdRef: AssistantIdRef = { current: "" };
|
|
674
|
-
const handlers = buildHandlers(
|
|
686
|
+
const { handlers, temporaryGrantStore } = buildHandlers(
|
|
675
687
|
sessionIdRef,
|
|
676
688
|
apiKeyRef,
|
|
677
689
|
assistantIdRef,
|
|
678
690
|
secureKeyBackend,
|
|
679
691
|
);
|
|
680
692
|
|
|
693
|
+
// Serve loop. CES is a long-lived sidecar that must outlive any single
|
|
694
|
+
// assistant session: the assistant container can crash and be restarted
|
|
695
|
+
// independently of the CES container (Kubernetes restarts containers, not
|
|
696
|
+
// the whole pod), so when the RPC stream ends we re-bind the bootstrap
|
|
697
|
+
// socket and wait for the assistant to reconnect rather than tearing the
|
|
698
|
+
// sidecar down. The loop only exits on a shutdown signal (SIGTERM/SIGINT),
|
|
699
|
+
// which aborts the controller.
|
|
681
700
|
const rpcLog = getLogger("rpc");
|
|
682
|
-
const
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
}
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
if (hsApiKey) {
|
|
695
|
-
apiKeyRef.current = hsApiKey;
|
|
696
|
-
log.info("Received assistant API key via handshake");
|
|
697
|
-
}
|
|
698
|
-
if (hsAssistantId) {
|
|
699
|
-
assistantIdRef.current = hsAssistantId;
|
|
700
|
-
log.info("Received assistant ID via handshake");
|
|
701
|
-
}
|
|
702
|
-
},
|
|
703
|
-
onApiKeyUpdate: (newKey, newAssistantId) => {
|
|
704
|
-
apiKeyRef.current = newKey;
|
|
705
|
-
log.info("Assistant API key updated via RPC");
|
|
706
|
-
if (newAssistantId) {
|
|
707
|
-
assistantIdRef.current = newAssistantId;
|
|
708
|
-
log.info("Assistant ID updated via RPC");
|
|
701
|
+
const socketPath = getBootstrapSocketPath();
|
|
702
|
+
|
|
703
|
+
while (!controller.signal.aborted) {
|
|
704
|
+
log.info(`Waiting for assistant connection on ${socketPath}...`);
|
|
705
|
+
|
|
706
|
+
let connection: Awaited<ReturnType<typeof acceptOneConnection>>;
|
|
707
|
+
try {
|
|
708
|
+
connection = await acceptOneConnection(socketPath, controller.signal);
|
|
709
|
+
} catch (err) {
|
|
710
|
+
if (controller.signal.aborted) {
|
|
711
|
+
log.info("Shutdown before assistant connected.");
|
|
712
|
+
return;
|
|
709
713
|
}
|
|
710
|
-
|
|
711
|
-
|
|
714
|
+
throw err;
|
|
715
|
+
}
|
|
712
716
|
|
|
713
|
-
|
|
717
|
+
rpcConnected = true;
|
|
714
718
|
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
719
|
+
const server = new CesRpcServer({
|
|
720
|
+
input: connection.readable,
|
|
721
|
+
output: connection.writable,
|
|
722
|
+
handlers,
|
|
723
|
+
logger: {
|
|
724
|
+
log: (msg: string, ...args: unknown[]) => rpcLog.info({ args }, msg),
|
|
725
|
+
warn: (msg: string, ...args: unknown[]) => rpcLog.warn({ args }, msg),
|
|
726
|
+
error: (msg: string, ...args: unknown[]) => rpcLog.error({ args }, msg),
|
|
727
|
+
},
|
|
728
|
+
signal: controller.signal,
|
|
729
|
+
onHandshakeComplete: (hsSessionId, hsApiKey, hsAssistantId) => {
|
|
730
|
+
sessionIdRef.current = hsSessionId;
|
|
731
|
+
// Overwrite the credential refs on every handshake. The handler
|
|
732
|
+
// registry persists across reconnects, so a new session that omits
|
|
733
|
+
// the API key / assistant ID must fail closed (falling back to the
|
|
734
|
+
// env key, or no key) rather than reusing the previous session's
|
|
735
|
+
// credentials.
|
|
736
|
+
applyManagedCredentialRefs(
|
|
737
|
+
apiKeyRef,
|
|
738
|
+
assistantIdRef,
|
|
739
|
+
hsApiKey,
|
|
740
|
+
hsAssistantId,
|
|
741
|
+
);
|
|
742
|
+
if (hsApiKey) {
|
|
743
|
+
log.info("Received assistant API key via handshake");
|
|
744
|
+
}
|
|
745
|
+
if (hsAssistantId) {
|
|
746
|
+
log.info("Received assistant ID via handshake");
|
|
747
|
+
}
|
|
748
|
+
},
|
|
749
|
+
onApiKeyUpdate: (newKey, newAssistantId) => {
|
|
750
|
+
// Overwrite both refs on every credential update, for the same
|
|
751
|
+
// fail-closed reason as the handshake: the assistant sources the
|
|
752
|
+
// assistant ID from the same place it sources the key, so an update
|
|
753
|
+
// that omits the ID means it has none — CES must clear the stale ID
|
|
754
|
+
// rather than keep materializing for the previous session's assistant.
|
|
755
|
+
applyManagedCredentialRefs(
|
|
756
|
+
apiKeyRef,
|
|
757
|
+
assistantIdRef,
|
|
758
|
+
newKey,
|
|
759
|
+
newAssistantId,
|
|
760
|
+
);
|
|
761
|
+
log.info("Assistant API key updated via RPC");
|
|
762
|
+
if (newAssistantId) {
|
|
763
|
+
log.info("Assistant ID updated via RPC");
|
|
764
|
+
}
|
|
765
|
+
},
|
|
766
|
+
});
|
|
767
|
+
|
|
768
|
+
// `serve()` resolves on a clean stream end or signal abort, and rejects
|
|
769
|
+
// when the transport stream errors — which is precisely what a hard
|
|
770
|
+
// disconnect (connection reset when the assistant container crashes)
|
|
771
|
+
// looks like. Both cases must keep the sidecar up; only a shutdown
|
|
772
|
+
// signal should tear it down. So treat a serve() rejection the same as
|
|
773
|
+
// a session end and fall through to await reconnection.
|
|
774
|
+
let endReason: ServeEndReason | "transport_error";
|
|
775
|
+
try {
|
|
776
|
+
endReason = await server.serve();
|
|
777
|
+
} catch (err) {
|
|
778
|
+
server.close();
|
|
779
|
+
endReason = "transport_error";
|
|
780
|
+
log.warn(
|
|
781
|
+
{ err, uptime: process.uptime(), pid: process.pid },
|
|
782
|
+
"RPC transport errored — treating as session end",
|
|
783
|
+
);
|
|
784
|
+
}
|
|
785
|
+
|
|
786
|
+
rpcConnected = false;
|
|
787
|
+
|
|
788
|
+
// Drop all ephemeral approvals when the session ends. `allow_once` /
|
|
789
|
+
// `allow_10m` grants are keyed by proposal hash only, so reusing the
|
|
790
|
+
// store across a reconnect would let a pre-disconnect approval be
|
|
791
|
+
// consumed by a later session without re-prompting. Clearing here
|
|
792
|
+
// restores the prior behavior, where the process exited on stream end
|
|
793
|
+
// and these grants never survived.
|
|
794
|
+
temporaryGrantStore.clear();
|
|
795
|
+
|
|
796
|
+
// A signal-driven end means the process is shutting down; exit the loop.
|
|
797
|
+
// Any other end reason (the assistant disconnected, its stream closed,
|
|
798
|
+
// or the transport errored) means we keep the sidecar up and await a
|
|
799
|
+
// reconnection.
|
|
800
|
+
if (
|
|
801
|
+
controller.signal.aborted ||
|
|
802
|
+
endReason === "signal_aborted" ||
|
|
803
|
+
endReason === "signal_aborted_before_start"
|
|
804
|
+
) {
|
|
805
|
+
log.info(
|
|
806
|
+
{ reason: endReason, uptime: process.uptime(), pid: process.pid },
|
|
807
|
+
"RPC session ended due to shutdown — exiting serve loop",
|
|
808
|
+
);
|
|
809
|
+
break;
|
|
810
|
+
}
|
|
811
|
+
|
|
812
|
+
log.warn(
|
|
813
|
+
{ reason: endReason, uptime: process.uptime(), pid: process.pid },
|
|
814
|
+
"RPC session ended (assistant disconnected) — awaiting reconnection",
|
|
815
|
+
);
|
|
816
|
+
}
|
|
721
817
|
}
|
|
722
818
|
|
|
723
819
|
main().catch((err) => {
|