@vellumai/cli 0.8.4 → 0.8.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +17 -1
- package/knip.json +2 -1
- package/package.json +1 -1
- package/src/__tests__/api-key-check.test.ts +78 -0
- package/src/__tests__/backup.test.ts +38 -0
- package/src/__tests__/recover.test.ts +307 -0
- package/src/__tests__/retire.test.ts +241 -0
- package/src/__tests__/wake.test.ts +215 -0
- package/src/commands/backup.ts +2 -0
- package/src/commands/client.ts +62 -32
- package/src/commands/flags.ts +197 -0
- package/src/commands/gateway/token.ts +73 -0
- package/src/commands/gateway.ts +29 -0
- package/src/commands/logs.ts +6 -18
- package/src/commands/ps.ts +41 -41
- package/src/commands/recover.ts +47 -9
- package/src/commands/restore.ts +8 -1
- package/src/commands/retire.ts +145 -55
- package/src/commands/roadmap.ts +449 -0
- package/src/commands/rollback.ts +2 -14
- package/src/commands/ssh.ts +5 -24
- package/src/commands/teleport.ts +34 -26
- package/src/commands/upgrade.ts +8 -16
- package/src/commands/wake.ts +68 -45
- package/src/index.ts +9 -0
- package/src/lib/__tests__/port-allocator.test.ts +117 -0
- package/src/lib/__tests__/step-runner.test.ts +133 -0
- package/src/lib/api-key-check.ts +40 -0
- package/src/lib/assistant-config.ts +13 -0
- package/src/lib/config-utils.ts +24 -3
- package/src/lib/docker.ts +72 -8
- package/src/lib/hatch-local.ts +15 -2
- package/src/lib/http-client.ts +1 -3
- package/src/lib/local.ts +173 -292
- package/src/lib/orphan-detection.ts +9 -5
- package/src/lib/pgrep.ts +5 -1
- package/src/lib/platform-client.ts +97 -49
- package/src/lib/port-allocator.ts +93 -0
- package/src/lib/process.ts +109 -39
- package/src/lib/statefulset.ts +0 -10
- package/src/lib/step-runner.ts +102 -9
- package/src/lib/sync-cloud-assistants.ts +17 -0
- package/src/shared/provider-env-vars.ts +1 -0
|
@@ -521,6 +521,8 @@ export async function hatchAssistant(
|
|
|
521
521
|
);
|
|
522
522
|
}
|
|
523
523
|
|
|
524
|
+
// Delay, in milliseconds, passed to `setTimeout` by the platform fetch helpers in this file before they abort the in-flight request through their AbortController.
const PLATFORM_FETCH_TIMEOUT_MS = 10_000;
|
|
525
|
+
|
|
524
526
|
/**
|
|
525
527
|
* Lightweight pre-check: returns the first active managed assistant for the
|
|
526
528
|
* authenticated user, or `null` if none exists. Calls `GET /v1/assistants/`
|
|
@@ -536,20 +538,31 @@ export async function checkExistingPlatformAssistant(
|
|
|
536
538
|
const resolvedUrl = platformUrl || getPlatformUrl();
|
|
537
539
|
const url = `${resolvedUrl}/v1/assistants/`;
|
|
538
540
|
|
|
539
|
-
const
|
|
540
|
-
|
|
541
|
-
|
|
541
|
+
const controller = new AbortController();
|
|
542
|
+
const timeoutId = setTimeout(
|
|
543
|
+
() => controller.abort(),
|
|
544
|
+
PLATFORM_FETCH_TIMEOUT_MS,
|
|
545
|
+
);
|
|
542
546
|
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
+
try {
|
|
548
|
+
const response = await fetch(url, {
|
|
549
|
+
signal: controller.signal,
|
|
550
|
+
headers: await authHeaders(token, platformUrl),
|
|
551
|
+
});
|
|
547
552
|
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
+
if (!response.ok) {
|
|
554
|
+
// Non-fatal: if the list call fails, fall through and let hatch handle it.
|
|
555
|
+
return null;
|
|
556
|
+
}
|
|
557
|
+
|
|
558
|
+
const body = (await response.json()) as {
|
|
559
|
+
results?: HatchedAssistant[];
|
|
560
|
+
};
|
|
561
|
+
const active = body.results?.find((a) => a.status === "active");
|
|
562
|
+
return active ?? null;
|
|
563
|
+
} finally {
|
|
564
|
+
clearTimeout(timeoutId);
|
|
565
|
+
}
|
|
553
566
|
}
|
|
554
567
|
|
|
555
568
|
/**
|
|
@@ -563,17 +576,28 @@ export async function fetchPlatformAssistants(
|
|
|
563
576
|
const resolvedUrl = platformUrl || getPlatformUrl();
|
|
564
577
|
const url = `${resolvedUrl}/v1/assistants/`;
|
|
565
578
|
|
|
566
|
-
const
|
|
567
|
-
|
|
568
|
-
|
|
579
|
+
const controller = new AbortController();
|
|
580
|
+
const timeoutId = setTimeout(
|
|
581
|
+
() => controller.abort(),
|
|
582
|
+
PLATFORM_FETCH_TIMEOUT_MS,
|
|
583
|
+
);
|
|
569
584
|
|
|
570
|
-
|
|
585
|
+
try {
|
|
586
|
+
const response = await fetch(url, {
|
|
587
|
+
signal: controller.signal,
|
|
588
|
+
headers: await authHeaders(token, platformUrl),
|
|
589
|
+
});
|
|
571
590
|
|
|
572
|
-
|
|
573
|
-
results?: HatchedAssistant[];
|
|
574
|
-
};
|
|
591
|
+
if (!response.ok) return [];
|
|
575
592
|
|
|
576
|
-
|
|
593
|
+
const body = (await response.json()) as {
|
|
594
|
+
results?: HatchedAssistant[];
|
|
595
|
+
};
|
|
596
|
+
|
|
597
|
+
return (body.results ?? []).filter((a) => a.status === "active");
|
|
598
|
+
} finally {
|
|
599
|
+
clearTimeout(timeoutId);
|
|
600
|
+
}
|
|
577
601
|
}
|
|
578
602
|
|
|
579
603
|
export interface PlatformUser {
|
|
@@ -592,22 +616,34 @@ export async function fetchOrganizationId(
|
|
|
592
616
|
): Promise<string> {
|
|
593
617
|
const resolvedUrl = platformUrl || getPlatformUrl();
|
|
594
618
|
const url = `${resolvedUrl}/v1/organizations/`;
|
|
595
|
-
const response = await fetch(url, {
|
|
596
|
-
headers: { ...tokenAuthHeader(token) },
|
|
597
|
-
});
|
|
598
619
|
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
620
|
+
const controller = new AbortController();
|
|
621
|
+
const timeoutId = setTimeout(
|
|
622
|
+
() => controller.abort(),
|
|
623
|
+
PLATFORM_FETCH_TIMEOUT_MS,
|
|
624
|
+
);
|
|
604
625
|
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
626
|
+
try {
|
|
627
|
+
const response = await fetch(url, {
|
|
628
|
+
signal: controller.signal,
|
|
629
|
+
headers: { ...tokenAuthHeader(token) },
|
|
630
|
+
});
|
|
631
|
+
|
|
632
|
+
if (!response.ok) {
|
|
633
|
+
throw new Error(
|
|
634
|
+
`Failed to fetch organizations from ${resolvedUrl} (${response.status}). Try logging in again.`,
|
|
635
|
+
);
|
|
636
|
+
}
|
|
637
|
+
|
|
638
|
+
const body = (await response.json()) as OrganizationListResponse;
|
|
639
|
+
const orgId = body.results?.[0]?.id;
|
|
640
|
+
if (!orgId) {
|
|
641
|
+
throw new Error("No organization found for this account.");
|
|
642
|
+
}
|
|
643
|
+
return orgId;
|
|
644
|
+
} finally {
|
|
645
|
+
clearTimeout(timeoutId);
|
|
609
646
|
}
|
|
610
|
-
return orgId;
|
|
611
647
|
}
|
|
612
648
|
|
|
613
649
|
interface AllauthSessionResponse {
|
|
@@ -627,25 +663,37 @@ export async function fetchCurrentUser(
|
|
|
627
663
|
): Promise<PlatformUser> {
|
|
628
664
|
const resolvedUrl = platformUrl || getPlatformUrl();
|
|
629
665
|
const url = `${resolvedUrl}/_allauth/app/v1/auth/session`;
|
|
630
|
-
const response = await fetch(url, {
|
|
631
|
-
headers: { "X-Session-Token": token },
|
|
632
|
-
});
|
|
633
666
|
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
667
|
+
const controller = new AbortController();
|
|
668
|
+
const timeoutId = setTimeout(
|
|
669
|
+
() => controller.abort(),
|
|
670
|
+
PLATFORM_FETCH_TIMEOUT_MS,
|
|
671
|
+
);
|
|
672
|
+
|
|
673
|
+
try {
|
|
674
|
+
const response = await fetch(url, {
|
|
675
|
+
signal: controller.signal,
|
|
676
|
+
headers: { "X-Session-Token": token },
|
|
677
|
+
});
|
|
678
|
+
|
|
679
|
+
if (!response.ok) {
|
|
680
|
+
if (
|
|
681
|
+
response.status === 401 ||
|
|
682
|
+
response.status === 403 ||
|
|
683
|
+
response.status === 410
|
|
684
|
+
) {
|
|
685
|
+
throw new Error("Invalid or expired token. Please login again.");
|
|
686
|
+
}
|
|
687
|
+
throw new Error(
|
|
688
|
+
`Platform API error: ${response.status} ${response.statusText}`,
|
|
689
|
+
);
|
|
641
690
|
}
|
|
642
|
-
throw new Error(
|
|
643
|
-
`Platform API error: ${response.status} ${response.statusText}`,
|
|
644
|
-
);
|
|
645
|
-
}
|
|
646
691
|
|
|
647
|
-
|
|
648
|
-
|
|
692
|
+
const body = (await response.json()) as AllauthSessionResponse;
|
|
693
|
+
return body.data.user;
|
|
694
|
+
} finally {
|
|
695
|
+
clearTimeout(timeoutId);
|
|
696
|
+
}
|
|
649
697
|
}
|
|
650
698
|
|
|
651
699
|
// ---------------------------------------------------------------------------
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
import { createServer } from "net";
|
|
2
|
+
|
|
3
|
+
/**
 * Walks upward from `preferred` and returns the first host port that the
 * kernel will let us bind to. Used by `hatchDocker` to pick the gateway's
 * host-side port instead of always grabbing the env-default (e.g. 7830 /
 * 20100), which collides with any other local assistant — eval-spawned or
 * otherwise — already bound there.
 *
 * The previous design (`evals/src/lib/orphan-cleanup.ts`) tried to fix this
 * by sweeping dead eval-run resources before the next hatch. That only
 * helped when the conflict came from a prior eval run; an unrelated local
 * `vellum hatch` holding the port wedged the whole flow. Discovering an
 * open port at hatch time is the proper fix and lets us delete the cleanup
 * pre-flight entirely.
 *
 * Walks linearly from `preferred` upward rather than asking the kernel for
 * an arbitrary ephemeral port (`listen(0)`) so the resulting port stays
 * legible to operators — three local assistants land on N, N+1, N+2
 * instead of three random numbers in the 32768-60999 range.
 *
 * @param preferred First port to try; must be an integer in [1, 65535].
 * @param options.maxAttempts Maximum number of consecutive ports to probe
 *   (default 50); must be a positive integer.
 * @param options.host Address the probe binds to (default `"0.0.0.0"`).
 * @returns The first port in the walked range that could be bound.
 * @throws If the arguments are invalid, if a probe fails with anything other
 *   than an "address in use / not available" error (rethrown unchanged), or
 *   if every port in the walked range is taken.
 */
export async function findOpenPort(
  preferred: number,
  options: { maxAttempts?: number; host?: string } = {},
): Promise<number> {
  const maxTcpPort = 65535;
  const maxAttempts = options.maxAttempts ?? 50;
  const host = options.host ?? "0.0.0.0";

  if (!Number.isInteger(preferred) || preferred < 1 || preferred > maxTcpPort) {
    throw new Error(
      `findOpenPort: preferred port ${preferred} is not a valid TCP port`,
    );
  }
  if (!Number.isInteger(maxAttempts) || maxAttempts < 1) {
    throw new Error(
      `findOpenPort: maxAttempts ${maxAttempts} must be a positive integer`,
    );
  }

  // The walk never goes past the highest valid TCP port, so the range that is
  // reported on failure must be clamped to what was actually probed.
  const lastPort = Math.min(preferred + maxAttempts - 1, maxTcpPort);

  let lastError: unknown = null;
  for (let port = preferred; port <= lastPort; port++) {
    try {
      await probePort(port, host);
      return port;
    } catch (err) {
      if (!isPortInUseError(err)) {
        // EACCES / EPERM / etc. are not "try the next port" signals — those
        // are configuration problems an operator needs to see immediately.
        throw err;
      }
      lastError = err;
    }
  }

  const detail =
    lastError instanceof Error ? ` (last error: ${lastError.message})` : "";
  throw new Error(
    `findOpenPort: no open port in range [${preferred}, ${lastPort}]${detail}`,
  );
}
|
|
61
|
+
|
|
62
|
+
/**
 * Tests whether `port` on `host` can be bound at this moment by briefly
 * listening on it with a throwaway server.
 *
 * Resolves once the probe server has started listening and been closed
 * again, so the port proven free is not left occupied. Rejects with the
 * value carried by the server's `error` event (typically `EADDRINUSE`).
 */
function probePort(port: number, host: string): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    const probe = createServer();

    // Drop our listeners, close the server, and only then settle the promise.
    // This runs on the error path too: any argument `close` hands to its
    // callback is ignored, so both outcomes share one teardown routine.
    const shutdownThen = (settle: () => void): void => {
      probe.removeAllListeners();
      probe.close(() => settle());
    };

    probe.once("listening", () => {
      shutdownThen(() => resolve());
    });
    probe.once("error", (failure) => {
      shutdownThen(() => reject(failure));
    });

    probe.listen(port, host);
  });
}
|
|
86
|
+
|
|
87
|
+
/**
 * Reports whether `err` is an `Error` whose `code` property is `EADDRINUSE`
 * or `EADDRNOTAVAIL`. Anything else — including non-Error values — yields
 * `false`.
 */
function isPortInUseError(err: unknown): boolean {
  if (!(err instanceof Error) || !("code" in err)) {
    return false;
  }
  switch ((err as NodeJS.ErrnoException).code) {
    case "EADDRINUSE":
    case "EADDRNOTAVAIL":
      return true;
    default:
      return false;
  }
}
|
package/src/lib/process.ts
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
import { execFileSync } from "child_process";
|
|
2
2
|
import { existsSync, readFileSync, unlinkSync } from "fs";
|
|
3
3
|
|
|
4
|
+
import { httpHealthCheck, waitForDaemonReady } from "./http-client.js";
|
|
5
|
+
|
|
4
6
|
/**
|
|
5
7
|
* Verify that a PID belongs to a vellum-related process by inspecting its
|
|
6
8
|
* command line via `ps`. Prevents killing unrelated processes when a PID file
|
|
@@ -21,13 +23,15 @@ export function isVellumProcess(pid: number): boolean {
|
|
|
21
23
|
}
|
|
22
24
|
}
|
|
23
25
|
|
|
26
|
+
/**
 * Outcome of a PID-file liveness check, discriminated on `alive`:
 * a live process always carries a numeric `pid`, a dead one carries `null`.
 */
export type ProcessAliveResult =
  | { alive: false; pid: null }
  | { alive: true; pid: number };
|
|
30
|
+
|
|
24
31
|
/**
|
|
25
32
|
* Check if a PID file's process is alive.
|
|
26
33
|
*/
|
|
27
|
-
export function isProcessAlive(pidFile: string): {
|
|
28
|
-
alive: boolean;
|
|
29
|
-
pid: number | null;
|
|
30
|
-
} {
|
|
34
|
+
export function isProcessAlive(pidFile: string): ProcessAliveResult {
|
|
31
35
|
if (!existsSync(pidFile)) {
|
|
32
36
|
return { alive: false, pid: null };
|
|
33
37
|
}
|
|
@@ -46,6 +50,91 @@ export function isProcessAlive(pidFile: string): {
|
|
|
46
50
|
}
|
|
47
51
|
}
|
|
48
52
|
|
|
53
|
+
/**
 * Outcome of a combined liveness + health check, discriminated on `alive`:
 * - `alive: false` always pairs with `healthy: false` and `pid: null`;
 * - `alive: true` carries a numeric `pid` and a `healthy` flag that may be
 *   either value.
 */
export type ProcessHealthResult =
  | { alive: false; healthy: false; pid: null }
  | { alive: true; healthy: boolean; pid: number };
|
|
57
|
+
|
|
58
|
+
/**
 * Check if a PID file's process is alive AND responding to HTTP health checks.
 *
 * The PID-file check (`isProcessAlive`) runs first; only when it reports a
 * live process is `httpHealthCheck` invoked with `healthPort` and
 * `timeoutMs`. A process that exists but does not respond (hung, deadlocked,
 * at 100% CPU) yields `alive: true, healthy: false` — callers should kill
 * and restart it.
 *
 * @param pidFile Path of the PID file to inspect.
 * @param healthPort Port forwarded to the HTTP health probe.
 * @param timeoutMs Timeout forwarded to the HTTP health probe (default 3000).
 */
export async function isProcessHealthy(
  pidFile: string,
  healthPort: number,
  timeoutMs: number = 3000,
): Promise<ProcessHealthResult> {
  const liveness = isProcessAlive(pidFile);
  if (!liveness.alive || liveness.pid === null) {
    return { alive: false, healthy: false, pid: null };
  }

  return {
    alive: true,
    healthy: await httpHealthCheck(healthPort, timeoutMs),
    pid: liveness.pid,
  };
}
|
|
78
|
+
|
|
79
|
+
/**
 * Outcome of {@link resolveProcessState}. Callers switch on `status`:
 * - `"needs_start"` — process was dead, hung (and killed), or a stale PID
 *   was cleaned up. Caller should start a fresh process. `pid` is the PID
 *   that was found, or `null` when no live process was found.
 * - `"healthy"` — process is alive and responding; `pid` is the live PID.
 */
export type ProcessState =
  | { status: "needs_start"; pid: number | null }
  | { status: "healthy"; pid: number };
|
|
88
|
+
|
|
89
|
+
/**
 * Determine whether a PID-tracked process is alive and healthy, recovering
 * from a hung process or stale PID file along the way.
 *
 * Flow: health probe → readiness wait → ownership guard → kill → cleanup.
 * If the process exists but is unresponsive, waits up to `readinessWaitMs`
 * (default 60s — matches the spawner's own `waitForDaemonReady` timeout so
 * a concurrent caller never kills a daemon the spawner is still waiting on)
 * for it to finish initializing. If it remains unresponsive, the process is
 * stopped only when `isVellumProcess` confirms it belongs to Vellum; in
 * either case the PID file is removed afterwards.
 *
 * @param pidFile Path of the PID file that tracks the process.
 * @param healthPort Port used for the health probe and the readiness wait.
 * @param label Human-readable process name used in log output.
 * @param readinessWaitMs How long to wait for an alive-but-unhealthy
 *   process to become ready before treating it as hung.
 */
export async function resolveProcessState(
  pidFile: string,
  healthPort: number,
  label: string,
  readinessWaitMs: number = 60_000,
): Promise<ProcessState> {
  const probe = await isProcessHealthy(pidFile, healthPort);
  if (!probe.alive) {
    return { status: "needs_start", pid: null };
  }

  const { pid } = probe;

  // The readiness wait only runs when the first probe failed: the process
  // may simply still be starting up.
  const responsive =
    probe.healthy || (await waitForDaemonReady(healthPort, readinessWaitMs));
  if (responsive) {
    return { status: "healthy", pid };
  }

  // Genuinely hung — kill if it belongs to Vellum, otherwise just clean up.
  const ownedByVellum = isVellumProcess(pid);
  console.log(
    ownedByVellum
      ? `${label} process alive (pid ${pid}) but not responding — killing and restarting...`
      : `Stale PID file (pid ${pid} is not a Vellum process) — cleaning up...`,
  );
  if (ownedByVellum) {
    await stopProcess(pid, label);
  }

  removeFiles(pidFile);
  return { status: "needs_start", pid };
}
|
|
137
|
+
|
|
49
138
|
/**
|
|
50
139
|
* Stop a process by PID: SIGTERM, wait up to `timeoutMs`, then SIGKILL if still alive.
|
|
51
140
|
* Returns true if the process was stopped, false if it wasn't alive.
|
|
@@ -85,6 +174,18 @@ export async function stopProcess(
|
|
|
85
174
|
return true;
|
|
86
175
|
}
|
|
87
176
|
|
|
177
|
+
/**
 * Best-effort removal of one or more files.
 *
 * Each argument may be a single path, an array of paths, or `undefined`;
 * falsy arguments are skipped. Any error raised by `unlinkSync` (a missing
 * file being the usual case) is swallowed so cleanup never throws.
 */
function removeFiles(...files: (string | string[] | undefined)[]): void {
  // Flatten the mixed arguments into one ordered list of paths.
  const targets = files.flatMap<string>((entry) => (entry ? entry : []));

  for (const target of targets) {
    try {
      unlinkSync(target);
    } catch {
      // Deliberately ignored: removal is best-effort.
    }
  }
}
|
|
188
|
+
|
|
88
189
|
/**
|
|
89
190
|
* Stop a process tracked by a PID file, then clean up the file.
|
|
90
191
|
* Returns true if the process was stopped, false if it wasn't alive.
|
|
@@ -92,24 +193,13 @@ export async function stopProcess(
|
|
|
92
193
|
export async function stopProcessByPidFile(
|
|
93
194
|
pidFile: string,
|
|
94
195
|
label: string,
|
|
95
|
-
|
|
196
|
+
extraCleanupFiles?: string[],
|
|
96
197
|
timeoutMs?: number,
|
|
97
198
|
): Promise<boolean> {
|
|
98
199
|
const { alive, pid } = isProcessAlive(pidFile);
|
|
99
200
|
|
|
100
201
|
if (!alive || pid === null) {
|
|
101
|
-
|
|
102
|
-
try {
|
|
103
|
-
unlinkSync(pidFile);
|
|
104
|
-
} catch {}
|
|
105
|
-
}
|
|
106
|
-
if (cleanupFiles) {
|
|
107
|
-
for (const f of cleanupFiles) {
|
|
108
|
-
try {
|
|
109
|
-
unlinkSync(f);
|
|
110
|
-
} catch {}
|
|
111
|
-
}
|
|
112
|
-
}
|
|
202
|
+
removeFiles(pidFile, extraCleanupFiles);
|
|
113
203
|
return false;
|
|
114
204
|
}
|
|
115
205
|
|
|
@@ -120,32 +210,12 @@ export async function stopProcessByPidFile(
|
|
|
120
210
|
console.log(
|
|
121
211
|
`PID ${pid} is not a vellum process — cleaning up stale ${label} PID file.`,
|
|
122
212
|
);
|
|
123
|
-
|
|
124
|
-
unlinkSync(pidFile);
|
|
125
|
-
} catch {}
|
|
126
|
-
if (cleanupFiles) {
|
|
127
|
-
for (const f of cleanupFiles) {
|
|
128
|
-
try {
|
|
129
|
-
unlinkSync(f);
|
|
130
|
-
} catch {}
|
|
131
|
-
}
|
|
132
|
-
}
|
|
213
|
+
removeFiles(pidFile, extraCleanupFiles);
|
|
133
214
|
return false;
|
|
134
215
|
}
|
|
135
216
|
|
|
136
217
|
const stopped = await stopProcess(pid, label, timeoutMs);
|
|
137
|
-
|
|
138
|
-
try {
|
|
139
|
-
unlinkSync(pidFile);
|
|
140
|
-
} catch {}
|
|
141
|
-
if (cleanupFiles) {
|
|
142
|
-
for (const f of cleanupFiles) {
|
|
143
|
-
try {
|
|
144
|
-
unlinkSync(f);
|
|
145
|
-
} catch {}
|
|
146
|
-
}
|
|
147
|
-
}
|
|
148
|
-
|
|
218
|
+
removeFiles(pidFile, extraCleanupFiles);
|
|
149
219
|
return stopped;
|
|
150
220
|
}
|
|
151
221
|
|
package/src/lib/statefulset.ts
CHANGED
|
@@ -257,7 +257,6 @@ export interface BuildServiceRunArgsOpts extends DockerRunSecrets {
|
|
|
257
257
|
instanceName: string;
|
|
258
258
|
res: DockerResourceNames;
|
|
259
259
|
extraAssistantEnv?: Record<string, string>;
|
|
260
|
-
defaultWorkspaceConfigPath?: string;
|
|
261
260
|
/** Avatar device path, if available. Injected by `docker.ts` after resolving. */
|
|
262
261
|
avatarDevicePath?: string;
|
|
263
262
|
}
|
|
@@ -286,7 +285,6 @@ export function buildServiceRunArgs(
|
|
|
286
285
|
instanceName,
|
|
287
286
|
res,
|
|
288
287
|
extraAssistantEnv,
|
|
289
|
-
defaultWorkspaceConfigPath,
|
|
290
288
|
avatarDevicePath,
|
|
291
289
|
} = opts;
|
|
292
290
|
|
|
@@ -355,14 +353,6 @@ export function buildServiceRunArgs(
|
|
|
355
353
|
"-e", `GATEWAY_INTERNAL_URL=http://localhost:${GATEWAY_INTERNAL_PORT}`,
|
|
356
354
|
);
|
|
357
355
|
|
|
358
|
-
if (defaultWorkspaceConfigPath) {
|
|
359
|
-
const cPath = `/tmp/vellum-default-workspace-config-${Date.now()}.json`;
|
|
360
|
-
args.push(
|
|
361
|
-
"-v", `${defaultWorkspaceConfigPath}:${cPath}:ro`,
|
|
362
|
-
"-e", `VELLUM_DEFAULT_WORKSPACE_CONFIG_PATH=${cPath}`,
|
|
363
|
-
);
|
|
364
|
-
}
|
|
365
|
-
|
|
366
356
|
if (extraAssistantEnv) {
|
|
367
357
|
for (const [k, v] of Object.entries(extraAssistantEnv)) {
|
|
368
358
|
args.push("-e", `${k}=${v}`);
|
package/src/lib/step-runner.ts
CHANGED
|
@@ -1,5 +1,38 @@
|
|
|
1
1
|
import { spawn } from "child_process";
|
|
2
2
|
|
|
3
|
+
/**
 * Build the error message for a failed child process. **Never include the
 * argv** — `docker run ...` invocations carry `-e ANTHROPIC_API_KEY=…` /
 * `-e OPENAI_API_KEY=…` style flags, and the resulting `Error.message`
 * propagates all the way to:
 *
 *   - the CLI's top-level catch (`console.error("Error:", err.message)`)
 *     which leaks them onto stderr,
 *   - `subprocess-*.log` files captured by the evals harness when it
 *     spawns `vellum hatch` (which then becomes the inlined log on the
 *     run-detail report page),
 *   - `run.json#error` and the last-N-lines tail in `progress.ndjson`
 *     that the evals harness emits for `SubprocessFailedError`.
 *
 * The diagnostic substring callers actually grep for ("no such container",
 * "is not running", "port is already allocated", …) lives in the child's
 * stderr/stdout, which IS preserved. Only the command name is kept — it's
 * enough to disambiguate which step failed without quoting secrets.
 *
 * Exported so the unit test can assert no `-e KEY=...` slips back in.
 *
 * @param command Name of the executable that failed (never its arguments).
 * @param code Exit code, or `null` when none is available.
 * @param stderr Captured standard error of the child.
 * @param stdout Captured standard output of the child.
 * @returns A header line, followed by the trimmed non-empty stderr and
 *   stdout (in that order), newline-separated.
 */
export function buildExecErrorMessage(
  command: string,
  code: number | null,
  stderr: string,
  stdout: string,
): string {
  const exitDescription = code === null ? "an unknown code" : `code ${code}`;
  const lines = [`${command} exited with ${exitDescription}`];

  for (const stream of [stderr, stdout]) {
    const text = stream.trim();
    if (text) {
      lines.push(text);
    }
  }

  return lines.join("\n");
}
|
|
35
|
+
|
|
3
36
|
export function exec(
|
|
4
37
|
command: string,
|
|
5
38
|
args: string[],
|
|
@@ -25,21 +58,59 @@ export function exec(
|
|
|
25
58
|
if (code === 0) {
|
|
26
59
|
resolve();
|
|
27
60
|
} else {
|
|
28
|
-
|
|
29
|
-
const output = [stderr.trim(), stdout.trim()]
|
|
30
|
-
.filter(Boolean)
|
|
31
|
-
.join("\n");
|
|
32
|
-
reject(new Error(output ? `${msg}\n${output}` : msg));
|
|
61
|
+
reject(new Error(buildExecErrorMessage(command, code, stderr, stdout)));
|
|
33
62
|
}
|
|
34
63
|
});
|
|
35
64
|
child.on("error", reject);
|
|
36
65
|
});
|
|
37
66
|
}
|
|
38
67
|
|
|
39
|
-
|
|
68
|
+
/**
 * Run `command` with `args` and pipe `input` to its stdin. Mirrors `exec` —
 * same no-args-in-error-message contract from `buildExecErrorMessage` — but
 * lets callers stream content (e.g. a small JSON blob) into a child process
 * without having to put the content on the command line where `ps` could
 * read it and where Docker bind-mounts would be involved.
 *
 * @param command Executable to spawn.
 * @param args Arguments passed to the executable (never echoed in errors).
 * @param input Text written to the child's stdin, after which stdin is closed.
 * @param options.cwd Working directory for the child, if any.
 * @returns A promise that resolves when the child exits with code 0 and
 *   rejects with an `Error` built by `buildExecErrorMessage` on any other
 *   exit, or with the underlying error if the process cannot be spawned or
 *   stdin fails for a reason other than the child closing it early.
 */
export function execWithStdin(
  command: string,
  args: string[],
  input: string,
  options: { cwd?: string } = {},
): Promise<void> {
  return new Promise((resolve, reject) => {
    const child = spawn(command, args, {
      cwd: options.cwd,
      stdio: ["pipe", "pipe", "pipe"],
    });

    let stdout = "";
    child.stdout.on("data", (data: Buffer) => {
      stdout += data.toString();
    });

    let stderr = "";
    child.stderr.on("data", (data: Buffer) => {
      stderr += data.toString();
    });

    // A child that exits (or closes its stdin) before draining `input` makes
    // the pending write fail with EPIPE. Without a listener that stream
    // 'error' event is unhandled and takes down the whole CLI. The exit code
    // delivered through `close` is the authoritative outcome in that case,
    // so pipe-closed errors are ignored; anything else is surfaced.
    child.stdin.on("error", (err: NodeJS.ErrnoException) => {
      if (err.code === "EPIPE" || err.code === "ERR_STREAM_DESTROYED") {
        return;
      }
      reject(err);
    });

    child.on("close", (code) => {
      if (code === 0) {
        resolve();
      } else {
        reject(new Error(buildExecErrorMessage(command, code, stderr, stdout)));
      }
    });
    child.on("error", reject);

    child.stdin.end(input);
  });
}
|
|
109
|
+
|
|
110
|
+
export function execOutput(
|
|
111
|
+
command: string,
|
|
112
|
+
args: string[],
|
|
113
|
+
options: { cwd?: string; timeoutMs?: number } = {},
|
|
43
114
|
): Promise<string> {
|
|
44
115
|
return new Promise((resolve, reject) => {
|
|
45
116
|
const child = spawn(command, args, {
|
|
@@ -47,6 +118,21 @@ export function execOutput(
|
|
|
47
118
|
stdio: ["pipe", "pipe", "pipe"],
|
|
48
119
|
});
|
|
49
120
|
|
|
121
|
+
let settled = false;
|
|
122
|
+
let timer: ReturnType<typeof setTimeout> | undefined;
|
|
123
|
+
|
|
124
|
+
if (options.timeoutMs !== undefined) {
|
|
125
|
+
timer = setTimeout(() => {
|
|
126
|
+
if (!settled) {
|
|
127
|
+
settled = true;
|
|
128
|
+
child.kill("SIGTERM");
|
|
129
|
+
reject(
|
|
130
|
+
new Error(`${command} timed out after ${options.timeoutMs}ms`),
|
|
131
|
+
);
|
|
132
|
+
}
|
|
133
|
+
}, options.timeoutMs);
|
|
134
|
+
}
|
|
135
|
+
|
|
50
136
|
let stdout = "";
|
|
51
137
|
child.stdout.on("data", (data: Buffer) => {
|
|
52
138
|
stdout += data.toString();
|
|
@@ -58,13 +144,20 @@ export function execOutput(
|
|
|
58
144
|
});
|
|
59
145
|
|
|
60
146
|
child.on("close", (code) => {
|
|
147
|
+
if (settled) return;
|
|
148
|
+
settled = true;
|
|
149
|
+
if (timer) clearTimeout(timer);
|
|
61
150
|
if (code === 0) {
|
|
62
151
|
resolve(stdout.trim());
|
|
63
152
|
} else {
|
|
64
|
-
|
|
65
|
-
reject(new Error(stderr.trim() ? `${msg}\n${stderr.trim()}` : msg));
|
|
153
|
+
reject(new Error(buildExecErrorMessage(command, code, stderr, "")));
|
|
66
154
|
}
|
|
67
155
|
});
|
|
68
|
-
child.on("error",
|
|
156
|
+
child.on("error", (err) => {
|
|
157
|
+
if (settled) return;
|
|
158
|
+
settled = true;
|
|
159
|
+
if (timer) clearTimeout(timer);
|
|
160
|
+
reject(err);
|
|
161
|
+
});
|
|
69
162
|
});
|
|
70
163
|
}
|