@checkstack/backend-api 0.20.0 → 0.21.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +151 -0
- package/package.json +12 -11
- package/src/auth-strategy.ts +6 -3
- package/src/bearer-token.ts +13 -0
- package/src/collector-strategy.ts +9 -0
- package/src/config-versioning.test.ts +227 -0
- package/src/config-versioning.ts +172 -0
- package/src/core-services.ts +14 -0
- package/src/esm-script-runner.test.ts +55 -16
- package/src/esm-script-runner.ts +212 -55
- package/src/index.ts +3 -0
- package/src/render-templatable-config.test.ts +168 -0
- package/src/render-templatable-config.ts +193 -0
- package/src/schema-utils.ts +3 -0
- package/src/script-sandbox/capabilities.test.ts +122 -0
- package/src/script-sandbox/capabilities.ts +372 -0
- package/src/script-sandbox/capped-output.test.ts +116 -0
- package/src/script-sandbox/capped-output.ts +172 -0
- package/src/script-sandbox/env-guard.test.ts +105 -0
- package/src/script-sandbox/env-guard.ts +129 -0
- package/src/script-sandbox/filesystem.test.ts +437 -0
- package/src/script-sandbox/filesystem.ts +514 -0
- package/src/script-sandbox/forkbomb.it.test.ts +121 -0
- package/src/script-sandbox/global-default.test.ts +161 -0
- package/src/script-sandbox/global-default.ts +100 -0
- package/src/script-sandbox/index.ts +14 -0
- package/src/script-sandbox/network.test.ts +356 -0
- package/src/script-sandbox/network.ts +373 -0
- package/src/script-sandbox/observability.test.ts +210 -0
- package/src/script-sandbox/observability.ts +168 -0
- package/src/script-sandbox/output-truncation.test.ts +53 -0
- package/src/script-sandbox/output-truncation.ts +69 -0
- package/src/script-sandbox/policy.test.ts +189 -0
- package/src/script-sandbox/policy.ts +220 -0
- package/src/script-sandbox/provider.test.ts +61 -0
- package/src/script-sandbox/provider.ts +134 -0
- package/src/script-sandbox/readiness.test.ts +80 -0
- package/src/script-sandbox/readiness.ts +117 -0
- package/src/script-sandbox/report.ts +88 -0
- package/src/script-sandbox/rootless-egress.it.test.ts +86 -0
- package/src/script-sandbox/rootless-egress.test.ts +99 -0
- package/src/script-sandbox/rootless-egress.ts +218 -0
- package/src/script-sandbox/shell-quote.test.ts +32 -0
- package/src/script-sandbox/shell-quote.ts +10 -0
- package/src/script-sandbox/wrapper.test.ts +1194 -0
- package/src/script-sandbox/wrapper.ts +714 -0
- package/src/shell-script-runner.test.ts +243 -0
- package/src/shell-script-runner.ts +210 -45
- package/src/zod-config.test.ts +60 -0
- package/src/zod-config.ts +38 -14
- package/tsconfig.json +3 -0
|
@@ -0,0 +1,372 @@
|
|
|
1
|
+
import { spawnSync } from "node:child_process";
|
|
2
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
3
|
+
import { networkInterfaces } from "node:os";
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Per-process detection of which sandbox primitives the host kernel / process
|
|
7
|
+
* actually supports. Cached lazily on the first call (see {@link detectSandboxCapabilities})
|
|
8
|
+
* so it is computed ONCE per process — a per-run probe would tax every health
|
|
9
|
+
* check.
|
|
10
|
+
*
|
|
11
|
+
* State-and-scale note: this is deterministic per-host capability detection,
|
|
12
|
+
* not shared queryable state. A Linux pod and a macOS satellite can legitimately
|
|
13
|
+
* report different capabilities; that divergence is surfaced per run (see the
|
|
14
|
+
* EffectiveSandbox report), never assumed uniform.
|
|
15
|
+
*/
|
|
16
|
+
export interface SandboxCapabilities {
|
|
17
|
+
platform: "linux" | "darwin" | "other";
|
|
18
|
+
/** Can we drop privilege via setuid? (euid is root) */
|
|
19
|
+
euidIsRoot: boolean;
|
|
20
|
+
/** util-linux `prlimit` is on PATH. */
|
|
21
|
+
hasPrlimit: boolean;
|
|
22
|
+
/**
|
|
23
|
+
* Can we set rlimits without an external wrapper? True when `prlimit` is
|
|
24
|
+
* available (Phase 1 uses `prlimit` as the rlimit mechanism). Native
|
|
25
|
+
* `setrlimit` from inside the spawned child is a later refinement.
|
|
26
|
+
*/
|
|
27
|
+
rlimitNative: boolean;
|
|
28
|
+
/** First namespace-capable wrapper found on PATH (Phase 2+ uses this). */
|
|
29
|
+
wrapper: "bwrap" | "nsjail" | "firejail" | null;
|
|
30
|
+
/**
|
|
31
|
+
* Unprivileged user namespaces are usable (Phase 2+).
|
|
32
|
+
*
|
|
33
|
+
* This is the LIVE verdict: it reflects whether `clone(CLONE_NEWUSER)` (via a
|
|
34
|
+
* real `unshare --user` probe) actually succeeds on THIS host at THIS moment,
|
|
35
|
+
* NOT merely the static `unprivileged_userns_clone` sysctl toggle. The two can
|
|
36
|
+
* disagree: under the default Docker/containerd seccomp profile the sysctl
|
|
37
|
+
* file is absent (so the static toggle reads "available") while the live
|
|
38
|
+
* `clone(CLONE_NEWUSER)` is blocked by seccomp, so bwrap would fail at spawn.
|
|
39
|
+
* Driving this off the live probe closes that truthfulness gap — a layer that
|
|
40
|
+
* needs a user namespace (filesystem confinement, any net namespace) is only
|
|
41
|
+
* reported enforceable when the namespace can genuinely be created. See
|
|
42
|
+
* {@link userNsCreatable}.
|
|
43
|
+
*/
|
|
44
|
+
userNamespaces: boolean;
|
|
45
|
+
/**
|
|
46
|
+
* A network namespace can actually be DELIVERED via the chosen wrapper
|
|
47
|
+
* (Phase 3+). Only `bwrap` / `nsjail` are used to create the per-run net
|
|
48
|
+
* namespace here; `firejail`'s profile model is not used to deliver it, so a
|
|
49
|
+
* firejail-only host reports `false` even though the kernel supports netns.
|
|
50
|
+
* This guards against claiming a capability we cannot actually deliver.
|
|
51
|
+
*
|
|
52
|
+
* TRUTHFULNESS: this is gated on the LIVE `clone(CLONE_NEWUSER | CLONE_NEWNET)`
|
|
53
|
+
* probe ({@link userNsCreatable}), NOT the static sysctl toggle. On a host
|
|
54
|
+
* where the live clone is blocked (default Docker seccomp) this is `false`
|
|
55
|
+
* even though a delivering wrapper is on PATH — so the network layer reports
|
|
56
|
+
* `enforced.network = false` + a downgrade (under fail-closed the run is
|
|
57
|
+
* refused) instead of falsely claiming enforcement while bwrap fails at spawn.
|
|
58
|
+
*
|
|
59
|
+
* IMPORTANT: a fresh net namespace is ROUTELESS — it has loopback only and no
|
|
60
|
+
* path to the host network. This capability therefore only enables `deny`
|
|
61
|
+
* (loopback-only IS the goal). Reaching real destinations under `allowlist`
|
|
62
|
+
* (or applying a meaningful metadata block while keeping egress) additionally
|
|
63
|
+
* requires {@link netEgressIface} — see that field.
|
|
64
|
+
*/
|
|
65
|
+
netNamespaces: boolean;
|
|
66
|
+
/**
|
|
67
|
+
* The LIVE probe verdict: can an unprivileged user+net namespace actually be
|
|
68
|
+
* CREATED on this host right now (`clone(CLONE_NEWUSER | CLONE_NEWNET)` via an
|
|
69
|
+
* `unshare --user --net` child succeeds)? This is the single source of truth
|
|
70
|
+
* for "can we make a namespace", consumed by {@link userNamespaces},
|
|
71
|
+
* {@link netNamespaces}, and {@link netEgressRootless} so they never diverge.
|
|
72
|
+
* Probed ONCE per process (folded into the cached detection); never per run.
|
|
73
|
+
*/
|
|
74
|
+
userNsCreatable: boolean;
|
|
75
|
+
/**
|
|
76
|
+
* The host interface name that can plumb real egress INTO the child's net
|
|
77
|
+
* namespace (via `nsjail --macvlan_iface`), or `null` when egress cannot be
|
|
78
|
+
* plumbed on this host.
|
|
79
|
+
*
|
|
80
|
+
* Plumbing requires ALL of: a filter-capable wrapper (`nsjail`), the
|
|
81
|
+
* privilege to configure interfaces in the new namespace (CAP_NET_ADMIN —
|
|
82
|
+
* approximated here by euid root), and a usable non-loopback host interface.
|
|
83
|
+
* When this is `null`, `allowlist` and the keep-egress-but-block-metadata
|
|
84
|
+
* cases CANNOT be delivered without blackholing all traffic, so they
|
|
85
|
+
* degrade-and-surface to host net rather than engage a routeless namespace.
|
|
86
|
+
*/
|
|
87
|
+
netEgressIface: string | null;
|
|
88
|
+
/**
|
|
89
|
+
* Static addressing for the child's macvlan endpoint, or `null` when it is
|
|
90
|
+
* not configured. A macvlan interface created inside the namespace comes up
|
|
91
|
+
* UNCONFIGURED — without an IP / netmask / default gateway it has no route, so
|
|
92
|
+
* `--macvlan_iface` ALONE still blackholes egress. nsjail addresses it with
|
|
93
|
+
* `--macvlan_vs_ip` / `--macvlan_vs_nm` / `--macvlan_vs_gw`.
|
|
94
|
+
*
|
|
95
|
+
* Reliably deriving a free address + the default gateway from the host is a
|
|
96
|
+
* genuine TOCTOU / collision footgun (the default route is not exposed by
|
|
97
|
+
* `os.networkInterfaces()`, and auto-picking an IP risks a clash), so v1 takes
|
|
98
|
+
* it EXPLICITLY from the operator via `CHECKSTACK_SANDBOX_MACVLAN_IP` /
|
|
99
|
+
* `_NM` / `_GW`. When unset, `allowlist` / metadata-block under a netns
|
|
100
|
+
* degrade-and-surface to host net rather than route into an unaddressed
|
|
101
|
+
* (blackholing) macvlan. This is the documented v1 allowlist-reachability
|
|
102
|
+
* limitation.
|
|
103
|
+
*/
|
|
104
|
+
netEgressAddressing: MacvlanAddressing | null;
|
|
105
|
+
/**
|
|
106
|
+
* A ROOTLESS userspace egress path can be plumbed into the child's net
|
|
107
|
+
* namespace via `slirp4netns`, for hosts that have NO privileged macvlan path
|
|
108
|
+
* (`netEgressIface === null`) but DO have unprivileged user namespaces — the
|
|
109
|
+
* common rootless-container case (rootless Podman/Docker).
|
|
110
|
+
*
|
|
111
|
+
* Unlike macvlan, this needs NO CAP_NET_ADMIN, NO host uplink interface, and
|
|
112
|
+
* NO operator-supplied addressing: `slirp4netns` provides a userspace TCP/IP
|
|
113
|
+
* stack and a `tap0` device inside the child's netns with DETERMINISTIC,
|
|
114
|
+
* built-in static addressing (`10.0.2.0/24`, child `10.0.2.100`, gateway
|
|
115
|
+
* `10.0.2.2`), so there is no TOCTOU/collision footgun to defer to the
|
|
116
|
+
* operator. Egress is NAT'd out through the parent's network namespace.
|
|
117
|
+
*
|
|
118
|
+
* Gated on ALL of: Linux + unprivileged user namespaces usable + a userns
|
|
119
|
+
* netns can actually be created (`clone(CLONE_NEWUSER | CLONE_NEWNET)` works,
|
|
120
|
+
* probed once) + `slirp4netns` present on PATH + a wrapper that can deliver
|
|
121
|
+
* the namespace AND expose the child PID to the parent so `slirp4netns` can
|
|
122
|
+
* attach race-free (only `bwrap`, via `--info-fd`, is used for this here).
|
|
123
|
+
*
|
|
124
|
+
* The nftables allowlist + always-on metadata block are filtered INSIDE this
|
|
125
|
+
* namespace exactly as on the macvlan path. The filter is loaded fail-closed
|
|
126
|
+
* (default-drop egress is installed BEFORE `tap0` comes up), so a slirp4netns
|
|
127
|
+
* that never readies leaves the child with no reachable egress rather than an
|
|
128
|
+
* unfiltered one — egress is never unfiltered while `enforced.network` is true.
|
|
129
|
+
*
|
|
130
|
+
* When `false`, `allowlist` / metadata-block fall back to the macvlan path if
|
|
131
|
+
* available, else degrade-and-surface to host net (never a blackhole).
|
|
132
|
+
*/
|
|
133
|
+
netEgressRootless: boolean;
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
/**
 * Static addressing applied to the child's macvlan egress endpoint when the
 * namespace is delivered by nsjail. Each field maps to one `--macvlan_vs_*`
 * flag.
 */
export interface MacvlanAddressing {
  /** IP assigned to the endpoint inside the child's namespace (`--macvlan_vs_ip`). */
  ip: string;
  /** Netmask applied to the endpoint (`--macvlan_vs_nm`). */
  netmask: string;
  /** Default gateway the endpoint routes through (`--macvlan_vs_gw`). */
  gateway: string;
}
|
|
145
|
+
|
|
146
|
+
/** Names of the env vars that carry the macvlan endpoint addressing (see {@link MacvlanAddressing}). */
export const MACVLAN_IP_ENV = "CHECKSTACK_SANDBOX_MACVLAN_IP";
export const MACVLAN_NM_ENV = "CHECKSTACK_SANDBOX_MACVLAN_NM";
export const MACVLAN_GW_ENV = "CHECKSTACK_SANDBOX_MACVLAN_GW";

/**
 * Build the macvlan addressing from the process environment.
 *
 * Each value is whitespace-trimmed. The result is all-or-nothing: if any of
 * IP / netmask / gateway is unset or trims to the empty string, the whole
 * configuration counts as "not configured" and `null` is returned, so a
 * half-addressed endpoint is never handed to the caller.
 *
 * @returns The trimmed `{ ip, netmask, gateway }` triple, or `null`.
 */
function detectMacvlanAddressing(): MacvlanAddressing | null {
  const readSetting = (name: string): string | undefined =>
    process.env[name]?.trim();
  const isProvided = (setting: string | undefined): setting is string =>
    setting !== undefined && setting !== "";

  const ip = readSetting(MACVLAN_IP_ENV);
  const netmask = readSetting(MACVLAN_NM_ENV);
  const gateway = readSetting(MACVLAN_GW_ENV);

  if (isProvided(ip) && isProvided(netmask) && isProvided(gateway)) {
    return { ip, netmask, gateway };
  }
  return null;
}
|
|
173
|
+
|
|
174
|
+
/**
 * Collapse `process.platform` into the three buckets the sandbox cares about:
 * `"linux"`, `"darwin"`, and `"other"` for everything else.
 */
function detectPlatform(): SandboxCapabilities["platform"] {
  switch (process.platform) {
    case "linux":
      return "linux";
    case "darwin":
      return "darwin";
    default:
      return "other";
  }
}
|
|
179
|
+
|
|
180
|
+
/**
 * Whether the process's EFFECTIVE user id is root (0).
 *
 * Reads `process.geteuid()` — the effective uid is what this capability is
 * named after, and it can differ from the real uid (`process.getuid()`) for a
 * setuid binary or after `seteuid`. If the runtime does not expose `geteuid`,
 * `getuid` is used as the closest available answer.
 *
 * @returns `true` when the uid reads as 0; `false` otherwise, including when
 *   neither accessor exists (e.g. Windows), which is treated as "cannot drop".
 */
function detectEuidIsRoot(): boolean {
  const readUid = process.geteuid ?? process.getuid;
  if (readUid === undefined) return false;
  return readUid.call(process) === 0;
}
|
|
186
|
+
|
|
187
|
+
/**
 * Whether `binary` resolves as a command for the POSIX shell (a single
 * `command -v` lookup).
 *
 * The name is handed to `sh` as the positional parameter `$1` instead of being
 * spliced into the script text, so whitespace or shell metacharacters in the
 * name are looked up literally and can never run as shell syntax (previously
 * a name such as `"missing; true"` reported as present).
 *
 * @param binary - Command name to look up.
 * @returns `true` only when `sh` ran and `command -v` exited with status 0.
 *   On hosts without a usable `sh` the spawn fails and this returns `false`.
 */
function hasBinaryOnPath(binary: string): boolean {
  // argv after the script: "sh" becomes $0, `binary` becomes $1.
  const result = spawnSync("sh", ["-c", 'command -v "$1"', "sh", binary], {
    stdio: "ignore",
  });
  return result.status === 0;
}
|
|
196
|
+
|
|
197
|
+
/**
 * Pick the first namespace-capable wrapper that resolves on PATH, probing in
 * the fixed preference order `bwrap`, `nsjail`, `firejail`. Probing stops at
 * the first hit.
 *
 * @returns The wrapper name, or `null` when none of the three is found.
 */
function detectWrapper(): SandboxCapabilities["wrapper"] {
  const preference = ["bwrap", "nsjail", "firejail"] as const;
  return preference.find((name) => hasBinaryOnPath(name)) ?? null;
}
|
|
205
|
+
|
|
206
|
+
/**
 * LIVE probe: can an unprivileged user+net namespace actually be created on
 * this host right now? Its result feeds `userNamespaces`, `netNamespaces` and
 * `netEgressRootless`.
 *
 * The checks run cheapest-first and any failure yields `false`:
 *
 * 1. Non-Linux platforms are never probed.
 * 2. The static `unprivileged_userns_clone` sysctl
 *    ({@link detectUserNamespacesToggle}) is a pre-gate only: an explicit
 *    "disabled" skips the spawn. It is NOT treated as proof of availability,
 *    because namespace creation can be blocked independently of the toggle
 *    (for example by a restrictive seccomp profile, a
 *    `user.max_user_namespaces=0` sysctl, or an AppArmor userns restriction).
 * 3. `unshare` (util-linux) must be on PATH.
 * 4. `unshare --user --net --map-root-user true` is executed; only a zero
 *    exit status counts as "creatable".
 *
 * The caller caches the outcome, so this spawn is paid once per process
 * rather than per run.
 *
 * @param platform - Platform bucket from the capability detection.
 * @returns `true` only when the live `unshare` probe exits with status 0.
 */
function detectUserNsCreatable(
  platform: SandboxCapabilities["platform"],
): boolean {
  const probeIsPossible =
    platform === "linux" &&
    detectUserNamespacesToggle() &&
    hasBinaryOnPath("unshare");
  if (!probeIsPossible) return false;

  const probe = spawnSync(
    "unshare",
    ["--user", "--net", "--map-root-user", "true"],
    { stdio: "ignore" },
  );
  return probe.status === 0;
}
|
|
250
|
+
|
|
251
|
+
/**
 * Read the `unprivileged_userns_clone` sysctl toggle from `/proc`.
 *
 * This is only a cheap pre-gate for {@link detectUserNsCreatable}:
 *
 * - file missing  -> `true` (undecided; the live probe gives the real verdict)
 * - file reads "0" -> `false` (explicitly disabled)
 * - any other value -> `true`
 * - file exists but cannot be read -> `false`
 */
function detectUserNamespacesToggle(): boolean {
  const togglePath = "/proc/sys/kernel/unprivileged_userns_clone";
  if (!existsSync(togglePath)) return true;

  let raw: string;
  try {
    raw = readFileSync(togglePath, "utf8");
  } catch {
    return false;
  }
  return raw.trim() !== "0";
}
|
|
272
|
+
|
|
273
|
+
/**
 * Choose a host interface that could carry egress into the child's net
 * namespace via macvlan.
 *
 * Only attempted on Linux. The interfaces reported by `os.networkInterfaces()`
 * are scanned in enumeration order and the first one that has at least one
 * non-internal address (of any family) wins.
 *
 * @param platform - Platform bucket from the capability detection.
 * @returns The interface name, or `null` when the platform is not Linux, the
 *   enumeration throws, or every interface is internal-only.
 */
function detectEgressIface(platform: SandboxCapabilities["platform"]): string | null {
  if (platform !== "linux") return null;

  let table: ReturnType<typeof networkInterfaces>;
  try {
    table = networkInterfaces();
  } catch {
    return null;
  }

  // Loopback-only entries are skipped: a real uplink is required.
  const uplink = Object.entries(table).find(
    ([, addresses]) =>
      addresses !== undefined && addresses.some((entry) => !entry.internal),
  );
  return uplink === undefined ? null : uplink[0];
}
|
|
295
|
+
|
|
296
|
+
/**
 * Run every capability probe once and assemble the {@link SandboxCapabilities}
 * record. Derived fields are computed from the probe results so that they
 * cannot disagree with each other.
 */
function computeCapabilities(): SandboxCapabilities {
  const platform = detectPlatform();
  const onLinux = platform === "linux";

  // PATH lookups are only meaningful (and only attempted) on Linux.
  const hasPrlimit = onLinux ? hasBinaryOnPath("prlimit") : false;
  const wrapper = onLinux ? detectWrapper() : null;

  // The live namespace-creation probe is the single source of truth for
  // "can we make a namespace": `userNamespaces`, `netNamespaces` and
  // `netEgressRootless` are all derived from it, so none of them can claim a
  // path the wrapper would fail to take at spawn.
  const userNsCreatable = detectUserNsCreatable(platform);
  const euidIsRoot = detectEuidIsRoot();

  // Only bwrap and nsjail are used to deliver the per-run net namespace.
  // firejail is still reported as the wrapper but never counts here, so a
  // firejail-only host does not claim `netNamespaces`.
  const netNamespaces =
    (wrapper === "bwrap" || wrapper === "nsjail") && userNsCreatable;

  // Macvlan egress needs all of: nsjail, a deliverable net namespace, and
  // root (standing in for CAP_NET_ADMIN). Only then is a host uplink looked up.
  let netEgressIface: string | null = null;
  if (wrapper === "nsjail" && netNamespaces && euidIsRoot) {
    netEgressIface = detectEgressIface(platform);
  }

  // Addressing is only consulted when there is an interface to address; it
  // comes from operator-supplied env vars (see detectMacvlanAddressing).
  const netEgressAddressing =
    netEgressIface === null ? null : detectMacvlanAddressing();

  // Rootless (slirp4netns) egress: requires Linux, bwrap specifically, a
  // namespace that the live probe showed is creatable, and `slirp4netns` on
  // PATH. The PATH lookup runs last so it is skipped when an earlier
  // condition already rules the path out.
  const netEgressRootless =
    onLinux &&
    wrapper === "bwrap" &&
    userNsCreatable &&
    hasBinaryOnPath("slirp4netns");

  return {
    platform,
    euidIsRoot,
    hasPrlimit,
    // `prlimit` is the rlimit mechanism, so the two flags coincide.
    rlimitNative: hasPrlimit,
    wrapper,
    userNamespaces: userNsCreatable,
    netNamespaces,
    userNsCreatable,
    netEgressIface,
    netEgressAddressing,
    netEgressRootless,
  };
}
|
|
354
|
+
|
|
355
|
+
/** Process-wide memo of the detected capabilities; `undefined` until first use. */
let cached: SandboxCapabilities | undefined;

/**
 * Return the host's sandbox capabilities, probing on the first call only.
 * Every later call hands back the same cached object for the lifetime of the
 * process (until the test-only reset below is used).
 */
export function detectSandboxCapabilities(): SandboxCapabilities {
  if (cached === undefined || cached === null) {
    cached = computeCapabilities();
  }
  return cached;
}

/**
 * Test-only hook: drop the memo so the next
 * {@link detectSandboxCapabilities} call probes again. Not part of the
 * production contract.
 */
export function __resetCapabilitiesCacheForTest(): void {
  cached = undefined;
}
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
import { describe, expect, it } from "bun:test";
|
|
2
|
+
import { readCappedOutput } from "./capped-output";
|
|
3
|
+
|
|
4
|
+
const encoder = new TextEncoder();
|
|
5
|
+
|
|
6
|
+
/**
 * Create a pull-driven ReadableStream that hands out `chunks` one per pull and
 * closes once they are used up.
 *
 * `onPull`, when given, is called with the chunk's index just before that
 * chunk is enqueued. Tests use it to count how many chunks the consumer
 * actually requested, i.e. to prove it stopped pulling after hitting a cap.
 */
function streamOf(
  chunks: Uint8Array[],
  onPull?: (index: number) => void,
): ReadableStream<Uint8Array> {
  const pending = chunks.entries();
  return new ReadableStream<Uint8Array>({
    pull(controller) {
      const next = pending.next();
      if (next.done) {
        controller.close();
        return;
      }
      const [index, chunk] = next.value;
      onPull?.(index);
      controller.enqueue(chunk);
    },
  });
}
|
|
28
|
+
|
|
29
|
+
/** Encode `text` as UTF-8 bytes using the module's shared `TextEncoder`. */
function bytes(text: string): Uint8Array {
  const encoded = encoder.encode(text);
  return encoded;
}
|
|
32
|
+
|
|
33
|
+
describe("readCappedOutput", () => {
  /** Combined UTF-8 byte length of the captured stdout and stderr text. */
  const combinedBytes = (captured: { stdout: string; stderr: string }): number =>
    Buffer.byteLength(captured.stdout) + Buffer.byteLength(captured.stderr);

  it("drains both streams fully when uncapped", async () => {
    const captured = await readCappedOutput({
      stdout: streamOf([bytes("hello "), bytes("world")]),
      stderr: streamOf([bytes("err")]),
      maxOutputBytes: undefined,
    });
    expect(captured.stdout).toBe("hello world");
    expect(captured.stderr).toBe("err");
    expect(captured.truncated).toBe(false);
  });

  it("passes through unchanged when under the cap", async () => {
    const captured = await readCappedOutput({
      stdout: streamOf([bytes("hi")]),
      stderr: streamOf([bytes("yo")]),
      maxOutputBytes: 100,
    });
    expect(captured.stdout).toBe("hi");
    expect(captured.stderr).toBe("yo");
    expect(captured.truncated).toBe(false);
  });

  it("caps combined output, flags truncated, and fires onExceeded once", async () => {
    let notifications = 0;
    const captured = await readCappedOutput({
      stdout: streamOf([bytes("a".repeat(80))]),
      stderr: streamOf([bytes("b".repeat(80))]),
      maxOutputBytes: 100,
      onExceeded: () => {
        notifications += 1;
      },
    });
    expect(captured.truncated).toBe(true);
    expect(combinedBytes(captured)).toBeLessThanOrEqual(100);
    // Both streams overflow, yet the callback must be delivered only once.
    expect(notifications).toBe(1);
  });

  it("stops pulling further chunks once the cap is hit (bounded buffering)", async () => {
    // stdout offers ten 100-byte chunks against a 150-byte cap. The reader
    // must give up early instead of pulling (and buffering) all of them.
    const pulled: number[] = [];
    const tenChunks = Array.from({ length: 10 }, () => bytes("x".repeat(100)));
    const captured = await readCappedOutput({
      stdout: streamOf(tenChunks, (index) => {
        pulled.push(index);
      }),
      stderr: streamOf([]),
      maxOutputBytes: 150,
    });
    expect(captured.truncated).toBe(true);
    // Fewer than all ten chunks were requested.
    expect(pulled.length).toBeLessThan(10);
    expect(Buffer.byteLength(captured.stdout)).toBeLessThanOrEqual(150);
  });

  it("does not split a multi-byte code point across the decode boundary", async () => {
    // Five emoji at 4 UTF-8 bytes each (20 bytes total) against a 10-byte cap.
    const captured = await readCappedOutput({
      stdout: streamOf([bytes("😀😀😀😀😀")]),
      stderr: streamOf([]),
      maxOutputBytes: 10,
    });
    expect(captured.truncated).toBe(true);
    // The decoder is non-fatal, so a cut-off trailing code point becomes
    // U+FFFD instead of throwing. What is asserted here is the OOM-safety
    // guarantee: the kept text (replacement characters removed) never exceeds
    // the byte budget. Cosmetic trimming to clean boundaries is the job of
    // truncateCapturedOutput, not of this reader.
    const withoutReplacement = captured.stdout.replace(/�/g, "");
    expect(Buffer.byteLength(bytes(withoutReplacement))).toBeLessThanOrEqual(10);
  });

  it("gives stdout budget priority, then stderr the remainder", async () => {
    const captured = await readCappedOutput({
      stdout: streamOf([bytes("a".repeat(60))]),
      stderr: streamOf([bytes("b".repeat(60))]),
      maxOutputBytes: 100,
    });
    expect(captured.truncated).toBe(true);
    // The streams are read concurrently; the invariant is the combined cap.
    expect(combinedBytes(captured)).toBeLessThanOrEqual(100);
    expect(Buffer.byteLength(captured.stdout)).toBeGreaterThan(0);
  });
});
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Bounded-buffering capture of a child's stdout + stderr.
|
|
3
|
+
*
|
|
4
|
+
* The naive `new Response(stream).text()` buffers the ENTIRE stream before any
|
|
5
|
+
* truncation runs, so on a degraded host without the `RLIMIT_AS` cap a script
|
|
6
|
+
* emitting gigabytes OOMs the pod before {@link truncateCapturedOutput} ever
|
|
7
|
+
* sees the data (plan §5.1). This reader instead counts bytes as they arrive
|
|
8
|
+
* off BOTH streams against a single shared budget, stops buffering once the
|
|
9
|
+
* budget is exhausted, invokes `onExceeded` (so the caller can kill the child),
|
|
10
|
+
* and flags `truncated`. Buffering is therefore bounded to roughly
|
|
11
|
+
* `maxOutputBytes` plus one in-flight chunk per stream.
|
|
12
|
+
*
|
|
13
|
+
* It is pure JS reading `ReadableStream<Uint8Array>` handles, so it works
|
|
14
|
+
* identically on every platform — the portable resource-cap fallback.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
/** Outcome of a bounded capture of a child's stdout and stderr. */
export interface CappedOutputResult {
  /** Decoded text of the stdout bytes that were kept. */
  stdout: string;
  /** Decoded text of the stderr bytes that were kept. */
  stderr: string;
  /** True when the combined output reached the cap and capture was stopped. */
  truncated: boolean;
}
|
|
23
|
+
|
|
24
|
+
/**
 * A byte allowance shared by several consumers. Both output streams draw from
 * one instance, so it is the COMBINED amount kept that is bounded — the same
 * combined-budget contract as {@link truncateCapturedOutput}.
 */
class SharedByteBudget {
  /** Bytes still available to hand out. */
  private left: number;
  /** Latched once any request could not be granted in full. */
  private overflowed = false;

  constructor(private readonly limit: number) {
    this.left = limit;
  }

  /**
   * Request `wanted` bytes from the allowance.
   *
   * @returns How many of those bytes the caller may keep: all of them while
   *   they fit, the remaining allowance when the request is larger than what
   *   is left (which also empties the budget), and 0 once nothing is left.
   *   Any non-empty request that is not granted in full latches the
   *   "exceeded" flag.
   */
  take(wanted: number): number {
    const available = this.left;
    if (available <= 0) {
      this.overflowed = this.overflowed || wanted > 0;
      return 0;
    }
    if (wanted > available) {
      this.overflowed = true;
      this.left = 0;
      return available;
    }
    this.left = available - wanted;
    return wanted;
  }

  /** True once some request has been refused or only partially granted. */
  get isExceeded(): boolean {
    return this.overflowed;
  }

  /** True when the budget was created with an infinite limit. */
  get isUncapped(): boolean {
    return this.limit === Number.POSITIVE_INFINITY;
  }
}
|
|
65
|
+
|
|
66
|
+
/**
 * Read one stream to its end (or to the cap) and return the raw bytes that
 * were kept.
 *
 * Bytes are kept undecoded and merged once at the end; text decoding is left
 * to the caller so that a multi-byte sequence spanning two chunks is decoded
 * as a whole.
 *
 * Per chunk:
 * - empty chunks are ignored;
 * - with an uncapped budget the chunk is kept as-is;
 * - otherwise the budget decides how many leading bytes may be kept.
 *
 * As soon as the shared budget reports "exceeded" (by this stream or by the
 * other one sharing it), `onCap` is called and the reader is cancelled so the
 * producer is released rather than left blocked on a full pipe. A failing
 * cancel is ignored, since the stream may already be closed.
 *
 * Read errors are treated as end-of-stream: whatever was captured so far is
 * returned. The reader lock is always released.
 *
 * @returns The kept bytes. When exactly one piece was kept it is returned
 *   directly without copying.
 */
async function drainStream({
  stream,
  budget,
  onCap,
}: {
  stream: ReadableStream<Uint8Array>;
  budget: SharedByteBudget;
  onCap: () => void;
}): Promise<Uint8Array> {
  const source = stream.getReader();
  const pieces: Uint8Array[] = [];
  let total = 0;
  const keep = (piece: Uint8Array): void => {
    pieces.push(piece);
    total += piece.length;
  };

  try {
    for (
      let step = await source.read();
      !step.done;
      step = await source.read()
    ) {
      const chunk = step.value;
      if (chunk === undefined || chunk.length === 0) continue;

      if (budget.isUncapped) {
        keep(chunk);
        continue;
      }

      const allowed = budget.take(chunk.length);
      if (allowed > 0) {
        keep(allowed === chunk.length ? chunk : chunk.subarray(0, allowed));
      }
      if (!budget.isExceeded) continue;

      onCap();
      // Stop buffering and release the producer.
      await source.cancel().catch(() => {
        // Best-effort: the stream may already be closed by the kill.
      });
      break;
    }
  } catch {
    // A cancel issued while a read is pending (the kill path) rejects that
    // read; treat it like end-of-stream and keep what was captured.
  } finally {
    source.releaseLock();
  }

  const [only] = pieces;
  if (pieces.length === 1 && only !== undefined) return only;

  const joined = new Uint8Array(total);
  pieces.reduce((writeAt, piece) => {
    joined.set(piece, writeAt);
    return writeAt + piece.length;
  }, 0);
  return joined;
}
|
|
130
|
+
|
|
131
|
+
/**
 * Capture a child's stdout and stderr concurrently while bounding how much is
 * buffered.
 *
 * - `maxOutputBytes === undefined`: no cap; both streams are drained in full
 *   (the back-compat / opted-out path).
 * - otherwise: the two streams share one byte budget, so the COMBINED kept
 *   bytes never exceed `maxOutputBytes`.
 *
 * When the budget overflows, `onExceeded` is called at most once (no matter
 * which stream, or both, hit the cap) and the result is flagged `truncated`
 * so the runner can mark and downgrade the run.
 *
 * @returns The decoded stdout / stderr text that was kept and the truncation
 *   flag.
 */
export async function readCappedOutput({
  stdout,
  stderr,
  maxOutputBytes,
  onExceeded,
}: {
  stdout: ReadableStream<Uint8Array>;
  stderr: ReadableStream<Uint8Array>;
  maxOutputBytes: number | undefined;
  /** Called once (at most) when the combined output first exceeds the cap. */
  onExceeded?: () => void;
}): Promise<CappedOutputResult> {
  const budget = new SharedByteBudget(
    maxOutputBytes !== undefined ? maxOutputBytes : Number.POSITIVE_INFINITY,
  );

  // Both drains report the cap; only the first report reaches the caller.
  let notified = false;
  const onCap = (): void => {
    if (!notified) {
      notified = true;
      onExceeded?.();
    }
  };

  const drain = (stream: ReadableStream<Uint8Array>): Promise<Uint8Array> =>
    drainStream({ stream, budget, onCap });
  const [stdoutBytes, stderrBytes] = await Promise.all([
    drain(stdout),
    drain(stderr),
  ]);

  // Each stream is decoded in one shot with its own decoder.
  const toText = (raw: Uint8Array): string => new TextDecoder().decode(raw);
  return {
    stdout: toText(stdoutBytes),
    stderr: toText(stderrBytes),
    truncated: budget.isExceeded,
  };
}
|