@checkstack/backend-api 0.20.0 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. package/CHANGELOG.md +151 -0
  2. package/package.json +12 -11
  3. package/src/auth-strategy.ts +6 -3
  4. package/src/bearer-token.ts +13 -0
  5. package/src/collector-strategy.ts +9 -0
  6. package/src/config-versioning.test.ts +227 -0
  7. package/src/config-versioning.ts +172 -0
  8. package/src/core-services.ts +14 -0
  9. package/src/esm-script-runner.test.ts +55 -16
  10. package/src/esm-script-runner.ts +212 -55
  11. package/src/index.ts +3 -0
  12. package/src/render-templatable-config.test.ts +168 -0
  13. package/src/render-templatable-config.ts +193 -0
  14. package/src/schema-utils.ts +3 -0
  15. package/src/script-sandbox/capabilities.test.ts +122 -0
  16. package/src/script-sandbox/capabilities.ts +372 -0
  17. package/src/script-sandbox/capped-output.test.ts +116 -0
  18. package/src/script-sandbox/capped-output.ts +172 -0
  19. package/src/script-sandbox/env-guard.test.ts +105 -0
  20. package/src/script-sandbox/env-guard.ts +129 -0
  21. package/src/script-sandbox/filesystem.test.ts +437 -0
  22. package/src/script-sandbox/filesystem.ts +514 -0
  23. package/src/script-sandbox/forkbomb.it.test.ts +121 -0
  24. package/src/script-sandbox/global-default.test.ts +161 -0
  25. package/src/script-sandbox/global-default.ts +100 -0
  26. package/src/script-sandbox/index.ts +14 -0
  27. package/src/script-sandbox/network.test.ts +356 -0
  28. package/src/script-sandbox/network.ts +373 -0
  29. package/src/script-sandbox/observability.test.ts +210 -0
  30. package/src/script-sandbox/observability.ts +168 -0
  31. package/src/script-sandbox/output-truncation.test.ts +53 -0
  32. package/src/script-sandbox/output-truncation.ts +69 -0
  33. package/src/script-sandbox/policy.test.ts +189 -0
  34. package/src/script-sandbox/policy.ts +220 -0
  35. package/src/script-sandbox/provider.test.ts +61 -0
  36. package/src/script-sandbox/provider.ts +134 -0
  37. package/src/script-sandbox/readiness.test.ts +80 -0
  38. package/src/script-sandbox/readiness.ts +117 -0
  39. package/src/script-sandbox/report.ts +88 -0
  40. package/src/script-sandbox/rootless-egress.it.test.ts +86 -0
  41. package/src/script-sandbox/rootless-egress.test.ts +99 -0
  42. package/src/script-sandbox/rootless-egress.ts +218 -0
  43. package/src/script-sandbox/shell-quote.test.ts +32 -0
  44. package/src/script-sandbox/shell-quote.ts +10 -0
  45. package/src/script-sandbox/wrapper.test.ts +1194 -0
  46. package/src/script-sandbox/wrapper.ts +714 -0
  47. package/src/shell-script-runner.test.ts +243 -0
  48. package/src/shell-script-runner.ts +210 -45
  49. package/src/zod-config.test.ts +60 -0
  50. package/src/zod-config.ts +38 -14
  51. package/tsconfig.json +3 -0
@@ -0,0 +1,714 @@
1
+ import type { SandboxCapabilities } from "./capabilities";
2
+ import { buildSubprocessEnv } from "./env-guard";
3
+ import {
4
+ buildFilesystemLayer,
5
+ type FilesystemRunInputs,
6
+ } from "./filesystem";
7
+ import { buildNetworkLayer, type NetworkDecision } from "./network";
8
+ import type { SandboxPolicy } from "./policy";
9
+ import { buildRootlessLauncherScript } from "./rootless-egress";
10
+ import {
11
+ type EffectiveSandbox,
12
+ type EnforcedLayers,
13
+ type SandboxDowngrade,
14
+ type SandboxNote,
15
+ SandboxUnavailableError,
16
+ } from "./report";
17
+
18
+ /**
19
+ * Translates a validated {@link SandboxPolicy} + detected
20
+ * {@link SandboxCapabilities} into the concrete extra `Bun.spawn` options the
21
+ * two runners apply, plus an {@link EffectiveSandbox} report of what was
22
+ * actually enforced vs. degraded.
23
+ *
24
+ * Enforced so far:
25
+ * - resource caps via a `prlimit` argv prelude (Linux + util-linux), with
26
+ * `maxOutputBytes` surfaced for runner-side truncation (portable);
27
+ * - privilege drop via `uid`/`gid` (when euid is root);
28
+ * - the env denylist (via {@link buildSubprocessEnv});
29
+ * - filesystem isolation via a namespace wrapper (`bwrap`/`nsjail`) when one
30
+ * is present and the runner supplies a scratch dir (Phase 2, see
31
+ * {@link buildFilesystemLayer}); otherwise the layer degrades (or fails per
32
+ * `onUnavailable`) and the gap is surfaced;
33
+ * - network egress control (Phase 3): a fresh net namespace for `deny` /
34
+ * `allowlist` (and the always-on metadata/link-local block under
35
+ * `unrestricted`), COMPOSED into the SAME wrapper invocation as the FS layer
36
+ * so the two never fight over the net namespace — when network confinement
37
+ * is on the wrapper takes a fresh net namespace instead of `--share-net`,
38
+ * and for `allowlist` an nftables egress filter is installed (nsjail).
39
+ */
40
+
/**
 * Inputs to {@link buildSpawnHardening}: the requested policy, the detected
 * host capabilities, the env to forward, and the optional per-run staging
 * paths a runner is able to supply.
 */
export interface BuildSpawnHardeningInput {
  /** The validated sandbox policy requested for this run. */
  policy: SandboxPolicy;
  /** The detected sandbox capabilities of the host the run executes on. */
  caps: SandboxCapabilities;
  /**
   * Base env (typically `pickSafeEnv()`); the curated safe vars that are
   * always forwarded.
   */
  baseEnv: Record<string, string>;
  /** Caller-supplied env overrides (secret env, scope env, operator env). */
  envOverrides?: Record<string, string>;
  /**
   * Per-run filesystem inputs (the ESM runner's scratch dir + the reconciled
   * node_modules tree + the interpreter path to bind). Omit for runners with no
   * per-run scratch dir (e.g. the shell runner): the filesystem layer then
   * degrades when requested, but a network-only namespace is still built.
   */
  filesystem?: FilesystemRunInputs;
  /**
   * Path at which the runner WILL write the nftables egress ruleset (for the
   * network allowlist / metadata-block filter installed via
   * `nsjail --nftables_file`). The runner reads back {@link SpawnHardening.nftRuleset}
   * and writes it here before spawn. Omit on runners that cannot stage a file
   * (the network filter then degrades).
   */
  nftRulesetPath?: string;
  /**
   * Path at which the runner WILL write the ROOTLESS egress launcher script
   * (see {@link SpawnHardening.rootlessLauncher}). When the resolved network
   * decision picks the rootless slirp4netns path, the prelude becomes
   * `["sh", rootlessLauncherPath]`; the runner stages the script here (and the
   * nft ruleset at {@link nftRulesetPath}) before spawn. Omit on runners that
   * cannot stage a file — the rootless path then degrades-and-surfaces (host
   * net), never a blackhole.
   */
  rootlessLauncherPath?: string;
  /**
   * Whether THIS runner actually applies the JS-heap memory cap
   * (`NODE_OPTIONS=--max-old-space-size`) to the spawned child. The ESM runner
   * execs a Node/Bun interpreter that honours it, so it passes `true`. The
   * SHELL runner execs `sh -c`, which ignores `NODE_OPTIONS`, so it passes
   * `false` (or omits it): for shell scripts there is NO per-run memory
   * enforcement and the ceiling is purely the container cgroup. When a
   * `memoryBytes` cap is requested but this runner cannot apply the heap cap,
   * the gap is surfaced as a NON-FATAL note (never a downgrade, so it never
   * fail-closes) — see {@link SandboxNote}. Defaults to `false` (the
   * conservative, honest assumption: do not imply a guarantee a runner may not
   * provide).
   */
  appliesNodeMemoryCap?: boolean;
}
91
+
/**
 * Result of {@link buildSpawnHardening}: everything a runner needs to spawn the
 * child under the resolved sandbox, plus the report of what was enforced.
 */
export interface SpawnHardening {
  /**
   * Wrap the real command argv with any rlimit prelude. When the sandbox is
   * disabled the argv is returned unchanged.
   */
  wrapCmd(cmd: string[]): string[];
  /**
   * Final subprocess env (safe base + overrides). The denylist is applied
   * only when the sandbox is enabled.
   */
  env: Record<string, string>;
  /**
   * Env keys dropped by the denylist (for surfacing). Empty when the sandbox
   * is disabled, because no denylist is applied then.
   */
  droppedEnvKeys: string[];
  /**
   * The privilege-drop UID/GID, for OBSERVABILITY only. These are NEVER passed
   * to `Bun.spawn` (its uid/gid is a silent no-op today, and a forward-compat
   * hazard if Bun starts honouring it: it would spawn the namespace wrapper
   * ITSELF as the dropped id and break userns creation). The actual drop is
   * delivered by the wrapper's `--uid`/`--gid` (root supervisor) or by
   * inheritance from a non-root supervisor. Present only on the genuine
   * root-supervisor wrapper-drop path; undefined otherwise.
   */
  uid?: number;
  /** Companion GID to {@link uid}; same observability-only contract. */
  gid?: number;
  /**
   * Hard cap on captured stdout+stderr bytes, or undefined for no cap. The
   * runner enforces this purely in JS (portable across platforms).
   */
  maxOutputBytes?: number;
  /** Extra env to merge for the ESM runner's portable memory fallback. */
  nodeMemoryFlagEnv?: Record<string, string>;
  /**
   * The nftables egress ruleset the runner must write to `nftRulesetPath`
   * before spawn (consumed by `nsjail --nftables_file`). Undefined when no
   * filter is needed (FS-only, host net, or pure `deny`).
   */
  nftRuleset?: string;
  /**
   * The ROOTLESS egress launcher script the runner must write to
   * `rootlessLauncherPath` before spawn (then the prelude execs it as
   * `sh <path>`). Undefined unless the network decision picked the rootless
   * slirp4netns path AND it could be staged. The launcher orchestrates
   * slirp4netns + the fail-closed nftables filter; see `rootless-egress.ts`.
   */
  rootlessLauncher?: string;
  /** What was actually enforced / degraded. */
  effective: EffectiveSandbox;
}
134
+
/**
 * Build the `prlimit` argv prelude for the requested resource caps.
 *
 * `nprocIsolated` controls whether `--nproc` (RLIMIT_NPROC) is emitted, the
 * fork-bomb cap. RLIMIT_NPROC is enforced PER (UID, user-namespace): the kernel
 * counts a process against its real UID *within its user namespace*. So the cap
 * genuinely isolates a single run's process count from the supervisor (and from
 * sibling runs) IN EITHER of two ways:
 *
 *  1. The run executes inside a WRAPPER-CREATED USER NAMESPACE (the shipped
 *     non-root model: rootless `bwrap --unshare-all`, which always creates a
 *     fresh user namespace). Even though the child shares the supervisor's host
 *     UID (65532), the fresh user namespace means its RLIMIT_NPROC count starts
 *     from zero relative to that namespace, so the bomb is capped WITHOUT
 *     throttling the supervisor or sibling runs in other namespaces. (Verified
 *     in-container: a fork bomb under `bwrap --unshare-all` + `prlimit
 *     --nproc=N` is capped at N while the supervisor — same UID 65532, parent
 *     namespace — keeps forking freely.) The fresh PID namespace that
 *     `--unshare-all` also creates means a single kill of the wrapper reaps the
 *     whole tree.
 *  2. The run dropped to a DEDICATED low-priv UID via the namespace wrapper's
 *     `--uid` (the legacy ROOT-supervisor path): a distinct UID is isolated by
 *     construction.
 *
 * It is NOT safe (and so omitted) when the run executes WITHOUT a wrapper
 * namespace under the supervisor's own UID — there the cap would count EVERY
 * process owned by that UID, including the supervisor and sibling runs, so a low
 * `maxProcesses` could starve the host of fork capacity. All other rlimits
 * (`--cpu`, `--nofile`, `--fsize`) are per-process and always safe.
 *
 * @param policy - The sandbox policy; only `policy.resources` is read.
 * @param nprocIsolated - Whether `--nproc` may be emitted for this run.
 * @returns `["prlimit", ...flags, "--"]` in the fixed order `--cpu`,
 *   `--nofile`, `--nproc`, `--fsize`, or `[]` when no flag applies (so the
 *   caller can spread it in front of the real argv unconditionally).
 */
function buildPrlimitPrelude({
  policy,
  nprocIsolated,
}: {
  policy: SandboxPolicy;
  nprocIsolated: boolean;
}): string[] {
  const { resources } = policy;
  const args: string[] = [];
  if (resources.cpuSeconds !== undefined) {
    args.push(`--cpu=${resources.cpuSeconds}`);
  }
  // NOTE: `memoryBytes` is deliberately NOT mapped to `prlimit --as`
  // (RLIMIT_ADDRESS_SPACE). RLIMIT_AS caps the VIRTUAL address space, not the
  // resident set, and modern runtimes (Bun, Node, the JVM) RESERVE tens of GiB
  // of virtual space at startup, so a `--as` equal to the intended RSS makes the
  // interpreter SIGABRT immediately (verified: `bun` aborts under `--as=512MB`).
  // RLIMIT_DATA has the same problem; RLIMIT_RSS is unenforced on current
  // kernels. The correct hard memory cap is a CGROUP limit (Docker `--memory` /
  // a Kubernetes resources.limits.memory), which the deployment supplies.
  // Memory is therefore enforced via (a) the ESM heap cap
  // `NODE_OPTIONS=--max-old-space-size` (a real JS-heap limit; it is prepared
  // by `buildSpawnHardening` whenever `memoryBytes` is requested, not by this
  // function) and (b) the cgroup limit from the runtime - never a broken `--as`.
  if (resources.maxOpenFiles !== undefined) {
    args.push(`--nofile=${resources.maxOpenFiles}`);
  }
  // `--nproc` is the only flag gated on isolation: see the doc comment above.
  if (resources.maxProcesses !== undefined && nprocIsolated) {
    args.push(`--nproc=${resources.maxProcesses}`);
  }
  if (resources.maxFileSizeBytes !== undefined) {
    args.push(`--fsize=${resources.maxFileSizeBytes}`);
  }
  // No applicable flag: return an empty prelude rather than a bare
  // `prlimit --`, so the command runs without the extra exec hop.
  if (args.length === 0) {
    return [];
  }
  return ["prlimit", ...args, "--"];
}
202
+
/**
 * True when the resource policy requests any of the per-run resource caps
 * tracked by the rlimit branch of the resources layer: `cpuSeconds`,
 * `memoryBytes`, `maxOpenFiles`, `maxProcesses` or `maxFileSizeBytes`.
 *
 * Note that `memoryBytes` is counted here even though `buildPrlimitPrelude`
 * deliberately emits no `prlimit` flag for it. `maxOutputBytes` is NOT counted:
 * it is enforced in JS by the runner and handled separately by the caller.
 *
 * @param policy - The sandbox policy; only `policy.resources` is read.
 * @returns Whether at least one of the listed caps is set (not `undefined`).
 */
function requestsRlimitCaps(policy: SandboxPolicy): boolean {
  const { resources } = policy;
  const trackedCaps = [
    resources.cpuSeconds,
    resources.memoryBytes,
    resources.maxOpenFiles,
    resources.maxProcesses,
    resources.maxFileSizeBytes,
  ];
  // A cap counts as requested as soon as it is defined; 0 is a real value.
  return trackedCaps.some((cap) => cap !== undefined);
}
214
+
215
+ /**
216
+ * Build the OS-level hardening for a single run. Pure & synchronous
217
+ * (capability detection is cached upstream), so neither runner gains an
218
+ * `await` before spawn.
219
+ *
220
+ * @throws {SandboxUnavailableError} when a requested layer cannot be enforced
221
+ * and `policy.onUnavailable === "fail"`. The runner catches this and returns
222
+ * a clean failure WITHOUT spawning an unsandboxed child.
223
+ */
224
+ export function buildSpawnHardening({
225
+ policy,
226
+ caps,
227
+ baseEnv,
228
+ envOverrides,
229
+ filesystem,
230
+ nftRulesetPath,
231
+ rootlessLauncherPath,
232
+ appliesNodeMemoryCap,
233
+ }: BuildSpawnHardeningInput): SpawnHardening {
234
+ const downgrades: SandboxDowngrade[] = [];
235
+ const notes: SandboxNote[] = [];
236
+ const enforced: EnforcedLayers = {
237
+ resources: false,
238
+ filesystem: false,
239
+ network: false,
240
+ privilege: false,
241
+ };
242
+
243
+ // When the sandbox is disabled, behave exactly as before: full safe-env
244
+ // merge with NO denylist, no caps, no uid/gid, no truncation.
245
+ if (!policy.enabled) {
246
+ const { env } = buildSubprocessEnv({
247
+ base: baseEnv,
248
+ overrides: envOverrides,
249
+ applyDenylist: false,
250
+ });
251
+ return {
252
+ wrapCmd: (cmd) => cmd,
253
+ env,
254
+ droppedEnvKeys: [],
255
+ effective: {
256
+ requested: policy,
257
+ enforced,
258
+ downgrades: [],
259
+ notes: [],
260
+ platform: caps.platform,
261
+ },
262
+ };
263
+ }
264
+
265
+ // --- Env (denylist applied when enabled) -------------------------------
266
+ const { env, dropped: droppedEnvKeys } = buildSubprocessEnv({
267
+ base: baseEnv,
268
+ overrides: envOverrides,
269
+ applyDenylist: true,
270
+ });
271
+
272
+ // --- Layer: privilege (resolved FIRST) ---------------------------------
273
+ // Decided before resources because RLIMIT_NPROC is per-UID and is only safe
274
+ // when the child is genuinely a low-priv id (see buildPrlimitPrelude).
275
+ //
276
+ // TWO supervisor models, both supported:
277
+ //
278
+ // 1. NON-ROOT supervisor (the SHIPPED images: uid 65532). The child INHERITS
279
+ // the supervisor's non-root uid by construction, so it can NEVER be
280
+ // host-root - the "script is not root" requirement is satisfied by
281
+ // inheritance, regardless of whether a wrapper is engaged. A `drop-to-uid`
282
+ // to a DIFFERENT id is generally impossible rootless (no subuid/newuidmap)
283
+ // and is NOT needed, so we do NOT carry a wrapper `--uid` flag here. Under
284
+ // rootless bwrap (`--unshare-user`) in-namespace root maps back to this
285
+ // same unprivileged host uid, so even mapped-root cannot escape to host
286
+ // root. `enforced.privilege` is therefore TRUE.
287
+ //
288
+ // 2. ROOT supervisor (legacy / some deployments: euid 0). The drop MUST be
289
+ // performed by the NAMESPACE WRAPPER (`bwrap --uid` / `nsjail --user`),
290
+ // NOT by `Bun.spawn`'s uid/gid (Bun silently ignores those - the child
291
+ // stays root). So a `drop-to-uid` only actually drops when the wrapper is
292
+ // engaged; the `enforced.privilege` verdict is deferred until after the FS
293
+ // layer resolves (see `privilegeDropWanted` + the verdict block below).
294
+ //
295
+ // INVARIANT: `enforced.privilege` is true ONLY when the child cannot run as
296
+ // host-root. It is NEVER true on a path where the script could be host-root
297
+ // (root supervisor + no wrapper carrying the drop).
298
+ let uid: number | undefined;
299
+ let gid: number | undefined;
300
+ // Whether the ROOT-supervisor wrapper drop is wanted (only meaningful when
301
+ // euid is root). For a non-root supervisor this stays false: there is nothing
302
+ // to drop and nothing for the wrapper to carry.
303
+ let privilegeDropWanted = false;
304
+ if (policy.privilege.mode === "drop-to-uid") {
305
+ if (caps.euidIsRoot) {
306
+ if (policy.privilege.uid === undefined) {
307
+ downgrades.push({
308
+ layer: "privilege",
309
+ reason:
310
+ "drop-to-uid requested but no target uid configured and the supervisor euid is root: the child would run as host-root",
311
+ });
312
+ } else {
313
+ // Root supervisor: the wrapper must carry the `--uid` drop. Verdict gated
314
+ // below on the wrapper actually being engaged.
315
+ uid = policy.privilege.uid;
316
+ gid = policy.privilege.gid;
317
+ privilegeDropWanted = true;
318
+ }
319
+ } else {
320
+ // Non-root supervisor: the child inherits non-root and cannot be
321
+ // host-root. Enforced by construction; no wrapper `--uid`, no spawn
322
+ // uid/gid, no downgrade. (A configured target uid is irrelevant rootless;
323
+ // we never attempt a different-id map.)
324
+ enforced.privilege = true;
325
+ }
326
+ } else if (caps.euidIsRoot) {
327
+ // inherit under a ROOT supervisor leaves the child as host-root, which is
328
+ // NOT a privilege-enforced state.
329
+ downgrades.push({
330
+ layer: "privilege",
331
+ reason:
332
+ "privilege inherit under a ROOT supervisor leaves the child running as host-root; run the supervisor as a non-root uid, or use drop-to-uid with a wrapper engaged",
333
+ });
334
+ } else {
335
+ // inherit under a NON-root supervisor: the child inherits non-root and
336
+ // cannot be host-root. Enforced by construction.
337
+ enforced.privilege = true;
338
+ }
339
+ // RLIMIT_NPROC is per-UID, so it is only safe when the child runs under a uid
340
+ // ISOLATED from the host supervisor's uid. That holds ONLY on the
341
+ // root-supervisor path where the wrapper maps the child to a DEDICATED
342
+ // low-priv id via `--uid` (recomputed in the verdict block). It does NOT hold
343
+ // for the non-root supervisor: there the child INHERITS the supervisor's own
344
+ // uid (65532), so an `--nproc` cap would also throttle the supervisor and its
345
+ // sibling runs - the exact starvation the guard prevents. So this stays false
346
+ // here and is only flipped true on the root-wrapper-drop path below.
347
+ let dropsPrivilege = false;
348
+
349
+ // --- Layer: resources --------------------------------------------------
350
+ // The `prlimit` prelude itself is built AFTER the FS layer below, because
351
+ // RLIMIT_NPROC may only be applied once the privilege drop is genuinely in
352
+ // effect, and the drop is delivered by the wrapper (resolved below). Here we
353
+ // only decide `enforced.resources` and whether prlimit is available.
354
+ let rlimitPrelude: string[] = [];
355
+ const wantsRlimit = requestsRlimitCaps(policy);
356
+ if (wantsRlimit) {
357
+ if (caps.rlimitNative) {
358
+ enforced.resources = true;
359
+ } else {
360
+ downgrades.push({
361
+ layer: "resources",
362
+ reason:
363
+ "rlimit caps not enforceable (no prlimit on this host); " +
364
+ "falling back to wall-clock timeout + output truncation" +
365
+ (caps.platform === "linux" ? "" : ` (platform=${caps.platform})`),
366
+ });
367
+ // maxOutputBytes (below) is still enforced — that is the portable subset.
368
+ }
369
+ } else if (policy.resources.maxOutputBytes !== undefined) {
370
+ // No rlimit caps requested but an output cap is — that is fully portable.
371
+ enforced.resources = true;
372
+ }
373
+
374
+ // --- ESM memory cap (the JS-heap limit) --------------------------------
375
+ // Memory is enforced via the JS heap cap `NODE_OPTIONS=--max-old-space-size`
376
+ // (a REAL heap limit the runtime honours) rather than `prlimit --as` (which
377
+ // breaks the interpreter - see buildPrlimitPrelude). Applied whenever a memory
378
+ // cap is requested, on every host (it is portable and does not depend on any
379
+ // namespace primitive). The shell runner ignores this env; for shell scripts
380
+ // the hard memory ceiling is the cgroup limit from the container runtime.
381
+ let nodeMemoryFlagEnv: Record<string, string> | undefined;
382
+ if (policy.resources.memoryBytes !== undefined) {
383
+ const megabytes = Math.max(
384
+ 16,
385
+ Math.floor(policy.resources.memoryBytes / (1024 * 1024)),
386
+ );
387
+ // Intentionally NOT subject to the env denylist (we set it ourselves; it
388
+ // is a controlled cap, not a caller-supplied override).
389
+ nodeMemoryFlagEnv = {
390
+ NODE_OPTIONS: `--max-old-space-size=${megabytes}`,
391
+ };
392
+ // SHELL MEMORY HONESTY (review MAJOR): the heap cap is only applied by a
393
+ // runner that execs a Node/Bun interpreter (`appliesNodeMemoryCap`). The
394
+ // SHELL runner execs `sh -c`, which IGNORES `NODE_OPTIONS`, so a shell run
395
+ // has NO per-run memory enforcement - its only ceiling is the container
396
+ // cgroup. That is a legitimate ceiling, not a missing layer: refusing every
397
+ // shell run under fail-closed would break all shell health-checks and
398
+ // automation. So we surface it as a NON-FATAL note (never a downgrade, so
399
+ // it never trips `onUnavailable: "fail"`), and `enforced.resources` is NOT
400
+ // taken to imply a per-run memory guarantee for shell.
401
+ if (appliesNodeMemoryCap !== true) {
402
+ notes.push({
403
+ layer: "resources",
404
+ note:
405
+ "per-run memory (memoryBytes) is NOT enforced for this run: the " +
406
+ "NODE_OPTIONS=--max-old-space-size heap cap is honoured only by the " +
407
+ "ESM/Node interpreter, not by `sh -c`. The hard memory ceiling for " +
408
+ "shell scripts is the container cgroup limit (Docker --memory / " +
409
+ "Kubernetes resources.limits.memory), which the deployment supplies.",
410
+ });
411
+ }
412
+ }
413
+
414
+ // --- Layer: network (Phase 3) — resolved FIRST so it composes into the FS
415
+ // wrapper invocation (FS + net share ONE bwrap/nsjail call). The decision is
416
+ // also what tells the FS layer whether to keep `--share-net` or take a fresh
417
+ // net namespace.
418
+ const networkDecision = buildNetworkLayer({ policy: policy.network, caps });
419
+ let nftRuleset: string | undefined;
420
+ // The rootless slirp4netns path additionally needs a launcher-script staging
421
+ // path (the orchestration cannot be a plain argv prelude). Track whether the
422
+ // chosen decision is rootless AND fully stageable.
423
+ const wantsRootlessEgress =
424
+ networkDecision.kind === "namespaced" &&
425
+ networkDecision.egressPath === "rootless";
426
+ const rootlessStageable =
427
+ wantsRootlessEgress &&
428
+ nftRulesetPath !== undefined &&
429
+ rootlessLauncherPath !== undefined;
430
+ if (networkDecision.kind === "host") {
431
+ if (networkDecision.metadataBlockUnenforceable) {
432
+ downgrades.push({
433
+ layer: "network",
434
+ reason:
435
+ "metadata/link-local egress block requested but not enforceable on this host: it needs a net namespace WITH real egress plumbed in (privileged macvlan: nsjail + CAP_NET_ADMIN + a usable host interface; or rootless: bwrap + an unprivileged user+net namespace + slirp4netns) so ordinary traffic still flows; a routeless namespace would sever ALL egress, so host net is kept; egress unrestricted (metadata NOT blocked)",
436
+ });
437
+ } else {
438
+ enforced.network = true;
439
+ }
440
+ } else if (networkDecision.kind === "namespaced") {
441
+ // Confinement is deliverable IF the FS layer below actually builds the
442
+ // wrapper (same primitives). A pure-deny netns needs no ruleset; an
443
+ // allowlist / metadata-block ruleset is staged by the runner.
444
+ nftRuleset = networkDecision.nftRuleset;
445
+ if (nftRuleset !== undefined && nftRulesetPath === undefined) {
446
+ // The filter could not be staged (runner can't write a ruleset file):
447
+ // refuse to silently run without it.
448
+ downgrades.push({
449
+ layer: "network",
450
+ reason:
451
+ "network egress filter could not be staged (no ruleset file path available from this runner); egress unrestricted",
452
+ });
453
+ } else if (wantsRootlessEgress && !rootlessStageable) {
454
+ // The rootless launcher script could not be staged by this runner.
455
+ // Degrade rather than take a routeless namespace (which would blackhole).
456
+ downgrades.push({
457
+ layer: "network",
458
+ reason:
459
+ "rootless egress (slirp4netns) requires staging a launcher script, but this runner provided no launcher path; egress unrestricted",
460
+ });
461
+ } else {
462
+ enforced.network = true;
463
+ }
464
+ } else {
465
+ downgrades.push({ layer: "network", reason: networkDecision.reason });
466
+ }
467
+ // If the network filter / launcher could not be staged, fall back to keeping
468
+ // host net in the wrapper (so we don't take a netns we can't filter and
469
+ // accidentally cut off all egress under what the operator asked to be an
470
+ // allowlist).
471
+ const netUnstageable =
472
+ networkDecision.kind === "namespaced" &&
473
+ ((networkDecision.nftRuleset !== undefined && nftRulesetPath === undefined) ||
474
+ (wantsRootlessEgress && !rootlessStageable));
475
+ const effectiveNetDecision: NetworkDecision = netUnstageable
476
+ ? { kind: "host", metadataBlockUnenforceable: false }
477
+ : networkDecision;
478
+
479
+ // --- Layer: filesystem (Phase 2) — composed with the network decision ---
480
+ let fsPrelude: string[] = [];
481
+ // For the ROOTLESS egress path the prelude is a generated launcher script
482
+ // (slirp4netns + the fail-closed nft filter) that wraps the bwrap argv. The
483
+ // launcher is staged by the runner; here we only build the script text and
484
+ // the `sh <launcher> -- <inner cmd>` invocation in `wrapCmd`.
485
+ let rootlessLauncher: string | undefined;
486
+ const fsLayer = buildFilesystemLayer({
487
+ policy: policy.filesystem,
488
+ caps,
489
+ // Thread the resolved privilege drop target into the wrapper so it can
490
+ // carry `--uid`/`--gid` (the ONLY mechanism that actually drops; Bun.spawn's
491
+ // uid/gid is ignored). `uid`/`gid` are undefined unless a drop was resolved.
492
+ inputs: { ...filesystem, dropUid: uid, dropGid: gid },
493
+ network: effectiveNetDecision,
494
+ nftRulesetPath,
495
+ });
496
+ switch (fsLayer.kind) {
497
+ case "off": {
498
+ enforced.filesystem = true; // "off" is trivially "enforced as requested".
499
+ break;
500
+ }
501
+ case "enforced": {
502
+ fsPrelude = fsLayer.prelude;
503
+ enforced.filesystem = true;
504
+ break;
505
+ }
506
+ case "enforced-rootless-egress": {
507
+ // FS confinement (if requested) is delivered by the SAME bwrap argv the
508
+ // launcher wraps, so it is enforced here too. The launcher text embeds
509
+ // the bwrap argv + the nft ruleset path; the real command is appended at
510
+ // `wrapCmd` time as the launcher's positional args.
511
+ enforced.filesystem = true;
512
+ if (nftRulesetPath !== undefined && rootlessLauncherPath !== undefined) {
513
+ rootlessLauncher = buildRootlessLauncherScript({
514
+ bwrapArgv: fsLayer.bwrapArgv,
515
+ nftRulesetPath,
516
+ });
517
+ } else {
518
+ // Should not happen: effectiveNetDecision is only rootless when
519
+ // rootlessStageable. Defensive — degrade the network layer if it does.
520
+ enforced.network = false;
521
+ nftRuleset = undefined;
522
+ downgrades.push({
523
+ layer: "network",
524
+ reason:
525
+ "rootless egress launcher could not be staged (missing ruleset/launcher path); egress unrestricted",
526
+ });
527
+ }
528
+ break;
529
+ }
530
+ case "degrade": {
531
+ // The wrapper could not be built. If FS confinement was requested this is
532
+ // a filesystem downgrade; either way a requested net namespace also
533
+ // cannot be delivered (same primitives), so reconcile the network state.
534
+ if (policy.filesystem.mode !== "off") {
535
+ downgrades.push({ layer: "filesystem", reason: fsLayer.reason });
536
+ }
537
+ if (effectiveNetDecision.kind === "namespaced") {
538
+ enforced.network = false;
539
+ downgrades.push({
540
+ layer: "network",
541
+ reason: `network egress namespace could not be delivered: ${fsLayer.reason}`,
542
+ });
543
+ nftRuleset = undefined;
544
+ }
545
+ break;
546
+ }
547
+ }
548
+
549
+ // --- Privilege verdict (root-supervisor path only) ---------------------
550
+ // Only relevant when the supervisor euid is root and a `drop-to-uid` target
551
+ // was configured (`privilegeDropWanted`). The wrapper carries the
552
+ // `--uid`/`--gid` drop only when it was actually built: `enforced` (plain
553
+ // prelude) or `enforced-rootless-egress` with a staged launcher. A
554
+ // `drop-to-uid` with NO wrapper engaged cannot drop (Bun's spawn uid/gid is a
555
+ // silent no-op), so the child would run as host-root: NOT enforced, surfaced.
556
+ // (The non-root supervisor path already set `enforced.privilege = true` by
557
+ // inheritance above and left `uid`/`gid` unset, so it falls through here.)
558
+ const wrapperCarriesDrop =
559
+ fsLayer.kind === "enforced" ||
560
+ (fsLayer.kind === "enforced-rootless-egress" && rootlessLauncher !== undefined);
561
+ if (privilegeDropWanted) {
562
+ if (wrapperCarriesDrop) {
563
+ enforced.privilege = true;
564
+ dropsPrivilege = true;
565
+ } else {
566
+ // The wrapper is not engaged, so the drop cannot happen and the child
567
+ // would run as host-root. Clear the observability uid/gid (there is no
568
+ // real drop to report) and surface the gap.
569
+ uid = undefined;
570
+ gid = undefined;
571
+ downgrades.push({
572
+ layer: "privilege",
573
+ reason:
574
+ "privilege drop-to-uid could not be enforced: it is performed by the " +
575
+ "namespace wrapper (bwrap/nsjail), which is not engaged for this run " +
576
+ "(no filesystem confinement and no network namespace), and Bun's spawn " +
577
+ "uid/gid is silently ignored; the child would run as host-root. Run " +
578
+ "the supervisor as a non-root uid (the shipped images do) so the child " +
579
+ "inherits non-root regardless of the wrapper",
580
+ });
581
+ }
582
+ }
583
+
584
+ // --- Resource prelude (built now that the drop + wrapper verdict is known) --
585
+ // RLIMIT_NPROC (the fork-bomb cap) is per (UID, user-namespace), so it is
586
+ // applied whenever the run is `nprocIsolated`: it executes inside a
587
+ // WRAPPER-CREATED USER NAMESPACE (rootless `bwrap --unshare-all`, the shipped
588
+ // model — the fresh user namespace makes the cap count only THIS run, even
589
+ // under the shared supervisor uid) OR it dropped to a dedicated uid via the
590
+ // root-supervisor wrapper `--uid`. Outside a wrapper namespace it would count
591
+ // the supervisor's processes too, so it is omitted there. Other rlimits
592
+ // (`--cpu`, `--nofile`, `--fsize`) are per-process and always safe.
593
+ //
594
+ // The wrapper engages a user namespace exactly when the FS/net layer built a
595
+ // prelude (`enforced`) or the rootless-egress launcher (`enforced-rootless-
596
+ // egress`). Both `bwrap --unshare-all` and `nsjail` create the user (and PID)
597
+ // namespace, so the cap isolates the run and a single kill of the wrapper
598
+ // reaps the whole fork tree.
599
+ const runsInWrapperUserns =
600
+ fsLayer.kind === "enforced" ||
601
+ (fsLayer.kind === "enforced-rootless-egress" && rootlessLauncher !== undefined);
602
+ const nprocIsolated = dropsPrivilege || runsInWrapperUserns;
603
+ if (wantsRlimit && caps.rlimitNative) {
604
+ rlimitPrelude = buildPrlimitPrelude({ policy, nprocIsolated });
605
+ if (policy.resources.maxProcesses !== undefined && !nprocIsolated) {
606
+ if (caps.euidIsRoot) {
607
+ // ROOT supervisor with no wrapper namespace and no in-effect drop: a
608
+ // real gap (the limit could have isolated a dedicated uid had the
609
+ // wrapper carried the drop).
610
+ downgrades.push({
611
+ layer: "resources",
612
+ reason:
613
+ "maxProcesses (RLIMIT_NPROC) not applied: no namespace wrapper is " +
614
+ "engaged (no filesystem confinement and no network namespace) to " +
615
+ "isolate the process count, and the limit is per-UID so it would also " +
616
+ "throttle the host process; set privilege.mode=drop-to-uid and/or " +
617
+ "enable a wrapper layer to enforce it",
618
+ });
619
+ } else {
620
+ // NON-ROOT supervisor with NO wrapper namespace (FS off AND host net):
621
+ // the child shares the supervisor's uid (65532) in the SAME user
622
+ // namespace, so an `--nproc` cap would also throttle the supervisor and
623
+ // its sibling runs. It is therefore NOT applied per-run on this
624
+ // wrapper-less path; the fork-bomb ceiling is the container cgroup
625
+ // `pids` controller. This is an accepted, expected characteristic of an
626
+ // unwrapped run - NOT a failure to enforce the policy - so it is a
627
+ // NON-FATAL note (never a downgrade, so the fail-closed default still
628
+ // runs). NOTE: under the shipped secure default a wrapper IS engaged
629
+ // (scratch-plus-ro filesystem), so this branch is the unwrapped-FS-off
630
+ // corner case, not the default path - the default path applies the cap
631
+ // via `runsInWrapperUserns`.
632
+ notes.push({
633
+ layer: "resources",
634
+ note:
635
+ "maxProcesses (RLIMIT_NPROC) is NOT applied for this run: no " +
636
+ "namespace wrapper is engaged (filesystem confinement is off and the " +
637
+ "network stays on the host), so the child shares the supervisor's uid " +
638
+ "and user namespace and a per-UID nproc cap would also throttle the " +
639
+ "supervisor. With the shipped secure default (scratch-plus-ro " +
640
+ "filesystem) the wrapper IS engaged and the cap is applied per-run. " +
641
+ "Either way the cgroup pids limit (Docker --pids-limit / Kubernetes) " +
642
+ "remains a backstop the deployment supplies.",
643
+ });
644
+ }
645
+ }
646
+ }
647
+
648
+ // --- TMPDIR override under FS confinement ------------------------------
649
+ // FS confinement mounts a FRESH tmpfs at /tmp, but the forwarded host TMPDIR
650
+ // may point outside it (so `$TMPDIR`-based mktemp would fail inside the
651
+ // namespace). When the FS wrapper is actually enforced, pin TMPDIR to the
652
+ // in-namespace tmpfs so temp-file creation works.
653
+ let tmpdirOverrideEnv: Record<string, string> | undefined;
654
+ if (
655
+ (fsLayer.kind === "enforced" ||
656
+ (fsLayer.kind === "enforced-rootless-egress" &&
657
+ rootlessLauncher !== undefined)) &&
658
+ policy.filesystem.mode !== "off"
659
+ ) {
660
+ tmpdirOverrideEnv = { TMPDIR: "/tmp" };
661
+ }
662
+
663
+ // --- Fail-closed handling ----------------------------------------------
664
+ if (policy.onUnavailable === "fail" && downgrades.length > 0) {
665
+ throw new SandboxUnavailableError(downgrades);
666
+ }
667
+
668
+ // Compose preludes outer-to-inner: the FS/net wrapper (bwrap/nsjail) sets up
669
+ // the namespaces FIRST, then `prlimit` applies the rlimits INSIDE that
670
+ // namespace, then the real command. Each prelude terminates with its own `--`.
671
+ const captureCmdPrelude = [...fsPrelude, ...rlimitPrelude];
672
+
673
+ // The TMPDIR override is sandbox-set (not a caller override), so it is folded
674
+ // in here rather than going through the denylist. It only applies when the
675
+ // in-namespace tmpfs is actually present.
676
+ const finalEnv =
677
+ tmpdirOverrideEnv === undefined ? env : { ...env, ...tmpdirOverrideEnv };
678
+
679
+ // For the rootless egress path, the prelude is the staged launcher script.
680
+ // The launcher wraps bwrap; the rlimit prelude + real command are appended as
681
+ // the launcher's positional args (forwarded verbatim into the namespace and
682
+ // exec'd after slirp4netns + the nft filter are up). `wrapCmd` therefore
683
+ // produces `sh <launcher> <rlimit-prelude...> <cmd...>`.
684
+ const launcherPath: string | undefined =
685
+ rootlessLauncher === undefined ? undefined : rootlessLauncherPath;
686
+
687
+ return {
688
+ wrapCmd: (cmd) => {
689
+ if (launcherPath !== undefined) {
690
+ // Rootless: `sh <launcher> <rlimit-prelude...> <cmd...>`. The launcher
691
+ // threads its positional args through bwrap into the namespace.
692
+ return ["sh", launcherPath, ...rlimitPrelude, ...cmd];
693
+ }
694
+ return captureCmdPrelude.length > 0
695
+ ? [...captureCmdPrelude, ...cmd]
696
+ : cmd;
697
+ },
698
+ env: finalEnv,
699
+ droppedEnvKeys,
700
+ uid,
701
+ gid,
702
+ maxOutputBytes: policy.resources.maxOutputBytes,
703
+ nodeMemoryFlagEnv,
704
+ nftRuleset,
705
+ rootlessLauncher,
706
+ effective: {
707
+ requested: policy,
708
+ enforced,
709
+ downgrades,
710
+ notes,
711
+ platform: caps.platform,
712
+ },
713
+ };
714
+ }