@specific.dev/spectest 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1324 @@
1
+ import { AsyncLocalStorage } from "node:async_hooks";
2
+ import { spawn as nodeSpawn } from "node:child_process";
3
+ import { randomUUID } from "node:crypto";
4
+ import { existsSync, readFileSync } from "node:fs";
5
+ import { readFile, unlink } from "node:fs/promises";
6
+
7
+ import {
8
+ AppsV1Api,
9
+ CoreV1Api,
10
+ KubeConfig,
11
+ KubernetesObjectApi,
12
+ ResponseContext,
13
+ ServerConfiguration,
14
+ createConfiguration,
15
+ loadAllYaml,
16
+ type KubernetesObject,
17
+ type RequestContext,
18
+ } from "@kubernetes/client-node";
19
+ import { Observable } from "@kubernetes/client-node/dist/gen/rxjsStub.js";
20
+
21
+ import type { ServiceDefinition } from "../index.js";
22
+ import { dnsName, provides, SELF_SERVICE_TOKEN } from "../index.js";
23
+ import { readRaw, readTag, wrap } from "../inspect.js";
24
+ import type { Wrapped } from "../inspect.js";
25
+ import { recorderAnnotate, recorderRemove } from "../recorder.js";
26
+
27
/** Configuration accepted by a `k3s(...)` service. Every field is optional. */
export interface K3sOptions {
  /** Tag of the official `rancher/k3s` image to run. Default `"v1.30.6-k3s1"`. */
  version?: string;

  /**
   * Additional arguments appended to the `k3s server` command line, e.g.
   * `--tls-san=...`, further `--disable=<addon>` flags, or custom CIDRs.
   */
  extraArgs?: string[];

  /**
   * How long (in seconds) to wait for the readiness probe. A warm image is
   * ready within a few seconds, but the first cold start of an env (image
   * pull plus cluster bootstrap) can take 30–60s. Default `120`.
   */
  readyTimeoutSecs?: number;

  /**
   * Whether to run an in-cluster OCI registry (CNCF `distribution` /
   * `registry:2`) that the cluster's own containerd trusts. It is the
   * hermetic stand-in for a cloud registry (ECR/GCR/GHCR): a peer service
   * builds an image, pushes it over plain HTTP, references
   * `<cluster-key>.internal:5000/...` from a Deployment, and the kubelet
   * pulls it straight back — a real `build → push → deploy → pull`
   * pipeline with no external registry and no image pre-baking.
   *
   * **Push/pull address: `<cluster-key>.internal:5000`**, where
   * `<cluster-key>` is the key this service has in the `services` map (a
   * cluster at `services.cluster` is reachable at `cluster.internal:5000`).
   * This is the cluster service's own unconditional `.internal` alias —
   * peer-reachable and never clobbered by any `hostnames` you set — so it
   * resolves the same way from peer containers (push) and from the
   * cluster's containerd (pull). Wire it into your platform, e.g.
   * `env: { REGISTRY_URL: "cluster.internal:5000" }`. The registry speaks
   * plain HTTP, so configure the push client for an insecure registry.
   *
   * Enabled by default. Set to `false` for clusters that only ever run
   * public images; that skips the extra pod.
   */
  registry?: boolean;

  /**
   * Domains routed into this cluster's ingress through **wildcard DNS**.
   * For each entry such as `"example.com"`, spectest-resolver answers any
   * `*.example.com` query with the cluster container's IP, and Traefik
   * dispatches by Host to the matching Ingress. A test can therefore
   * `kubectl apply` an Ingress for any host under the domain and reach it
   * immediately, without pre-declaring each hostname in `hostnames`.
   *
   * ```ts
   * services: { k8s: k3s({ ingressDomains: ["example.com"] }) }
   * // a test then applies an Ingress for foo.example.com and fetches it.
   * ```
   *
   * One-off hosts outside a declared domain can still be registered
   * dynamically with `ctx.dnsName(host, { service: "k8s" })`.
   *
   * **TLS.** Declaring `ingressDomains` also makes those domains reachable
   * over **HTTPS**: Traefik gains a `:443` entrypoint and serves a default
   * certificate minted from the in-VM root CA with a `*.<domain>` SAN per
   * declared domain. The test framework already trusts that CA (Node
   * `fetch`, `ctx.browser()`, Python, the system store), so
   * `ctx.fetch("https://foo.example.com")` handshakes cleanly — no
   * per-Ingress `spec.tls` and no `--insecure`. Only hosts **under** a
   * declared domain are covered by the certificate; static `hostnames`
   * outside one (and one-off `ctx.dnsName` hosts) stay HTTP-only.
   */
  ingressDomains?: string[];
}
93
+
94
/** TCP port of the in-cluster registry; traffic on it is plain HTTP. */
const K3S_REGISTRY_PORT = 5000;
96
+
97
/**
 * Type-level view of a `@kubernetes/client-node` API class after
 * `withTagging` has proxied it: every method that returns a promise
 * resolves to an **inspect-wrapped** value ({@link Wrapped}) instead of the
 * bare one.
 *
 * Argument types are left alone — only `Promise<R>` turns into
 * `Promise<Wrapped<R>>`. That lets `expect(pod.status.phase)` link back to
 * the API call without a cast, while raw data is obtained with `.unwrap()`.
 * Members that are not promise-returning methods keep their original type.
 * This mirrors {@link RecordingSqlClient} on the postgres side.
 */
type Tagged<T> = {
  [Member in keyof T]: T[Member] extends (
    ...args: infer Args
  ) => Promise<infer Result>
    ? (...args: Args) => Promise<Wrapped<Result>>
    : T[Member];
};
112
+
113
/**
 * Ready-made `@kubernetes/client-node` API clients that all share one
 * recording HTTP transport. Each method call shows up on the test event log
 * as an HTTP event next to ordinary `fetch` calls, and the value it resolves
 * to is inspect-wrapped (see {@link Tagged}) so assertions on it stay linked
 * to that event.
 */
export interface K3sClient {
  /** Client for the core (`v1`) API group. */
  core: Tagged<CoreV1Api>;
  /** Client for the `apps/v1` API group. */
  apps: Tagged<AppsV1Api>;
  /**
   * Generic object API — `create()`, `read()`, `patch()`, `delete()` against
   * any Kubernetes resource, custom resources included.
   */
  objects: Tagged<KubernetesObjectApi>;
}
126
+
127
/** What a `k3s(...)` service makes available on `ctx.svc.<name>`. */
export interface K3sHelpers {
  /**
   * A fully-loaded `KubeConfig`. Its cluster server URL is rewritten to
   * `https://<service-name>.internal:6443` — the DNS name auto-assigned to
   * this service on `spectest-net`. TLS verification is disabled because
   * Bun's fetch does not honor an https.Agent's CA option (the kubeconfig's
   * client certificate is still used for auth), which also means the
   * server certificate's SAN list need not contain the `.internal` name.
   */
  kubeconfig: KubeConfig;

  /** Pre-built API clients. */
  client: K3sClient;

  /**
   * Apply a YAML manifest, which may contain several documents. Every
   * parsed document is created through `KubernetesObjectApi.create`, and
   * the API server's responses come back in input order. Each element is
   * inspect-wrapped (the surrounding array is a plain array), so
   * `expect(created[0]!.metadata.uid)` links to the create call that
   * produced it.
   */
  apply: (manifest: string) => Promise<Wrapped<KubernetesObject>[]>;
}
149
+
150
/** Captured outcome of a child process that has finished. */
interface DockerExecResult {
  /** Everything the process wrote to stdout, decoded as UTF-8. */
  stdout: string;
  /** Everything the process wrote to stderr, decoded as UTF-8. */
  stderr: string;
  /** Exit code, or `-1` when the process ended without one (killed by a signal). */
  code: number;
}

/**
 * Spawn `cmd` with `args` (stdin closed, stdout/stderr captured) and
 * resolve with its output once the process has closed.
 *
 * A non-zero exit does NOT reject — callers inspect `code`. The promise
 * rejects only when the process cannot be spawned at all (the child's
 * `error` event).
 *
 * @param cmd Executable to run.
 * @param args Arguments passed to the executable (no shell is involved).
 * @param timeoutMs After this many milliseconds the process is sent
 *   `SIGKILL`. The result then has `code: -1` and a trailing note in
 *   `stderr` saying the timeout fired, so callers that surface `stderr`
 *   in an error message do not end up with an empty explanation.
 */
function runProcess(
  cmd: string,
  args: string[],
  timeoutMs = 30_000,
): Promise<DockerExecResult> {
  return new Promise((resolve, reject) => {
    const child = nodeSpawn(cmd, args, { stdio: ["ignore", "pipe", "pipe"] });
    const out: Buffer[] = [];
    const err: Buffer[] = [];
    // Set only when our SIGKILL was actually delivered, so a process that
    // exits on its own right at the deadline is not reported as timed out.
    let killedByTimeout = false;

    child.stdout!.on("data", (chunk: Buffer) => out.push(chunk));
    child.stderr!.on("data", (chunk: Buffer) => err.push(chunk));

    const timer = setTimeout(() => {
      killedByTimeout = child.kill("SIGKILL");
    }, timeoutMs);

    child.on("error", (e) => {
      clearTimeout(timer);
      reject(e);
    });
    child.on("close", (code) => {
      clearTimeout(timer);
      let stderr = Buffer.concat(err).toString("utf8");
      if (killedByTimeout && code === null) {
        const separator = stderr.length > 0 && !stderr.endsWith("\n") ? "\n" : "";
        stderr += `${separator}${cmd} was killed after exceeding the ${timeoutMs}ms timeout`;
      }
      resolve({
        stdout: Buffer.concat(out).toString("utf8"),
        stderr,
        code: code ?? -1,
      });
    });
  });
}
182
+
183
/**
 * Run the `docker` CLI with the given arguments via `runProcess`.
 *
 * @param args Arguments passed to `docker`.
 * @param timeoutMs Forwarded to `runProcess`; defaults to 30 seconds.
 * @returns Whatever `runProcess` resolves with (stdout, stderr, exit code).
 */
function runDocker(
  args: string[],
  timeoutMs = 30_000,
): Promise<DockerExecResult> {
  const dockerBinary = "docker";
  return runProcess(dockerBinary, args, timeoutMs);
}
189
+
190
+ // In-VM root CA, generated once into the base snapshot (see
191
+ // control-plane `base.rs`). Trusted everywhere the test framework runs —
192
+ // Node (`NODE_EXTRA_CA_CERTS`), Chromium (NSS DB), Python, the system
193
+ // store — so a leaf signed by it gives `ctx.fetch`/`ctx.browser()` a
194
+ // clean HTTPS handshake. The k3s `setup` hook runs inside the daemon's
195
+ // Bun process (root in the VM), so it can read the CA key and mint
196
+ // directly. These constants are intentionally redeclared here rather than
197
+ // imported from the daemon: the SDK ships to end users and must not
198
+ // depend on daemon internals.
199
const CA_PATH = process.env.SPECTEST_CA_PATH ?? "/etc/spectest/ca.crt";
const CA_KEY_PATH = process.env.SPECTEST_CA_KEY_PATH ?? "/etc/spectest/ca.key";

/**
 * Reports whether the root CA material is available on disk: true only
 * when both the certificate (`CA_PATH`) and its key (`CA_KEY_PATH`) exist.
 */
function caPresent(): boolean {
  return [CA_PATH, CA_KEY_PATH].every((file) => existsSync(file));
}
205
+
206
/**
 * Mint a leaf certificate signed by the in-VM root CA whose SANs are
 * `hostnames` (used here for the cluster's wildcard ingress domains).
 *
 * This is a self-contained `openssl` shell-out, deliberately not shared
 * with the daemon's own cert minting so the distributed SDK stays
 * decoupled from daemon code. openssl writes the key and certificate to
 * uniquely named files under `/tmp`; they are read back and then removed
 * whether minting succeeded or failed, so a failed run never leaves a
 * private key behind.
 *
 * @param hostnames DNS names to place in the certificate's subjectAltName.
 *   Must be non-empty.
 * @returns The certificate and its private key as PEM strings.
 * @throws Error when `hostnames` is empty or openssl exits non-zero (the
 *   message carries openssl's stderr, or stdout when stderr is empty).
 *   Spawn failures and read errors propagate unchanged.
 */
async function issueIngressCert(
  hostnames: string[],
): Promise<{ cert: string; key: string }> {
  // An empty list would produce `subjectAltName=` and an opaque openssl
  // error; fail early with a message that names the actual problem.
  if (hostnames.length === 0) {
    throw new Error(
      "k3s ingress cert minting failed: no hostnames given for subjectAltName",
    );
  }

  const id = `spectest-k3s-ingress-${randomUUID().slice(0, 8)}`;
  const keyPath = `/tmp/${id}.key`;
  const crtPath = `/tmp/${id}.crt`;
  const sans = hostnames.map((h) => `DNS:${h}`).join(",");

  try {
    const r = await runProcess(
      "openssl",
      [
        "req",
        "-newkey",
        "rsa:2048",
        "-nodes",
        "-keyout",
        keyPath,
        "-out",
        crtPath,
        "-x509",
        "-CA",
        CA_PATH,
        "-CAkey",
        CA_KEY_PATH,
        "-days",
        "3650",
        "-subj",
        "/CN=spectest-k3s-ingress",
        "-addext",
        `subjectAltName=${sans}`,
        "-addext",
        "basicConstraints=CA:FALSE",
        "-addext",
        "extendedKeyUsage=serverAuth",
        "-addext",
        "keyUsage=digitalSignature,keyEncipherment",
      ],
      30_000,
    );
    // Checked inside the `try` so the `finally` below also cleans up any
    // partial output (the key may already be written) when openssl fails.
    if (r.code !== 0) {
      throw new Error(
        `k3s ingress cert minting failed (openssl rc=${r.code}): ${
          r.stderr.trim() || r.stdout.trim()
        }`,
      );
    }
    const [cert, key] = await Promise.all([
      readFile(crtPath, "utf8"),
      readFile(keyPath, "utf8"),
    ]);
    return { cert, key };
  } finally {
    // Best-effort cleanup: a file that was never created is not an error.
    await Promise.all([
      unlink(keyPath).catch(() => {}),
      unlink(crtPath).catch(() => {}),
    ]);
  }
}
271
+
272
/**
 * Scratch slot scoped to a single API-method invocation. It carries the
 * inspector `sourceSeq` of the most recent HTTP call made inside that
 * invocation: `doFetch` fills it in after each request, and the
 * `withTagging` proxy reads it once the method's promise resolves so the
 * returned parsed object carries the back-reference.
 */
interface CallSlot {
  seq?: number;
}

/**
 * AsyncLocalStorage holding the current {@link CallSlot}. Each call to an
 * Api method runs inside its own slot, so concurrent calls do not race on
 * a shared value.
 */
const callContext = new AsyncLocalStorage<CallSlot>();
282
+
283
/**
 * HTTP transport for `@kubernetes/client-node` that sends every request
 * through `globalThis.fetch` (see `doFetch`). Two reasons:
 *
 *   1. The daemon's per-test `installFetchWrapper` already records each
 *      `globalThis.fetch` call as an HTTP event, so going through fetch
 *      gets k8s API calls recorded with no library-specific wiring.
 *   2. Bun's fetch wants Bun-shaped TLS options (`tls: { ... }`) for mTLS;
 *      node-fetch's `agent` parameter — which the library's default
 *      transport relies on — is silently ignored under Bun.
 *
 * The library's auth flow (`KubeConfig.applySecurityAuthentication` sets an
 * Agent on the request) is still honored: the cert/key are lifted off that
 * agent and handed over via the Bun-shaped option.
 */
class FetchHttpLibrary {
  /** Start the request and expose the pending response as an `Observable`. */
  send(request: RequestContext): Observable<ResponseContext> {
    return new Observable(doFetch(request));
  }
}
300
+
301
/**
 * Parsed Kubernetes semantics of a single API request, derived purely
 * from the HTTP method + request path. Fed to `recorderAnnotate` to
 * reclassify the generic `http` event the fetch wrapper recorded into a
 * Kubernetes-specific `kube` event.
 */
interface KubeRequestMeta {
  /** Kubernetes verb (`get`, `list`, `watch`, `create`, …). */
  verb: string;
  /** API group; the empty string for the core group. */
  group?: string;
  /** API version segment, e.g. `v1`. */
  apiVersion?: string;
  /** Plural resource segment, e.g. `pods`. */
  resource?: string;
  /** Subresource segment following the name, e.g. `status`. */
  subresource?: string;
  /** Object name, when the path addresses a single object. */
  name?: string;
  /** Namespace, for namespaced paths. */
  namespace?: string;
}

/**
 * Map a Kubernetes API request to its `(verb, group/version, resource,
 * namespace, name, subresource)` from the URL + HTTP method alone.
 *
 * Path grammar (the two API roots):
 *   - core group:   `/api/<version>/...`
 *   - named group:  `/apis/<group>/<version>/...`
 * after which the remainder is either a cluster-scoped resource
 * (`nodes`, `namespaces`, …) or `namespaces/<ns>/<resource>...`. The
 * trailing `<resource>[/<name>[/<subresource>]]` shape plus the method
 * (and `?watch=`) yields the verb.
 *
 * The Namespace object's own subresources, `namespaces/<name>/status` and
 * `namespaces/<name>/finalize` in the core group, have the same shape as
 * "list resource X in namespace <name>". They are recognized explicitly
 * and reported as resource `namespaces` with the given name and
 * subresource, rather than as a resource called `status`/`finalize`.
 *
 * @param method HTTP method of the request (any case).
 * @param rawUrl Absolute URL, or a bare path with optional query string.
 * @returns The parsed metadata, or `null` for non-resource paths —
 *   discovery (`/api`, `/apis`, `/apis/<group>/<version>`), `/version`,
 *   `/healthz`, `/openapi/...` — so those stay rendered as plain `http`.
 */
function describeKubeRequest(
  method: string,
  rawUrl: string,
): KubeRequestMeta | null {
  let path: string;
  let query: URLSearchParams;
  try {
    const u = new URL(rawUrl);
    path = u.pathname;
    query = u.searchParams;
  } catch {
    // Not an absolute URL: split a bare `path?query` by hand.
    const q = rawUrl.indexOf("?");
    path = q === -1 ? rawUrl : rawUrl.slice(0, q);
    query = new URLSearchParams(q === -1 ? "" : rawUrl.slice(q + 1));
  }

  const segs = path.split("/").filter((s) => s.length > 0);
  if (segs.length === 0) return null;

  let group: string | undefined;
  let apiVersion: string | undefined;
  let rest: string[];
  if (segs[0] === "api") {
    group = "";
    apiVersion = segs[1];
    rest = segs.slice(2);
  } else if (segs[0] === "apis") {
    group = segs[1];
    apiVersion = segs[2];
    rest = segs.slice(3);
  } else {
    return null; // /version, /healthz, /openapi, …
  }
  if (!apiVersion) return null; // discovery root (/api, /apis/<group>)

  // Core-group `namespaces/<name>/{status,finalize}` addresses the
  // Namespace object itself, not a resource inside that namespace.
  const isNamespaceSubresource =
    group === "" &&
    rest.length === 3 &&
    rest[0] === "namespaces" &&
    (rest[2] === "status" || rest[2] === "finalize");

  // `namespaces/<ns>/<resource>...` is namespaced; everything else
  // (including `namespaces` and `namespaces/<name>` themselves, and
  // cluster-scoped resources like `nodes`) is taken as-is.
  let namespace: string | undefined;
  let resourcePath = rest;
  if (rest[0] === "namespaces" && rest.length >= 3 && !isNamespaceSubresource) {
    namespace = rest[1];
    resourcePath = rest.slice(2);
  }
  if (resourcePath.length === 0) return null; // APIResourceList discovery

  const resource = resourcePath[0];
  const name = resourcePath.length >= 2 ? resourcePath[1] : undefined;
  const subresource = resourcePath.length >= 3 ? resourcePath[2] : undefined;

  const watchParam = query.get("watch");
  const watch = watchParam === "true" || watchParam === "1";
  const hasName = name !== undefined;
  let verb: string;
  switch (method.toUpperCase()) {
    case "GET":
    case "HEAD":
      // A named read is `get`; a collection read is `watch` or `list`.
      verb = hasName ? "get" : watch ? "watch" : "list";
      break;
    case "POST":
      verb = "create";
      break;
    case "PUT":
      verb = "update";
      break;
    case "PATCH":
      verb = "patch";
      break;
    case "DELETE":
      verb = hasName ? "delete" : "deletecollection";
      break;
    default:
      verb = method.toLowerCase();
  }

  return { verb, group, apiVersion, resource, subresource, name, namespace };
}
410
+
411
/**
 * Tells whether a URL is a Kubernetes API *discovery* endpoint — one of the
 * version/group/resource enumeration paths (`/api`, `/api/<version>`,
 * `/apis`, `/apis/<group>`, `/apis/<group>/<version>`) that the dynamic
 * client hits to resolve a kind to its resource path. Such requests carry
 * no resource operation (`describeKubeRequest` returns null for them), and
 * `doFetch` retracts their events from the timeline.
 *
 * Non-resource paths that are NOT discovery (`/healthz`, `/version`,
 * `/openapi`, …) deliberately do not match, so they stay as `http`.
 *
 * @param rawUrl Absolute URL, or a bare path with optional query string.
 */
function isKubeDiscoveryPath(rawUrl: string): boolean {
  let pathname: string;
  try {
    pathname = new URL(rawUrl).pathname;
  } catch {
    // Not an absolute URL: drop the query string by hand.
    const queryStart = rawUrl.indexOf("?");
    pathname = queryStart === -1 ? rawUrl : rawUrl.slice(0, queryStart);
  }

  const parts = pathname.split("/").filter((part) => part !== "");
  // Anything longer than the limits below carries a resource segment and
  // is handled as `kube` instead.
  switch (parts[0]) {
    case "api":
      return parts.length <= 2; // `/api`, `/api/<version>`
    case "apis":
      return parts.length <= 3; // `/apis`, `/apis/<group>`, `/apis/<group>/<version>`
    default:
      return false; // includes the empty path
  }
}
437
+
438
/**
 * Execute one `@kubernetes/client-node` request through `globalThis.fetch`
 * and hand the result back in the library's `ResponseContext` shape.
 *
 * Besides the transfer itself this (1) forwards the client cert/key from
 * the request's agent via the Bun-style `tls` option, (2) stores the
 * recorded event's `sourceSeq` in the current `callContext` slot, and
 * (3) reclassifies or retracts the recorded event depending on the path.
 */
async function doFetch(request: RequestContext): Promise<ResponseContext> {
  const targetUrl = request.getUrl();
  const httpMethod = String(request.getHttpMethod());
  const payload = request.getBody();
  const outgoingHeaders: Record<string, string> = {};
  Object.entries(request.getHeaders()).forEach(([name, value]) => {
    outgoingHeaders[name] = String(value);
  });

  // The library's auth flow puts the client cert/key on an https.Agent
  // attached to the request; lift them off so they can be given to Bun's
  // fetch through its `tls` option.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const agentOptions = (request.getAgent() as any)?.options ?? {};
  const tlsSettings: Record<string, unknown> = { rejectUnauthorized: false };
  if (agentOptions.cert) tlsSettings.cert = agentOptions.cert;
  if (agentOptions.key) tlsSettings.key = agentOptions.key;

  const init = {
    method: httpMethod,
    headers: outgoingHeaders,
    body: payload as BodyInit | undefined,
    signal: request.getSignal(),
    // Bun-specific TLS shape; under Node this option is ignored.
    tls: tlsSettings,
  } as RequestInit;
  const recorded = await fetch(targetUrl, init);

  // Read the inspector tag the daemon's fetch wrapper put on the Response
  // BEFORE unwrapping it, and pass the seq on through AsyncLocalStorage so
  // the API method's eventual return value can pick it up again. Without
  // this, `core.listNode() → expect(result)` would record assertions with
  // no back-reference.
  const tag = readTag(recorded);
  const slot = callContext.getStore();
  if (tag && slot) slot.seq = tag.sourceSeq;

  // Turn the `http` event the fetch wrapper just recorded into a
  // Kubernetes-specific `kube` event (verb/resource/namespace/name), so the
  // timeline reads `list pods · default` instead of the raw API URL. A tag
  // only exists while recording is active for this call, so nothing happens
  // outside instrumented test runs.
  if (tag && tag.sourceSeq !== undefined) {
    const kubeMeta = describeKubeRequest(httpMethod, targetUrl);
    if (kubeMeta) {
      recorderAnnotate(tag.sourceSeq, { kind: "kube", ...kubeMeta });
    } else if (isKubeDiscoveryPath(targetUrl)) {
      // The dynamic client (`objects`, KubernetesObjectApi) cannot know a
      // kind's resource path up front, so before the real request it GETs
      // the group's resource list (`/apis/<group>/<version>` →
      // APIResourceList) to map e.g. Ingress → `ingresses`/namespaced, and
      // caches it (apiVersionResourceCache). That GET is library plumbing
      // the test author never wrote, and since the cache is
      // per-daemon-process it appears non-deterministically across forks
      // (the first access pays for it; `dependsOn` children that inherit
      // the warm cache do not). Retract it instead of leaving a bare `http`
      // row — the real list/read that follows is recorded and reclassified
      // as usual.
      recorderRemove(tag.sourceSeq);
    }
    // Any other non-resource path (/healthz, /version, /openapi, …) is left
    // alone and stays rendered as plain `http`.
  }

  // The daemon's fetch wrapper proxies `Response.status` and similar
  // primitives as carrier objects (so assertions can fold under the
  // originating HTTP event). kubernetes/client-node calls
  // `httpStatusCode.toString()`, which on a carrier yields
  // "[object Object]" and sends the status-code dispatch to
  // "Unknown API Status Code!". Work on the raw Response instead.
  const rawResponse =
    (recorded as { unwrap?: () => Response }).unwrap?.() ?? recorded;

  const incomingHeaders: Record<string, string> = {};
  rawResponse.headers.forEach((value, name) => {
    incomingHeaders[name] = value;
  });
  // Read the body once; both accessors below serve it from this buffer.
  const bodyBytes = Buffer.from(await rawResponse.arrayBuffer());
  return new ResponseContext(rawResponse.status, incomingHeaders, {
    text: async () => bodyBytes.toString("utf8"),
    binary: async () => bodyBytes,
  });
}
521
+
522
/**
 * Remove the inspector's carrier/proxy wrappers from a value, descending
 * into arrays and object properties.
 *
 * Arguments headed into kubernetes/client-node API methods need this: if a
 * wrapped pod's `metadata.name` (a primitive-carrier object) reaches a URL
 * template, the library stringifies it to `"[object Object]"` and the
 * request 404s.
 *
 * @param value Any value, wrapped or not.
 * @returns `null`/`undefined` and non-object primitives as-is; otherwise a
 *   new array or plain object built from the unwrapped elements/entries.
 */
function deepUnwrap(value: unknown): unknown {
  if (value === null || value === undefined) return value;

  // A wrapper reports a different raw value; keep peeling until it doesn't.
  const inner = readRaw(value);
  if (inner !== value) return deepUnwrap(inner);

  if (typeof value !== "object") return value;
  if (Array.isArray(value)) return value.map((item) => deepUnwrap(item));

  return Object.entries(value as Record<string, unknown>).reduce<
    Record<string, unknown>
  >((copy, [key, entry]) => {
    copy[key] = deepUnwrap(entry);
    return copy;
  }, {});
}
541
+
542
/**
 * Proxy a `@kubernetes/client-node` Api instance so that every method call
 * gets its own AsyncLocalStorage slot. `doFetch` writes the HTTP event's
 * `sourceSeq` into that slot; once the library has parsed the response the
 * seq is attached to the returned object. Later assertions such as
 * `expect(result.items[0].status…)` then fold under that HTTP event in the
 * test event log, just as `expect(res.status)` does for plain `fetch`.
 *
 * Arguments are deep-unwrapped before the call, so values taken from an
 * earlier API response (still carrying inspector wrappers) can be passed
 * straight into another call.
 *
 * Properties that are not functions are returned untouched.
 */
function withTagging<T extends object>(api: T): Tagged<T> {
  const handler: ProxyHandler<T> = {
    get(target, prop, receiver) {
      const member = Reflect.get(target, prop, receiver);
      if (typeof member !== "function") return member;

      // Bound to `target` (not the proxy) so the library's internal
      // `this.configuration` accesses keep working.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const boundMethod = (member as any).bind(target);

      return (...args: unknown[]): unknown => {
        const plainArgs = args.map((arg) => deepUnwrap(arg));
        const slot: CallSlot = {};
        const outcome = callContext.run(slot, () => boundMethod(...plainArgs));

        // Always wrap. `slot.seq` stays undefined when no event was
        // recorded (setup/eval, no active recorder), but the declared
        // result type is wrapped, so the runtime value has to be wrapped
        // as well — merely without a provenance link. That keeps
        // `.unwrap()` usable in every context.
        const isThenable =
          outcome && typeof (outcome as Promise<unknown>).then === "function";
        if (!isThenable) return wrap(outcome, slot.seq);
        return (outcome as Promise<unknown>).then((resolved) =>
          wrap(resolved, slot.seq),
        );
      };
    },
  };
  return new Proxy(api, handler) as unknown as Tagged<T>;
}
584
+
585
/**
 * Image reference of the Traefik release installed into the cluster. It is
 * pulled on the first cluster boot via the host `zot` mirror (`docker.io` →
 * cache) configured in `registries.yaml`, and then captured in the
 * warm-template snapshot, so warm starts never pull it.
 */
const TRAEFIK_IMAGE = "rancher/mirrored-library-traefik:3.3.2";
592
+
593
/**
 * In-cluster resource names for the CA-signed default ingress certificate
 * (the resources are created in `setupK3sCluster` when TLS is enabled).
 */
/** Secret that holds the leaf certificate and its key. */
const TRAEFIK_TLS_SECRET = "traefik-default-tls";
/**
 * ConfigMap that holds the Traefik file-provider snippet pointing the
 * `default` TLS store at that certificate.
 */
const TRAEFIK_DYNAMIC_CONFIGMAP = "traefik-dynamic";
601
+
602
/**
 * Traefik file-provider dynamic config: make the in-VM-CA leaf the
 * `default` store certificate, so every router on the `websecure`
 * entrypoint (which we force to TLS) serves it with no per-Ingress
 * `spec.tls` needed.
 *
 * The nesting `tls → stores → default → defaultCertificate` is significant
 * YAML structure; the two file paths refer to the `/certs` mount of the
 * Traefik container.
 */
const TRAEFIK_DYNAMIC_TLS = `tls:
  stores:
    default:
      defaultCertificate:
        certFile: /certs/tls.crt
        keyFile: /certs/tls.key
`;
615
+
616
/**
 * Build the multi-document Traefik manifest applied during setup():
 * ServiceAccount, ClusterRole, ClusterRoleBinding, default IngressClass and
 * the Traefik Deployment in `kube-system`.
 *
 * `hostNetwork: true` puts Traefik in the k3s container's netns, so it
 * binds the container's :80 (and, with `tls`, :443) directly — no CNI
 * portmap involved (that path still trips on the kernel's missing
 * xt_comment match).
 *
 * When `tls` is set, a `websecure` :443 entrypoint is added with TLS forced
 * on, served from the `default` store — i.e. the in-VM-CA leaf mounted from
 * the `traefik-default-tls` Secret through the file provider. HTTPS then
 * works for any routed host under the cluster's `ingressDomains` with zero
 * per-Ingress config; the :80 `web` entrypoint is unchanged.
 *
 * @param tls Whether to add the HTTPS entrypoint, file provider, and the
 *   certificate / dynamic-config volumes.
 * @returns The manifest as YAML text, documents separated by `---`.
 */
function buildTraefikManifest(tls: boolean): string {
  // Every fragment below is pre-indented for the exact spot it is spliced
  // into the Deployment, because YAML nesting is carried by indentation:
  // list items under `args:`/`ports:` sit at 12 spaces, container keys at
  // 10, pod-spec keys at 6.
  const args = [
    "            - --entrypoints.web.address=:80",
    ...(tls
      ? [
          "            - --entrypoints.websecure.address=:443",
          "            - --entrypoints.websecure.http.tls=true",
        ]
      : []),
    "            - --providers.kubernetesingress=true",
    "            - --providers.kubernetesingress.ingressclass=traefik",
    ...(tls
      ? [
          "            - --providers.file.directory=/dynamic",
          "            - --providers.file.watch=true",
        ]
      : []),
    "            - --log.level=INFO",
  ].join("\n");
  const ports = [
    "            - name: web",
    "              containerPort: 80",
    ...(tls
      ? ["            - name: websecure", "              containerPort: 443"]
      : []),
  ].join("\n");
  // Container-level key: mounts the cert Secret and the dynamic config.
  const volumeMounts = tls
    ? `
          volumeMounts:
            - name: default-cert
              mountPath: /certs
              readOnly: true
            - name: dynamic
              mountPath: /dynamic
              readOnly: true`
    : "";
  // Pod-spec-level key: declares the volumes mounted above.
  const volumes = tls
    ? `
      volumes:
        - name: default-cert
          secret:
            secretName: ${TRAEFIK_TLS_SECRET}
        - name: dynamic
          configMap:
            name: ${TRAEFIK_DYNAMIC_CONFIGMAP}`
    : "";
  return `apiVersion: v1
kind: ServiceAccount
metadata:
  name: traefik
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: traefik
rules:
  - apiGroups: [""]
    resources: ["services", "endpoints", "secrets", "nodes"]
    verbs: ["get", "list", "watch"]
  - apiGroups: ["discovery.k8s.io"]
    resources: ["endpointslices"]
    verbs: ["get", "list", "watch"]
  - apiGroups: ["networking.k8s.io"]
    resources: ["ingresses", "ingressclasses"]
    verbs: ["get", "list", "watch"]
  - apiGroups: ["networking.k8s.io"]
    resources: ["ingresses/status"]
    verbs: ["update"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: traefik
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: traefik
subjects:
  - kind: ServiceAccount
    name: traefik
    namespace: kube-system
---
apiVersion: networking.k8s.io/v1
kind: IngressClass
metadata:
  name: traefik
  annotations:
    ingressclass.kubernetes.io/is-default-class: "true"
spec:
  controller: traefik.io/ingress-controller
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: traefik
  namespace: kube-system
  labels:
    app: traefik
spec:
  replicas: 1
  selector:
    matchLabels:
      app: traefik
  template:
    metadata:
      labels:
        app: traefik
    spec:
      serviceAccountName: traefik
      hostNetwork: true
      dnsPolicy: Default
      tolerations:
        - operator: Exists
      containers:
        - name: traefik
          image: ${TRAEFIK_IMAGE}
          imagePullPolicy: IfNotPresent
          args:
${args}
          ports:
${ports}${volumeMounts}${volumes}
`;
}
752
+
753
/**
 * Layout of the host-side `zot` pull-through caches (local Firecracker
 * provider only). There is one zot instance per upstream registry, all
 * bound to the `spectest-br0` gateway `10.42.0.1` on the ports listed here —
 * **keep this in sync with `scripts/install-zot.sh`**.
 *
 * The cluster's containerd is mirrored through these so every image pull
 * reuses the shared host cache rather than going to the public registry.
 * The canonical upstream is listed as a fallback endpoint, so a missing or
 * cold mirror can only slow a pull down, never break it.
 */
const ZOT_MIRRORS: Array<{ registry: string; port: number; upstream: string }> =
  [
    {
      registry: "docker.io",
      port: 5000,
      upstream: "https://registry-1.docker.io",
    },
    { registry: "ghcr.io", port: 5001, upstream: "https://ghcr.io" },
    { registry: "quay.io", port: 5002, upstream: "https://quay.io" },
    {
      registry: "registry.k8s.io",
      port: 5003,
      upstream: "https://registry.k8s.io",
    },
    {
      registry: "public.ecr.aws",
      port: 5004,
      upstream: "https://public.ecr.aws",
    },
    { registry: "gcr.io", port: 5005, upstream: "https://gcr.io" },
    {
      registry: "mcr.microsoft.com",
      port: 5006,
      upstream: "https://mcr.microsoft.com",
    },
  ];
772
+
773
/**
 * Find the host-side image cache gateway by reading the first
 * `registry-mirrors` entry of `/etc/docker/daemon.json` — the same entry
 * the in-VM dockerd uses (baked into the local provider's golden image).
 *
 * Intended to run inside the daemon (VM) at `index.ts` load time, so the
 * answer is stable per host and never poisons the warm-template cache.
 *
 * @returns The gateway host (e.g. `"10.42.0.1"`), or `null` when there is
 *   no host cache — for instance on Freestyle, where the cluster pulls
 *   every image directly. Any read or parse failure also yields `null`.
 */
function detectHostMirrorGateway(): string | null {
  try {
    const rawConfig = readFileSync("/etc/docker/daemon.json", "utf8");
    const daemonConfig = JSON.parse(rawConfig) as {
      "registry-mirrors"?: string[];
    };
    const primaryMirror = daemonConfig["registry-mirrors"]?.[0];
    if (!primaryMirror) return null;
    const { hostname } = new URL(primaryMirror);
    return hostname || null;
  } catch {
    // Missing file, invalid JSON, or an unparsable mirror URL: no cache.
    return null;
  }
}
794
+
795
/**
 * Build the contents of `/etc/rancher/k3s/registries.yaml`.
 *
 * k3s reads this file **once, at startup**, to configure its embedded
 * containerd — which is why it has to be seeded via `files` (a pre-start
 * bind mount) rather than a `setup` hook. The file does two jobs:
 *
 *   1. Mirror the cluster's image pulls through the host `zot` cache, one
 *      mirror entry per `ZOT_MIRRORS` row, with the canonical upstream
 *      listed second as a fallback endpoint. Omitted when
 *      `detectHostMirrorGateway()` finds no host cache.
 *   2. Trust the in-cluster registry, addressed as `<key>.internal:<port>`
 *      (the `{{SPECTEST_SERVICE}}` token is expanded to the cluster's
 *      service key when the file is written). Image *references* use that
 *      peer-reachable name, but containerd pulls via the loopback endpoint
 *      `http://127.0.0.1:<port>` — the hostNetwork registry pod shares the
 *      node's netns, so this needs no in-container DNS and can't be broken
 *      by a clobbered `hostnames`.
 *
 * The emitted YAML nests `mirrors -> "<registry>" -> endpoint -> [urls]` and
 * `configs -> "<host>" -> tls`, using two spaces per level; the nesting is
 * significant, since a flat layout would not parse as the intended mapping.
 *
 * @param registryEnabled Whether the in-cluster registry is enabled.
 * @returns The YAML document (newline-terminated), or `null` when there is
 *   nothing to configure (no host cache and `registry` disabled), in which
 *   case no file is injected.
 */
function buildRegistriesYaml(registryEnabled: boolean): string | null {
  const gateway = detectHostMirrorGateway();
  if (!gateway && !registryEnabled) return null;

  const lines: string[] = ["mirrors:"];
  if (gateway) {
    for (const { registry, port, upstream } of ZOT_MIRRORS) {
      lines.push(
        `  "${registry}":`,
        `    endpoint:`,
        `      - "http://${gateway}:${port}"`,
        `      - "${upstream}"`,
      );
    }
  }
  if (registryEnabled) {
    const host = `{{SPECTEST_SERVICE}}.internal:${K3S_REGISTRY_PORT}`;
    lines.push(
      `  "${host}":`,
      `    endpoint:`,
      `      - "http://127.0.0.1:${K3S_REGISTRY_PORT}"`,
      "configs:",
      // The endpoint is plain HTTP; the config (keyed by endpoint host)
      // makes that explicit and disables any TLS attempt against it.
      `  "127.0.0.1:${K3S_REGISTRY_PORT}":`,
      `    tls:`,
      `      insecure_skip_verify: true`,
    );
  }
  return lines.join("\n") + "\n";
}
843
+
844
/**
 * Manifest for the in-cluster OCI registry (CNCF `distribution`).
 *
 * `hostNetwork: true` binds the cluster container's registry port directly —
 * the same trick Traefik uses — so peer services reach it at
 * `<cluster-key>.internal:<port>` (the cluster service's own alias) and the
 * node's own containerd reaches it at `127.0.0.1:<port>`. Storage is an
 * `emptyDir`, so pushed images live in the cluster and are captured by
 * snapshot / isolated per test fork like all other in-VM state.
 *
 * The YAML nesting below is significant: the document must parse as a single
 * `apps/v1` Deployment, so keep the two-space indentation intact when editing.
 */
const REGISTRY_MANIFEST = `apiVersion: apps/v1
kind: Deployment
metadata:
  name: spectest-registry
  namespace: kube-system
  labels:
    app: spectest-registry
spec:
  replicas: 1
  selector:
    matchLabels:
      app: spectest-registry
  template:
    metadata:
      labels:
        app: spectest-registry
    spec:
      hostNetwork: true
      dnsPolicy: Default
      tolerations:
        - operator: Exists
      containers:
        - name: registry
          image: registry:2
          imagePullPolicy: IfNotPresent
          env:
            - name: REGISTRY_HTTP_ADDR
              value: ":${K3S_REGISTRY_PORT}"
            - name: REGISTRY_STORAGE_DELETE_ENABLED
              value: "true"
          ports:
            - name: registry
              containerPort: ${K3S_REGISTRY_PORT}
          volumeMounts:
            - name: data
              mountPath: /var/lib/registry
      volumes:
        - name: data
          emptyDir: {}
`;
893
+
894
/**
 * Block until a `kube-system` Deployment reports at least as many ready
 * replicas as it wants (and wants at least one).
 *
 * The Deployment is re-read every 250ms until `timeoutMs` has elapsed. Read
 * failures are not fatal: they are remembered as the "last seen" status and
 * the poll continues (a not-found/404 error is reported as "does not exist
 * yet").
 *
 * @param clusterName Cluster name, used only in the timeout error message.
 * @param helpers     Cluster helpers providing the `apps` API client.
 * @param deployment  Name of the Deployment to watch.
 * @param timeoutMs   Overall budget in milliseconds.
 * @throws Error on timeout, carrying the last-seen status plus the
 *   kube-system diagnostics from `collectTraefikDiagnostics`.
 */
async function waitForDeployment(
  clusterName: string,
  helpers: K3sHelpers,
  deployment: string,
  timeoutMs: number,
): Promise<void> {
  const giveUpAt = Date.now() + timeoutMs;
  let lastStatus: string | undefined;

  while (Date.now() < giveUpAt) {
    try {
      // The client wraps its result in every context (provenance-free here,
      // since this internal poll runs during setup with no active recorder).
      // We only read for control flow, so `.unwrap()` straight to the plain
      // object.
      const wrapped = await helpers.client.apps.readNamespacedDeployment({
        name: deployment,
        namespace: "kube-system",
      });
      const current = wrapped.unwrap();
      const readyCount = current.status?.readyReplicas ?? 0;
      const desired = current.spec?.replicas ?? 1;
      if (desired > 0 && readyCount >= desired) return;
      lastStatus = `${deployment} Deployment exists but only ${readyCount}/${desired} replicas Ready`;
    } catch (err) {
      const text = (err as Error)?.message ?? String(err);
      if (/not found|404/i.test(text)) {
        lastStatus = `${deployment} Deployment does not exist yet`;
      } else {
        lastStatus = text;
      }
    }
    // 250ms: the rollout waits in setup sit on the cold start's critical
    // path, and a 1s poll wasted up to ~2s of it.
    await new Promise((resolve) => setTimeout(resolve, 250));
  }

  const diagnostics = await collectTraefikDiagnostics(helpers);
  const seconds = timeoutMs / 1000;
  throw new Error(
    `k3s(${clusterName}): ${deployment} did not reach Ready within ${seconds}s. ${lastStatus ?? ""}\n${diagnostics}`,
  );
}
940
+
941
/**
 * Post-Ready cluster setup: install Traefik (hostNetwork) and, when enabled,
 * the in-cluster registry, then wait for every installed Deployment to become
 * Ready. The result is captured by the warm-template snapshot, so warm starts
 * pay none of this cost.
 *
 * TLS is switched on when the cluster declares `ingressDomains` *and* the
 * in-VM CA is present. In that case a CA-signed leaf covering `*.<domain>`
 * for each domain is minted, stored in the `traefik-default-tls` Secret
 * together with a file-provider ConfigMap, and Traefik is brought up with a
 * `websecure` :443 entrypoint serving it as the default cert — making those
 * domains reachable over HTTPS with a cert the test framework already trusts.
 *
 * @param name    Cluster name (used in timeout error messages).
 * @param helpers Cluster helpers (API clients and `apply`).
 * @param opts    `registry`: install the in-cluster registry;
 *                `ingressDomains`: wildcard ingress domains to cover with TLS.
 */
async function setupK3sCluster(
  name: string,
  helpers: K3sHelpers,
  opts: { registry: boolean; ingressDomains: string[] },
): Promise<void> {
  const { registry: wantRegistry, ingressDomains } = opts;
  const tlsEnabled = ingressDomains.length > 0 && caPresent();

  if (tlsEnabled) {
    const namespace = "kube-system";
    const wildcardHosts = ingressDomains.map((domain) => `*.${domain}`);
    const leaf = await issueIngressCert(wildcardHosts);

    // The Secret and the dynamic-config ConfigMap must exist before the
    // Deployment that mounts them. `stringData` accepts plain PEM; the API
    // server takes care of base64-encoding it.
    await helpers.client.core.createNamespacedSecret({
      namespace,
      body: {
        metadata: { name: TRAEFIK_TLS_SECRET, namespace },
        type: "kubernetes.io/tls",
        stringData: { "tls.crt": leaf.cert, "tls.key": leaf.key },
      },
    });
    await helpers.client.core.createNamespacedConfigMap({
      namespace,
      body: {
        metadata: { name: TRAEFIK_DYNAMIC_CONFIGMAP, namespace },
        data: { "tls.yaml": TRAEFIK_DYNAMIC_TLS },
      },
    });
  }

  await helpers.apply(buildTraefikManifest(tlsEnabled));
  if (wantRegistry) await helpers.apply(REGISTRY_MANIFEST);

  // The rollouts progress independently inside the cluster, so wait on them
  // concurrently rather than one after the other.
  const deployments = wantRegistry ? ["traefik", "spectest-registry"] : ["traefik"];
  await Promise.all(
    deployments.map((deployment) => waitForDeployment(name, helpers, deployment, 120_000)),
  );
}
996
+
997
/**
 * Produce a human-readable snapshot of `kube-system`, attached to the error
 * thrown when a rollout wait times out.
 *
 * Two sections are gathered, each independently best-effort (a failure in
 * one is reported inline and never thrown):
 *   - every pod with its phase, any container waiting/terminated reasons,
 *     and the per-container waiting/terminated messages;
 *   - the last 12 `Warning` events that carry a message.
 *
 * @param helpers Cluster helpers providing the `core` API client.
 * @returns The report as newline-joined text.
 */
async function collectTraefikDiagnostics(helpers: K3sHelpers): Promise<string> {
  const report: string[] = [];
  // Collapse multi-line API messages onto one line.
  const squash = (text: string): string => text.replace(/\s+/g, " ").trim();

  try {
    const podList = await helpers.client.core.listNamespacedPod({
      namespace: "kube-system",
    });
    report.push(`kube-system pods (${podList.items.length}):`);
    for (const pod of podList.items) {
      const phase = pod.status?.phase ?? "?";
      const statuses = pod.status?.containerStatuses ?? [];

      const reasonList: string[] = [];
      for (const status of statuses) {
        const reason =
          status.state?.waiting?.reason ?? status.state?.terminated?.reason ?? "";
        if (reason) reasonList.push(reason);
      }
      const reasons = reasonList.join(",");
      const suffix = reasons ? ` reasons=${reasons}` : "";
      report.push(` ${pod.metadata?.name ?? "?"}: phase=${phase}${suffix}`);

      // The `message` carries containerd's actual error — the failing
      // endpoint, an upstream `429 Too Many Requests`, a `connection
      // refused`. The bare `reason` (`ErrImagePull`) hides all of that,
      // and it is exactly what's needed when a pull won't settle.
      for (const status of statuses) {
        const detail =
          status.state?.waiting?.message ?? status.state?.terminated?.message ?? "";
        if (detail) report.push(` ${status.name}: ${squash(detail)}`);
      }
    }
  } catch (err) {
    report.push(`(listing pods failed: ${(err as Error)?.message ?? String(err)})`);
  }

  // Warning events expose pull failures the kubelet emits before a container
  // status even settles (FailedPull / Failed / BackOff), with the raw
  // containerd message attached.
  try {
    const eventList = await helpers.client.core.listNamespacedEvent({
      namespace: "kube-system",
    });
    const warnings: Array<{ obj: string; reason: string; message: string }> = [];
    for (const event of eventList.items ?? []) {
      if (event.type !== "Warning") continue;
      const entry = {
        obj: event.involvedObject?.name ?? "?",
        reason: event.reason ?? "?",
        message: squash(event.message ?? ""),
      };
      if (entry.message) warnings.push(entry);
    }
    if (warnings.length) {
      report.push(`kube-system Warning events (${warnings.length}):`);
      // Keep the tail — newest events are appended last by the API.
      for (const warning of warnings.slice(-12)) {
        report.push(` ${warning.obj} [${warning.reason}] ${warning.message}`);
      }
    }
  } catch (err) {
    report.push(`(listing events failed: ${(err as Error)?.message ?? String(err)})`);
  }

  return report.join("\n");
}
1059
+
1060
/**
 * Default k3s docker image tag.
 *
 * The cluster's system images (the `rancher/k3s` image itself, plus
 * coredns / local-path-provisioner / pause and the Traefik we deploy)
 * are pulled on first boot through the host `zot` pull-through cache —
 * `registries.yaml` (seeded via `files`) mirrors `docker.io` and
 * `registry.k8s.io` at it. The first-ever cluster boot on a cold-cache
 * host pays the upstream pull once; thereafter zot serves the blobs
 * host-wide and the warm-template snapshot captures the booted cluster,
 * so neither cold-cache nor warm starts re-pull. Any `opts.version`
 * works — there's no base-snapshot release to keep in sync with.
 *
 * **Why v1.32.x:** the component runs kube-proxy in `nftables` proxy mode
 * to sidestep Freestyle's missing `xt_comment` netfilter extension. That
 * mode is usable without a feature gate from k8s 1.31 (it is alpha-gated
 * in 1.30); dropping below 1.31 reintroduces the broken iptables path.
 *
 * NOTE(review): this comment previously said nftables mode is "GA in
 * k8s 1.32". Upstream Kubernetes documentation appears to list it as beta
 * in 1.31–1.32 and stable from 1.33 — confirm before relying on "GA".
 */
const DEFAULT_K3S_VERSION = "v1.32.1-k3s1";

/**
 * A ready-to-use single-node Kubernetes cluster (k3s). Drop into
 * `environment.services`:
 *
 * ```ts
 * services: { k8s: k3s() }
 * ```
 *
 * Tests get `@kubernetes/client-node` API objects pre-wired to this
 * cluster at `ctx.svc.<key>.client` — `core`, `apps`, and a generic
 * `objects` (`KubernetesObjectApi`). Every API call is recorded on the
 * test event log alongside `fetch` calls. There's also `apply(yaml)`
 * sugar for piping a multi-document manifest in.
 *
 * **Ingress.** We deploy Traefik ourselves in `hostNetwork` mode
 * during `setup()`. Traefik binds the cluster container's :80 directly
 * (no ServiceLB / klipper-lb needed), watches Ingress objects via the
 * API, and routes incoming requests to pod Endpoints. Any `hostnames`
 * declared on this service in env.ts therefore route through Traefik:
 * a peer doing `fetch("http://app.example.com")` resolves the host to
 * the k3s container's IP (via spectest-resolver), lands on Traefik,
 * and gets dispatched to the matching Ingress rule's backend pods.
 *
 * **Workarounds for Freestyle's kernel** (Linux 6.1.0-x-freestyle).
 * The stock kernel is missing the `xt_comment` netfilter match
 * extension. Two consequences, each handled below:
 *
 *   1. *kube-proxy* in default iptables mode generates rules with
 *      `-m comment --comment "..."`, which the kernel rejects —
 *      breaking pod→ClusterIP routing and every pod that talks to the
 *      in-cluster API (helm-install Jobs, CoreDNS, …). Fixed by
 *      `--kube-proxy-arg=proxy-mode=nftables`: kube-proxy emits
 *      native nftables rules where comments are a first-class
 *      construct, no xt_comment dependency. The default k3s version
 *      is pinned to a release where that mode needs no feature gate.
 *
 *   2. *CNI portmap plugin* (used by klipper-lb's hostPort to expose
 *      LoadBalancer ports on the host) still uses iptables-nft and
 *      hits the same xt_comment failure — there's no equivalent
 *      flag to switch it to native nftables. Workaround: disable the
 *      bundled traefik + ServiceLB and run Traefik with
 *      `hostNetwork: true` ourselves. hostNetwork pods don't go
 *      through portmap at all (they share the node's netns directly),
 *      so the broken plugin is never invoked.
 *
 * Flannel uses the `host-gw` backend because Freestyle's stock kernel
 * lacks the VXLAN module — fine for single-node clusters.
 */
1127
+
1128
export function k3s(opts: K3sOptions = {}) {
  // Builds the service definition for a single-node k3s cluster: the shell
  // command that boots `k3s server`, the readiness probe, the post-Ready
  // `setup` hook, and the `helpers` factory that wires Kubernetes API clients
  // to the running cluster. When `ingressDomains` is non-empty the definition
  // is additionally wrapped with wildcard `dnsName` provisions.
  const version = opts.version ?? DEFAULT_K3S_VERSION;
  // Each `extraArgs` element is ONE argument for `k3s server`, but the whole
  // command line is run through `/bin/sh -c` (see `cmd` below). Single-quote
  // any element containing characters outside a conservative shell-safe set,
  // so values with spaces, `<`, `;`, `$`, quotes, … reach k3s verbatim
  // instead of being split or interpreted by the shell. Shell-safe elements
  // (the common case, e.g. `--tls-san=foo`) are emitted unchanged.
  const shellQuote = (arg: string): string =>
    /^[A-Za-z0-9_\-+=.,:\/@%]*$/.test(arg)
      ? arg
      : "'" + arg.replace(/'/g, "'\\''") + "'";
  const extra = (opts.extraArgs ?? []).map(shellQuote);
  const registryEnabled = opts.registry !== false;
  // Wildcard ingress domains. Drives both the `provides(... dnsName)`
  // wiring below and (when non-empty) the CA-signed TLS default cert that
  // setupK3sCluster mints so these domains are reachable over HTTPS.
  const ingressDomains = opts.ingressDomains ?? [];
  // `/etc/rancher/k3s/registries.yaml` (host-cache mirrors + trust for
  // the in-cluster registry). Seeded via `files` because k3s reads it
  // only at startup, before any setup hook could run.
  const registriesYaml = buildRegistriesYaml(registryEnabled);
  const serverArgs = [
    "k3s",
    "server",
    // CoreDNS / pod DNS upstream. Without this, k3s sees only the
    // loopback 127.0.0.11 (Docker's embedded DNS) in the container's
    // /etc/resolv.conf, decides no usable nameserver exists, and writes a
    // fallback `nameserver 8.8.8.8` that CoreDNS then forwards to — so
    // pods reach the public internet but NOT peer services on
    // spectest-net (`<svc>.internal`, fakes, service-TLS hosts all
    // NXDOMAIN). We instead point k3s at the container's default gateway
    // — the spectest-net bridge gateway, where spectest-resolver binds a
    // second listener for exactly this. The file is written by the
    // command wrapper below because the gateway IP is only known at
    // container start.
    "--resolv-conf=/run/spectest-resolv.conf",
    // metrics-server isn't useful in a test cluster.
    "--disable=metrics-server",
    // traefik + servicelb disabled: their klipper-lb DaemonSet uses
    // CNI portmap to bind host port 80, which still needs xt_comment
    // (the iptables compat path that the kernel can't satisfy).
    // We install Traefik with hostNetwork in setup() — same effect,
    // no portmap involved. local-storage stays enabled: it's a
    // controller pod that doesn't bind host ports.
    "--disable=traefik",
    "--disable=servicelb",
    // Pod CIDR MUST avoid 10.42.0.0/16: that's the spectest-br0 host
    // bridge subnet, whose gateway 10.42.0.1 fronts the host image caches
    // (zot :5000-5007, buildkitd :1234). k3s's *default* pod CIDR is also
    // 10.42.0.0/16 — with --flannel-backend=host-gw, flannel programs that
    // route into the node's own routing table and gives cni0 the subnet's
    // .1 (10.42.0.1). That shadows the route to the host gateway, so once
    // CNI comes up the node can no longer reach 10.42.0.1 and every
    // subsequent registry pull dies with "connect: connection refused"
    // (e.g. the in-cluster registry's `registry:2`, applied after the
    // cluster is up — the airgap-bundled system images pull *before* CNI
    // and so sneak through). Move pods to 10.44/service to 10.45.
    "--cluster-cidr=10.44.0.0/16",
    "--service-cidr=10.45.0.0/16",
    "--cluster-dns=10.45.0.10",
    "--flannel-backend=host-gw",
    "--write-kubeconfig-mode=644",
    // kube-proxy in nftables mode: native nftables rules, no
    // xt_comment dependency. Pod→ClusterIP routing works, so
    // CoreDNS / helm-install / anything-talking-to-the-API works.
    // Usable without a feature gate from k8s 1.31 onwards.
    "--kube-proxy-arg=proxy-mode=nftables",
    ...extra,
  ].join(" ");
  // The service `command` runs under `/bin/sh -c` (see runContainer in
  // daemon.ts), so derive the bridge gateway from the container's default
  // route at start time, write it as the k3s resolv-conf, then exec k3s
  // (exec so it stays the container's main process and signals / the
  // readyCheck behave exactly as before). `/run` is a tmpfs on this
  // service, so the file is writable and never persisted into a snapshot.
  const cmd =
    "GW=\"$(ip route 2>/dev/null | awk '/^default/{print $3; exit}')\"; " +
    'if [ -n "$GW" ]; then ' +
    "printf 'nameserver %s\\noptions ndots:0\\n' \"$GW\" > /run/spectest-resolv.conf; " +
    "else echo 'spectest: no default gateway found; k3s pod DNS for peer services will not resolve' >&2; " +
    ": > /run/spectest-resolv.conf; fi; " +
    `exec ${serverArgs}`;
  // Plain /readyz probe. On a warm zot cache the cluster's images are
  // already local, so the first boot completes in seconds; the
  // first-ever boot on a cold-cache host pulls through the mirror and
  // can take a couple of minutes (covered by readyTimeoutSecs).
  const readyCmd = "kubectl get --raw=/readyz >/dev/null 2>&1";
  const def = {
    image: { type: "registry", reference: `rancher/k3s:${version}` },
    command: cmd,
    privileged: true,
    tmpfs: ["/run", "/var/run"],
    cgroupns: "host",
    // 80/443 are advisory — peer services and host code reach them via
    // the k3s container's IP. The hostNetwork Traefik pod installed in
    // setup() binds them directly inside the container's netns (ServiceLB /
    // klipper-lb is disabled above). 5000 is the in-cluster registry
    // (hostNetwork pod bound to the container netns), reached by peers at
    // the cluster's own `<key>.internal:5000` alias.
    ports: registryEnabled ? [80, 443, 6443, K3S_REGISTRY_PORT] : [80, 443, 6443],
    // NOTE: do NOT mount /var/lib/rancher/k3s/agent/containerd as a cache
    // volume. It was tried (to spare a recreated cluster re-pulling its
    // system images on delta restores) and a fresh k3s server against the
    // previous container's containerd store — killed un-cleanly by the
    // teardown's `docker rm -f` — wedged the apiserver minutes in
    // (rollouts never settled, pod listing started failing). The zot
    // mirror already makes those re-pulls cheap; the residual win wasn't
    // worth the recovery semantics of a crash-state store under a fresh
    // cluster db.
    ...(registriesYaml
      ? {
          files: [
            { path: "/etc/rancher/k3s/registries.yaml", content: registriesYaml },
          ],
        }
      : {}),
    readyCheck: {
      type: "exec" as const,
      command: readyCmd,
      timeoutSecs: opts.readyTimeoutSecs ?? 120,
    },
    setup: async ({ name, helpers }: { name: string; helpers: K3sHelpers }) => {
      await setupK3sCluster(name, helpers, {
        registry: registryEnabled,
        ingressDomains,
      });
    },
    helpers: async ({ name }: { name: string }): Promise<K3sHelpers> => {
      // Read the cluster's kubeconfig and address the API server by its
      // auto-assigned `<name>.internal` hostname on spectest-net. TLS
      // verification is off (see the K3sHelpers docstring), so the
      // server's cert SAN list doesn't need to include the .internal
      // name.
      const kcRead = await runDocker([
        "exec",
        name,
        "cat",
        "/etc/rancher/k3s/k3s.yaml",
      ]);
      if (kcRead.code !== 0) {
        throw new Error(
          `k3s(${name}): failed to read kubeconfig from container: ${kcRead.stderr.trim()}`,
        );
      }

      const kubeconfig = new KubeConfig();
      kubeconfig.loadFromString(kcRead.stdout);

      const server = `https://${name}.internal:6443`;
      // Update kc.clusters so any code that reads kubeconfig sees the
      // right server URL, but the actual request server comes from the
      // Configuration we build below. `Cluster.server` is typed `readonly`
      // by @kubernetes/client-node, but the loaded object is a plain mutable
      // record — write through a mutable view rather than rebuild the config.
      for (const cluster of kubeconfig.clusters) {
        (cluster as { -readonly [K in keyof typeof cluster]: typeof cluster[K] }).server =
          server;
      }

      const httpApi = new FetchHttpLibrary();
      const baseServer = new ServerConfiguration(server, {});
      const config = createConfiguration({
        baseServer,
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        authMethods: { default: kubeconfig as any },
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        httpApi: httpApi as any,
      });

      const core = withTagging(new CoreV1Api(config));
      const apps = withTagging(new AppsV1Api(config));
      const objects = withTagging(new KubernetesObjectApi(config));

      // Create every object of a (possibly multi-document) YAML manifest, in
      // document order, and return the created objects. Documents that are
      // empty or lack a `kind` are skipped.
      const apply = async (
        manifest: string,
      ): Promise<Wrapped<KubernetesObject>[]> => {
        const docs = loadAllYaml(manifest) as KubernetesObject[];
        const out: Wrapped<KubernetesObject>[] = [];
        for (const doc of docs) {
          if (!doc || typeof doc !== "object" || !("kind" in doc)) continue;
          // `objects.create` is wrapped by `withTagging`, so each returned
          // object already carries the back-reference to its create call.
          const created = await objects.create(doc);
          out.push(created as unknown as Wrapped<KubernetesObject>);
        }
        return out;
      };

      return {
        kubeconfig,
        client: { core, apps, objects },
        apply,
      };
    },
  } satisfies ServiceDefinition<K3sHelpers>;

  // Wildcard ingress domains → a dnsName(`*.<domain>`, { service: self })
  // each, attached via provides(). SELF_SERVICE_TOKEN resolves to this
  // service's key at load time (the component can't know it here). The
  // resolver then points every host under the domain at the cluster.
  if (ingressDomains.length === 0) return def;
  return provides(
    def,
    ingressDomains.map((domain) =>
      dnsName(`*.${domain}`, { service: SELF_SERVICE_TOKEN }),
    ),
  );
}