@decocms/start 6.0.1 → 6.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,489 @@
1
+ /**
2
+ * OTLP/HTTP JSON trace exporter — direct POST from a Cloudflare Worker
3
+ * to `deco-otel-ingest` `/v1/traces`.
4
+ *
5
+ * Mirrors `otelHttpMeter.ts` in shape: per-isolate buffer, flush via
6
+ * `ctx.waitUntil` at request end, cooldown + buffer cap.
7
+ *
8
+ * **Why direct POST instead of CF Destinations + `@opentelemetry/api` bridge.**
9
+ * The bridge tracer in `otel.ts` delegates to `trace.getTracer(...)`. Without
10
+ * a registered `TracerProvider`, that's a no-op proxy and every framework
11
+ * `deco.*` span we create silently disappears. Empirical verification (May
12
+ * 2026) on prod sites confirmed: zero `deco.*` rows in `otel_traces` despite
13
+ * `withTracing` being called on every request. This adapter ships those
14
+ * spans direct-POST, same channel as metrics + error logs.
15
+ *
16
+ * **Sampling.** Consistent per-trace decision via FNV-1a hash of the
17
+ * trace-id. Caller passes `headSamplingRate` (default 0.01 = 1%) to match
18
+ * the CF Destinations `traces.head_sampling_rate` default and keep volume
19
+ * within budget. Parent-based override: if an incoming W3C `traceparent`
20
+ * header carried `flags=01` (sampled), every span in that trace is
21
+ * exported regardless of the rate.
22
+ *
23
+ * **Buffer.** Spans are buffered per-isolate until `flush()` ships them.
24
+ * Unlike metrics (CUMULATIVE temporality, lifelong buffer), traces are
25
+ * one-shot per span: buffer drains on flush and starts empty for the next
26
+ * window. Cap (`maxBufferSpans`) protects against runaway trace volume.
27
+ *
28
+ * **Parent linkage.** `startSpan` reads the active span from a caller-
29
+ * supplied accessor (`getActiveSpanForParent`) — the framework wires this
30
+ * to `getActiveSpan` from `middleware/observability.ts` so child spans
31
+ * inherit `trace_id` + record `parent_span_id` automatically. Root spans
32
+ * (no active parent) consult `getRequestTraceContext` to pick up the
33
+ * incoming W3C traceparent, or generate a fresh trace.
34
+ */
35
+
36
+ import type { Span, TracerAdapter } from "../middleware/observability";
37
+
38
+ // ---------------------------------------------------------------------------
39
+ // W3C traceparent parsing
40
+ // ---------------------------------------------------------------------------
41
+
42
/**
 * Trace context recovered from an inbound W3C `traceparent` header, whose
 * wire format is `version-traceId-parentId-flags`. `parseTraceparent` is
 * the producer of these objects in this module.
 */
export interface TraceContext {
  /** Trace identifier taken verbatim from the header. */
  traceId: string;
  /** Span identifier of the caller (the header's `parentId` field). */
  parentSpanId: string;
  /** Whether the header's flags byte has the W3C "sampled" bit (`0x01`) set. */
  sampled: boolean;
  /**
   * `true` when the context was read from an inbound header, i.e. the
   * parent span lives in another service.
   */
  remoteParent: boolean;
}
55
+
56
/** Version-00 `traceparent`: `00-<32 hex>-<16 hex>-<2 hex>`, lowercase only. */
const TRACEPARENT_V00 = /^00-([0-9a-f]{32})-([0-9a-f]{16})-([0-9a-f]{2})$/;
/** All-zero IDs are structurally well-formed but invalid. */
const ALL_ZERO_TRACE_ID = "0".repeat(32);
const ALL_ZERO_SPAN_ID = "0".repeat(16);

/**
 * Parse a W3C `traceparent` header value.
 *
 * @param value - Raw header value; `null`, `undefined` and `""` are accepted
 *   and yield `null`.
 * @returns The parsed context (always with `remoteParent: true`), or `null`
 *   when the value is not exactly a version-`00` header with lowercase-hex
 *   fields, or when the trace ID or parent ID is all zeros.
 */
export function parseTraceparent(value: string | null | undefined): TraceContext | null {
  if (!value) return null;

  const match = TRACEPARENT_V00.exec(value);
  if (match === null) return null;

  const traceId = match[1];
  const parentSpanId = match[2];
  const flagsByte = Number.parseInt(match[3], 16);

  if (traceId === ALL_ZERO_TRACE_ID || parentSpanId === ALL_ZERO_SPAN_ID) {
    return null;
  }

  return {
    traceId,
    parentSpanId,
    // Only bit 0 ("sampled") of the flags byte is interpreted.
    sampled: (flagsByte & 0x01) !== 0,
    remoteParent: true,
  };
}
75
+
76
+ // ---------------------------------------------------------------------------
77
+ // ID generation + sampling
78
+ // ---------------------------------------------------------------------------
79
+
80
/**
 * Produce `bytes` random bytes rendered as a lowercase hex string of length
 * `bytes * 2`.
 *
 * Uses `crypto.getRandomValues` when the runtime provides it. Otherwise it
 * falls back to `Math.random()` (tests, some sandboxes): lower entropy, but
 * the length contract still holds — these IDs are operational breadcrumbs,
 * not secrets.
 *
 * @param bytes - Number of random bytes to generate.
 * @returns Lowercase hex string with exactly `bytes * 2` characters.
 */
function randomHex(bytes: number): string {
  const hexLength = bytes * 2;

  if (typeof crypto !== "undefined" && typeof crypto.getRandomValues === "function") {
    const buf = crypto.getRandomValues(new Uint8Array(bytes));
    return Array.from(buf, (b) => b.toString(16).padStart(2, "0")).join("");
  }

  let out = "";
  while (out.length < hexLength) {
    // 0x10000 (not 0xffff) so each 16-bit chunk covers the full
    // 0x0000..0xffff range; with 0xffff the value `ffff` was unreachable.
    out += Math.floor(Math.random() * 0x10000)
      .toString(16)
      .padStart(4, "0");
  }
  // The loop appends 4 hex chars at a time; trim for odd byte counts.
  return out.slice(0, hexLength);
}
102
+
103
/** Generate a fresh trace ID: 16 random bytes rendered as 32 hex characters. */
export function newTraceId(): string {
  const traceIdBytes = 16;
  return randomHex(traceIdBytes);
}
106
+
107
/** Generate a fresh span ID: 8 random bytes rendered as 16 hex characters. */
export function newSpanId(): string {
  const spanIdBytes = 8;
  return randomHex(spanIdBytes);
}
110
+
111
/**
 * 32-bit FNV-1a hash of the trace ID string, one UTF-16 code unit per
 * round. Dependency-free and cheap; used as the input to head sampling.
 *
 * @param traceId - String to hash (a hex trace ID in practice).
 * @returns The hash as an unsigned 32-bit integer.
 */
function hashTraceId(traceId: string): number {
  const offsetBasis = 0x811c9dc5; // 2166136261
  const prime = 0x01000193; // 16777619

  let hash = offsetBasis;
  let idx = 0;
  while (idx < traceId.length) {
    // XOR the code unit in, then multiply with 32-bit wraparound.
    hash = Math.imul(hash ^ traceId.charCodeAt(idx), prime);
    idx += 1;
  }
  // `Math.imul` yields a signed int32; reinterpret as unsigned.
  return hash >>> 0;
}
124
+
125
/**
 * Consistent head-sampling decision: the answer depends only on `traceId`
 * and `rate`, so every span of a trace gets the same verdict. Parent-based
 * overrides are the caller's responsibility.
 *
 * @param traceId - Trace identifier to hash.
 * @param rate - Fraction of traces to keep. Values `<= 0` (including
 *   `-Infinity`) keep nothing; values `>= 1`, `+Infinity` and `NaN` keep
 *   everything.
 * @returns `true` when the trace should be exported.
 */
export function shouldSampleTrace(traceId: string, rate: number): boolean {
  // Checked first so `-Infinity` means "off" rather than falling into the
  // non-finite branch below and sampling every trace.
  if (rate <= 0) return false;
  // NaN, +Infinity and anything >= 1: sample everything.
  if (!Number.isFinite(rate) || rate >= 1) return true;
  // Map the 32-bit hash onto [0, 1] and compare against the rate.
  return hashTraceId(traceId) / 0xffffffff < rate;
}
135
+
136
+ // ---------------------------------------------------------------------------
137
+ // Types
138
+ // ---------------------------------------------------------------------------
139
+
140
/** Attribute bag for spans and span events: primitive values keyed by name. */
type Labels = Record<string, string | number | boolean>;

/** OTLP `SpanKind` value for INTERNAL — the only kind this exporter emits. */
const SPAN_KIND_INTERNAL = 1;

/** Span status codes: `UNSET` is 0 (no constant here), `OK` is 1, `ERROR` is 2. */
const STATUS_OK = 1;
const STATUS_ERROR = 2;
147
+
148
/** A timestamped event attached to a span (e.g. a recorded exception). */
interface SpanEvent {
  /** Event name, e.g. `"exception"`. */
  name: string;
  /** Event time in nanoseconds since the Unix epoch, as a decimal string. */
  timeUnixNano: string;
  /** Attributes describing the event. */
  attributes: Labels;
}
153
+
154
/** In-memory representation of one span, from start until it is serialized. */
interface SpanRecord {
  /** Span name. */
  name: string;
  /** Trace this span belongs to. */
  traceId: string;
  /** This span's own identifier. */
  spanId: string;
  /** Identifier of the parent span; `""` when the span has no parent. */
  parentSpanId: string;
  /** Start time in nanoseconds since the Unix epoch, as a decimal string. */
  startTimeUnixNano: string;
  /** End time, same encoding; equals the start time until the span ends. */
  endTimeUnixNano: string;
  /** Span attributes; mutated while the span is live. */
  attributes: Labels;
  /** Events recorded on the span. */
  events: SpanEvent[];
  /** Status code (0 unset, 1 OK, 2 error) plus an optional message. */
  status: { code: number; message: string };
}
165
+
166
/** Configuration accepted by `createOtlpHttpTracerAdapter`. */
export interface OtlpHttpTracerOptions {
  /** Full OTLP/HTTP JSON traces endpoint, e.g. `https://.../v1/traces`. */
  endpoint: string;
  /** Resource attributes (service.name etc.) stamped on every payload. */
  resourceAttributes: Record<string, string>;
  /** Name advertised in `scopeSpans[].scope.name`. Default: `"@decocms/start"`. */
  scopeName?: string;
  /** Version advertised alongside the scope name. Default: `""`. */
  scopeVersion?: string;
  /**
   * Head sampling rate in the range 0.0..1.0. Default: 0.01, matching the
   * CF Destinations `traces.head_sampling_rate` recommendation. Use 1 to
   * capture every trace and 0 to disable export entirely.
   *
   * The decision is a function of the trace ID (FNV-1a hash), so it is the
   * same for every span of a trace: a `deco.cms.resolvePage` child span is
   * sampled iff the `deco.http.request` root of that trace is sampled.
   */
  headSamplingRate?: number;
  /** Maximum number of finished spans held in the buffer. Default: 2000. */
  maxBufferSpans?: number;
  /**
   * Minimum time (ms) between flush attempts; the clock restarts when an
   * attempt finishes, whether or not the POST succeeded. Default: 5000.
   */
  minFlushIntervalMs?: number;
  /** Timeout (ms) after which a flush POST is aborted. Default: 5000. */
  flushTimeoutMs?: number;
  /**
   * Test seam — replaces `fetch` on the flush path. Same role as in
   * `otelHttpMeter.ts`.
   */
  fetchImpl?: typeof fetch;
  /** Test seam — replaces `Date.now()` for deterministic timestamps. */
  nowMs?: () => number;
  /**
   * Optional sink for problems: `"flush"` for a failed or non-2xx POST,
   * `"overflow"` when a span is dropped because the buffer is full.
   */
  onError?: (kind: "flush" | "overflow", err: unknown) => void;
  /**
   * Returns the span that is active at the moment `startSpan` is called, or
   * `null` for root spans. The framework wires this to `getActiveSpan()`
   * from `middleware/observability.ts` so child spans inherit `trace_id`
   * and record `parent_span_id`.
   */
  getActiveSpanForParent: () => Span | null;
  /**
   * Returns the per-request trace context parsed from the inbound
   * `traceparent` header. Consulted only when a root span is created, so
   * remote parents and their `sampled` flag are honored.
   */
  getRequestTraceContext?: () => TraceContext | null;
}
214
+
215
/** A `TracerAdapter` that additionally exposes its export buffer. */
export interface OtlpHttpTracer extends TracerAdapter {
  /** Ship the buffered spans, unless the flush cooldown is still running. */
  flush(): Promise<void>;
  /** Number of finished spans waiting to ship (tests + the audit channel). */
  pendingSpanCount(): number;
}
221
+
222
+ // ---------------------------------------------------------------------------
223
+ // Factory
224
+ // ---------------------------------------------------------------------------
225
+
226
/**
 * Create a tracer adapter that keeps finished spans in an in-memory buffer
 * and ships them as a single OTLP/HTTP JSON `POST` per flush.
 *
 * - **Sampling** is decided when a span ends. A span is kept when the
 *   context it inherited carries the W3C sampled bit (from the inbound
 *   `traceparent` for root spans, from the in-process parent span for
 *   children); otherwise `shouldSampleTrace(traceId, headSamplingRate)`
 *   decides.
 * - **Buffer**: once `maxBufferSpans` finished spans are pending, further
 *   spans are dropped and reported through `onError("overflow", …)`.
 * - **Flush**: the buffer is emptied before the request is sent, so a batch
 *   whose POST fails, times out or gets a non-2xx answer is reported through
 *   `onError("flush", …)` and not retried. Flushes are rate-limited by
 *   `minFlushIntervalMs` unless the buffer is at its cap, and concurrent
 *   callers share the in-flight flush.
 *
 * @param options - Endpoint, resource/scope metadata, limits and accessors;
 *   see `OtlpHttpTracerOptions` for defaults.
 * @returns An adapter exposing `startSpan`, `flush` and `pendingSpanCount`.
 */
export function createOtlpHttpTracerAdapter(options: OtlpHttpTracerOptions): OtlpHttpTracer {
  const endpoint = options.endpoint;
  const resourceAttributes = options.resourceAttributes;
  const scopeName = options.scopeName ?? "@decocms/start";
  const scopeVersion = options.scopeVersion ?? "";
  const headSamplingRate = options.headSamplingRate ?? 0.01;
  const maxBuffer = options.maxBufferSpans ?? 2000;
  const minFlushIntervalMs = options.minFlushIntervalMs ?? 5000;
  const flushTimeoutMs = options.flushTimeoutMs ?? 5000;
  const fetchImpl = options.fetchImpl ?? fetch;
  const now = options.nowMs ?? (() => Date.now());
  const onError = options.onError;
  const getActiveSpanForParent = options.getActiveSpanForParent;
  const getRequestTraceContext = options.getRequestTraceContext;

  // Finished, sampled spans waiting to ship.
  const spans: SpanRecord[] = [];
  // Time (per `now()`) at which the last flush attempt finished.
  let lastFlushAt = 0;
  // The flush currently running, shared by concurrent `flush()` callers.
  let inflight: Promise<void> | null = null;

  function pendingSpanCount(): number {
    return spans.length;
  }

  /**
   * Decide whether to export a finished span. Honors:
   *   1. The inherited sampled bit (always wins — joins external traces).
   *   2. Trace-ID hash vs `headSamplingRate` (consistent per trace).
   */
  function shouldExportSpan(traceId: string, parentSampled: boolean): boolean {
    if (parentSampled) return true;
    return shouldSampleTrace(traceId, headSamplingRate);
  }

  function startSpan(name: string, attributes?: Labels): Span {
    const parent = getActiveSpanForParent();
    const parentCtx = parent?.spanContext?.();

    // Inherit the trace ID from the parent (in-process or remote). Only root
    // spans with no parent context at all generate a fresh trace ID.
    const remoteCtx = parentCtx ? null : (getRequestTraceContext?.() ?? null);
    const traceId = parentCtx?.traceId ?? remoteCtx?.traceId ?? newTraceId();
    const parentSpanId = parentCtx?.spanId ?? remoteCtx?.parentSpanId ?? "";
    // Flags are propagated unchanged: a root span takes the sampled bit from
    // the inbound traceparent, a child span copies its in-process parent.
    const traceFlags = remoteCtx?.sampled ? 0x01 : (parentCtx?.traceFlags ?? 0x00);
    // Parent-based sampling must look at the inherited flags, not only at
    // `remoteCtx`: child spans never see `remoteCtx` (it is only read when
    // there is no in-process parent), so checking it alone would export the
    // root of a remotely sampled trace but drop its children.
    const parentSampled = (traceFlags & 0x01) === 0x01;

    const spanId = newSpanId();
    const startTimeNs = msToNs(now());
    const record: SpanRecord = {
      name,
      traceId,
      spanId,
      parentSpanId,
      startTimeUnixNano: startTimeNs,
      endTimeUnixNano: startTimeNs, // overwritten on end()
      attributes: { ...(attributes ?? {}) },
      events: [],
      status: { code: 0, message: "" },
    };

    let ended = false;

    return {
      end(): void {
        // Idempotent: only the first call records the end time and buffers.
        if (ended) return;
        ended = true;
        record.endTimeUnixNano = msToNs(now());

        // Sample at the END so attribute mutations during the span are
        // captured in the kept record. The verdict is the same for every
        // span of the trace (inherited flag or trace-ID hash).
        if (!shouldExportSpan(traceId, parentSampled)) return;

        if (spans.length >= maxBuffer) {
          onError?.("overflow", new Error(`trace buffer at cap (${maxBuffer}) — dropping span`));
          return;
        }
        spans.push(record);
      },
      setError(error: unknown): void {
        const message = error instanceof Error ? error.message : String(error);
        // The same exception details go on the span itself and on an
        // "exception" event.
        const exceptionAttrs: Labels = {
          "exception.type": error instanceof Error ? error.constructor.name : "unknown",
          "exception.message": message,
        };
        if (error instanceof Error && error.stack) {
          exceptionAttrs["exception.stacktrace"] = error.stack;
        }

        record.status = { code: STATUS_ERROR, message };
        Object.assign(record.attributes, exceptionAttrs);
        record.events.push({
          name: "exception",
          timeUnixNano: msToNs(now()),
          attributes: { ...exceptionAttrs },
        });
      },
      setAttribute(key: string, value: string | number | boolean): void {
        record.attributes[key] = value;
        // Status promotion: an HTTP status below 400 moves an UNSET (0) span
        // to OK so dashboards see "succeeded" explicitly. An ERROR status is
        // never overwritten.
        if (
          key === "http.status_code" &&
          typeof value === "number" &&
          value < 400 &&
          record.status.code === 0
        ) {
          record.status = { code: STATUS_OK, message: "" };
        }
      },
      spanContext(): { traceId: string; spanId: string; traceFlags: number } {
        return { traceId, spanId, traceFlags };
      },
    };
  }

  async function doFlush(): Promise<void> {
    if (spans.length === 0) return;

    // Snapshot + reset the buffer before the network call so spans that end
    // during the POST land in the next window.
    const batch = spans.splice(0, spans.length);

    const payload = serializeOtlpTraces(batch, {
      resourceAttributes,
      scopeName,
      scopeVersion,
    });

    // Abort the request if it outlives `flushTimeoutMs`.
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), flushTimeoutMs);
    try {
      const res = await fetchImpl(endpoint, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify(payload),
        signal: controller.signal,
      });
      if (!res.ok) {
        // Read and discard the error body; failures while reading are ignored.
        try {
          await res.text();
        } catch {
          /* swallow */
        }
        onError?.("flush", new Error(`POST ${endpoint} → ${res.status}`));
      }
    } catch (err) {
      onError?.("flush", err);
    } finally {
      clearTimeout(timer);
    }
  }

  async function flush(): Promise<void> {
    if (inflight) return inflight;
    // Nothing buffered: return without touching `lastFlushAt`, so a no-op
    // flush does not start a cooldown that would delay the next real batch.
    if (spans.length === 0) return;
    const elapsed = now() - lastFlushAt;
    const overCap = spans.length >= maxBuffer;
    // A full buffer bypasses the cooldown so new spans are not dropped.
    if (!overCap && elapsed < minFlushIntervalMs) return;
    inflight = doFlush().finally(() => {
      lastFlushAt = now();
      inflight = null;
    });
    return inflight;
  }

  return {
    startSpan,
    flush,
    pendingSpanCount,
  };
}
401
+
402
+ // ---------------------------------------------------------------------------
403
+ // OTLP/HTTP JSON serialization for traces
404
+ // ---------------------------------------------------------------------------
405
+
406
/**
 * Convert epoch milliseconds into the decimal-string nanosecond form used
 * for OTLP timestamps. The fractional part of `ms` is discarded and six
 * zeros are appended as text, so no precision is lost to floating point.
 */
function msToNs(ms: number): string {
  const wholeMs = Math.floor(ms);
  return String(wholeMs) + "000000";
}
409
+
410
/**
 * Convert an attribute bag into the OTLP/JSON `KeyValue[]` form, ordered by
 * key so the output is deterministic.
 *
 * Value mapping:
 * - string → `stringValue`
 * - boolean → `boolValue`
 * - integer with magnitude below 2^63 → `intValue` (decimal string)
 * - any other finite number → `doubleValue`
 * - `NaN` / `±Infinity` → `stringValue` (`"NaN"`, `"Infinity"`, `"-Infinity"`),
 *   because `JSON.stringify` would turn them into `null`
 * - `undefined` / `null` entries are skipped
 *
 * @param attrs - Attributes to convert; not mutated.
 * @returns One `{ key, value }` entry per kept attribute.
 */
function attrsToOtlp(attrs: Record<string, string | number | boolean>): Array<{
  key: string;
  value:
    | { stringValue: string }
    | { intValue: string }
    | { doubleValue: number }
    | { boolValue: boolean };
}> {
  // Integers at or beyond ±2^63 do not fit an OTLP int64 (and from 1e21 on
  // `String()` switches to exponent notation), so they are sent as doubles.
  const int64Limit = 2 ** 63;

  const out: ReturnType<typeof attrsToOtlp> = [];
  for (const key of Object.keys(attrs).sort()) {
    const v = attrs[key];
    if (v === undefined || v === null) continue;
    if (typeof v === "string") {
      out.push({ key, value: { stringValue: v } });
    } else if (typeof v === "boolean") {
      out.push({ key, value: { boolValue: v } });
    } else if (!Number.isFinite(v)) {
      out.push({ key, value: { stringValue: String(v) } });
    } else if (Number.isInteger(v) && Math.abs(v) < int64Limit) {
      out.push({ key, value: { intValue: String(v) } });
    } else {
      out.push({ key, value: { doubleValue: v } });
    }
  }
  return out;
}
429
+
430
/** Payload-level metadata needed by `serializeOtlpTraces`. */
interface SerializeOpts {
  /** Attributes placed on the payload's single `resource`. */
  resourceAttributes: Record<string, string>;
  /** Value for `scope.name`. */
  scopeName: string;
  /** Value for `scope.version`. */
  scopeVersion: string;
}
435
+
436
/**
 * Build the OTLP/HTTP JSON body for a batch of spans: one `resourceSpans`
 * entry holding one `scopeSpans` entry with every span of the batch.
 *
 * @param batch - Finished spans to serialize; not mutated.
 * @param opts - Resource attributes and scope name/version for the payload.
 * @returns The object to `JSON.stringify` and POST.
 */
function serializeOtlpTraces(
  batch: SpanRecord[],
  opts: SerializeOpts,
): { resourceSpans: unknown[] } {
  const otlpSpans: unknown[] = [];
  for (const span of batch) {
    // `message` is only present when it is non-empty.
    const status: { code: number; message?: string } = { code: span.status.code };
    if (span.status.message) status.message = span.status.message;

    const events: unknown[] = [];
    for (const event of span.events) {
      events.push({
        timeUnixNano: event.timeUnixNano,
        name: event.name,
        attributes: attrsToOtlp(event.attributes),
      });
    }

    otlpSpans.push({
      traceId: span.traceId,
      spanId: span.spanId,
      parentSpanId: span.parentSpanId,
      name: span.name,
      kind: SPAN_KIND_INTERNAL,
      startTimeUnixNano: span.startTimeUnixNano,
      endTimeUnixNano: span.endTimeUnixNano,
      attributes: attrsToOtlp(span.attributes),
      status,
      events,
    });
  }

  // Resource attributes are all strings; emit them in key order.
  const resourceAttrs: Array<{ key: string; value: { stringValue: string } }> = Object.keys(
    opts.resourceAttributes,
  )
    .sort()
    .map((key) => ({ key, value: { stringValue: opts.resourceAttributes[key] } }));

  const scopeEntry = {
    scope: { name: opts.scopeName, version: opts.scopeVersion },
    spans: otlpSpans,
  };
  const resourceEntry = {
    resource: { attributes: resourceAttrs },
    scopeSpans: [scopeEntry],
  };
  return { resourceSpans: [resourceEntry] };
}
479
+
480
/**
 * Unit-test seam exposing module internals. Not for application code: this
 * surface is unstable and may change between minor releases.
 */
export const _internals = {
  parseTraceparent: parseTraceparent,
  shouldSampleTrace: shouldSampleTrace,
  newTraceId: newTraceId,
  newSpanId: newSpanId,
};
@@ -38,6 +38,22 @@ export interface RequestContextData {
38
38
  request: Request;
39
39
  signal: AbortSignal;
40
40
  startedAt: number;
41
+ /**
42
+ * Stable per-request identifier. Generated by `workerEntry.ts` from
43
+ * `cf-ray` when present, falling back to `crypto.randomUUID()`. Echoed
44
+ * back to the client as the `X-Request-Id` response header so:
45
+ *
46
+ * - The producer Worker can stamp it on every log / span / metric
47
+ * attribute (`request.id`) for join queries.
48
+ * - The CF tail worker reads it from the response header and stamps
49
+ * the same `request.id` on tail rows — restores the join key on a
50
+ * channel that has no other shared identity with direct-POST.
51
+ * - Clients can include it in support requests so we can pull the
52
+ * full trace + logs from one ID.
53
+ *
54
+ * See `MIGRATION_TOOLING_PLAN.md` D-9 for the propagation policy.
55
+ */
56
+ requestId: string;
41
57
  /** Lazily computed device type. */
42
58
  _device?: "mobile" | "desktop";
43
59
  /** Lazily computed bot detection result. */
@@ -93,10 +109,30 @@ export const RequestContext = {
93
109
  }
94
110
  }
95
111
 
112
+ // Resolve request.id with explicit precedence so the choice is
113
+ // auditable from a single read:
114
+ // (1) inbound `x-request-id` — caller-supplied (e.g. a load balancer
115
+ // already tagged the request, or a synthetic test passed one in)
116
+ // (2) `cf-ray` — Cloudflare's own per-request identifier, present
117
+ // on every real Workers request and globally unique
118
+ // (3) `crypto.randomUUID()` — last resort for tests and locally-run
119
+ // dev without `cf-ray`
120
+ const incomingReqId = request.headers.get("x-request-id");
121
+ const cfRay = request.headers.get("cf-ray");
122
+ const requestId =
123
+ incomingReqId && incomingReqId.length > 0
124
+ ? incomingReqId
125
+ : cfRay && cfRay.length > 0
126
+ ? cfRay
127
+ : (typeof crypto !== "undefined" && typeof crypto.randomUUID === "function"
128
+ ? crypto.randomUUID()
129
+ : `${Date.now()}-${Math.random().toString(16).slice(2)}`);
130
+
96
131
  const ctx: RequestContextData = {
97
132
  request,
98
133
  signal: controller.signal,
99
134
  startedAt: Date.now(),
135
+ requestId,
100
136
  bag: new Map(),
101
137
  responseHeaders: new Headers(),
102
138
  };
@@ -131,6 +167,16 @@ export const RequestContext = {
131
167
  return ctx.signal;
132
168
  },
133
169
 
170
/**
 * The stable per-request identifier — see `RequestContextData.requestId`
 * for the full propagation policy. Yields `null` instead of throwing when
 * `storage.getStore()` has no store (i.e. outside a request scope),
 * because observability helpers in the shutdown path of a Worker may read
 * it lazily.
 */
get requestId(): string | null {
  const store = storage.getStore();
  return store?.requestId ?? null;
},
179
+
134
180
  /**
135
181
  * Detected device type based on User-Agent.
136
182
  */