@xeonr/upload-pool-sdk 1.3.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/tracing.ts ADDED
@@ -0,0 +1,363 @@
1
+ /**
2
+ * OpenTelemetry tracing setup for the worker SDK.
3
+ *
4
+ * Spans are exported via OTLP/HTTP-JSON to pipeline-api's `/v1/traces`
5
+ * receiver, authenticated with the pool's queue token. The pipeline-api
6
+ * receiver decodes the OTLP envelope and writes spans to its MySQL trace
7
+ * store, which backs the `/admin/workflow-run` admin UI — so worker
8
+ * callbacks nest naturally under the dispatching parseUpload trace.
9
+ *
10
+ * Design notes (see also: discussion in epic plan):
11
+ * - The TracerProvider is private to the SDK. It does *not* register
12
+ * globally — calling `trace.getTracer(...)` outside the SDK won't pick
13
+ * it up. This avoids interfering with any OTel setup the host
14
+ * application already has (e.g. its own collector → Grafana
15
+ * pipeline). The exporter ships SDK-emitted spans only.
16
+ * - `pipeline.run_id` is stamped on every emitted span as an attribute,
17
+ * mirrored from the job envelope's `wf_run_id` field. The pipeline-api
18
+ * receiver uses this attribute to associate the span with the right
19
+ * workflow run in `workflow_run_spans`.
20
+ * - The W3C traceparent propagator runs on outbound RPC calls *only when
21
+ * targeting the pipeline endpoint*. The uploads-api side has its own
22
+ * OTel pipeline and we don't want to inject our trace_id into its
23
+ * trees. See the interceptor in rpc-clients.ts.
24
+ *
25
+ * Body sanitization for `rpc.request` / `rpc.response` span events lives
26
+ * in the interceptor — keys matching /(token|secret|password|authorization
27
+ * |api[_-]?key|signature|^sig$|cookie)/i are redacted; payloads truncated
28
+ * to MAX_BODY_EVENT_BYTES.
29
+ */
30
+ import {
31
+ context,
32
+ createContextKey,
33
+ trace,
34
+ type Context,
35
+ type Span,
36
+ type SpanContext,
37
+ type Tracer,
38
+ TraceFlags,
39
+ SpanKind,
40
+ SpanStatusCode,
41
+ } from "@opentelemetry/api";
42
+ import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
43
+ import { Resource } from "@opentelemetry/resources";
44
+ import {
45
+ BatchSpanProcessor,
46
+ BasicTracerProvider,
47
+ } from "@opentelemetry/sdk-trace-base";
48
+ import {
49
+ ATTR_SERVICE_NAME,
50
+ ATTR_SERVICE_VERSION,
51
+ } from "@opentelemetry/semantic-conventions";
52
+
53
+ const TRACER_NAME = "@xeonr/upload-pool-sdk";
54
+
55
+ /**
56
+ * OTel context key carrying per-job attributes. Set once at the top of
57
+ * Pool.handleDispatch via `withJobContext()` and read by helpers that
58
+ * create child spans (interceptors, ctx.* spans) so every emitted span
59
+ * carries `pipeline.run_id` + identifiers — without it the trace-store
60
+ * receiver drops the span as orphan.
61
+ *
62
+ * The OTel `Span` API surface deliberately doesn't expose attributes
63
+ * back to read sites (only set/record/event), so we can't inherit them
64
+ * from a parent span at child-creation time. Carrying them in the
65
+ * Context object is the canonical OTel pattern.
66
+ */
67
+ export const JOB_ATTRS_KEY = createContextKey("uplim.job_attrs");
68
+
69
+ export interface JobAttrs {
70
+ runId: string;
71
+ jobId: string;
72
+ uploadId: string;
73
+ urn: string;
74
+ workerId: string;
75
+ }
76
+
77
+ export function getJobAttrs(): JobAttrs | undefined {
78
+ return context.active().getValue(JOB_ATTRS_KEY) as JobAttrs | undefined;
79
+ }
80
+
81
+ export function withJobContext<T>(attrs: JobAttrs, fn: () => T): T {
82
+ return context.with(context.active().setValue(JOB_ATTRS_KEY, attrs), fn);
83
+ }
84
+
85
+ export const SPAN_ATTR = {
86
+ JOB_ID: "iq.job_id",
87
+ UPLOAD_ID: "iq.upload_id",
88
+ URN: "iq.content_type_urn",
89
+ WORKER_ID: "iq.worker_id",
90
+ POOL_TOKEN_PREFIX: "iq.pool_token_prefix",
91
+ // Matches pipeline-api's TRACE_ATTRIBUTE.RUN_ID. The trace-store
92
+ // exporter reads this attribute to associate each span with the
93
+ // right workflow_run in MySQL (workflow_run_spans.run_id). Worker
94
+ // spans MUST carry it or they're dropped on the floor by the
95
+ // pipeline-api receiver.
96
+ RUN_ID: "pipeline.run_id",
97
+ RPC_METHOD: "rpc.method",
98
+ RPC_SERVICE: "rpc.service",
99
+ RPC_TARGET: "rpc.target",
100
+ RPC_CODE: "rpc.code",
101
+ RPC_DURATION_MS: "rpc.duration_ms",
102
+ S3_BYTES: "s3.bytes",
103
+ S3_URL_ORIGIN: "s3.url_origin",
104
+ S3_OUTCOME: "s3.outcome",
105
+ } as const;
106
+
107
+ export interface TracingInitConfig {
108
+ pipelineEndpoint: string;
109
+ poolToken: string;
110
+ workerId: string;
111
+ sdkVersion: string;
112
+ /**
113
+ * If true (default), the SDK initializes its own TracerProvider and
114
+ * exports spans to pipeline-api. Set false to disable all telemetry
115
+ * — handlers that read `trace.getTracer(...)` will still work via the
116
+ * global no-op provider, but nothing flushes anywhere.
117
+ */
118
+ enabled?: boolean;
119
+ }
120
+
121
+ export interface TracingHandle {
122
+ tracer: Tracer;
123
+ shutdown: () => Promise<void>;
124
+ /**
125
+ * Apply a job envelope's trace_context + wf_run_id to the active context.
126
+ * The returned Context, passed to `tracer.startActiveSpan(..., ctx, fn)`,
127
+ * roots a new span under the dispatching parseUpload activity.
128
+ */
129
+ contextFromEnvelope: (traceparent: string, runId: string) => Context;
130
+ }
131
+
132
+ let activeHandle: TracingHandle | null = null;
133
+
134
+ export function initTracing(cfg: TracingInitConfig): TracingHandle {
135
+ if (activeHandle) {
136
+ return activeHandle;
137
+ }
138
+
139
+ const enabled = cfg.enabled !== false;
140
+ if (!enabled) {
141
+ const tracer = trace.getTracer(TRACER_NAME, cfg.sdkVersion);
142
+ activeHandle = {
143
+ tracer,
144
+ shutdown: async () => {},
145
+ contextFromEnvelope: () => context.active(),
146
+ };
147
+ return activeHandle;
148
+ }
149
+
150
+ const resource = new Resource({
151
+ [ATTR_SERVICE_NAME]: TRACER_NAME,
152
+ [ATTR_SERVICE_VERSION]: cfg.sdkVersion,
153
+ [SPAN_ATTR.WORKER_ID]: cfg.workerId,
154
+ });
155
+
156
+ // OTLP/HTTP-JSON to pipeline-api's receiver. JSON over protobuf because
157
+ // it lets the receiver decode without pulling in @opentelemetry/
158
+ // otlp-transformer's protobuf runtime — the wire size penalty is fine
159
+ // for the volume the SDK emits (≈10 spans per job, ≈40 KB per job).
160
+ const exporter = new OTLPTraceExporter({
161
+ url: `${cfg.pipelineEndpoint.replace(/\/+$/, "")}/v1/traces`,
162
+ headers: {
163
+ authorization: `Bearer ${cfg.poolToken}`,
164
+ },
165
+ // Generous timeout — pipeline-api's receiver writes through to MySQL
166
+ // synchronously and we'd rather hold the export than drop a span on
167
+ // a slow ack.
168
+ timeoutMillis: 10_000,
169
+ });
170
+
171
+ const processor = new BatchSpanProcessor(exporter, {
172
+ // Default batch settings keep us flushing every 5s or 512 spans.
173
+ // We prefer faster flushes for the per-job RPC use case so the
174
+ // admin UI sees the latest activity within a couple seconds.
175
+ scheduledDelayMillis: 1_000,
176
+ maxQueueSize: 1024,
177
+ maxExportBatchSize: 128,
178
+ });
179
+
180
+ const provider = new BasicTracerProvider({
181
+ resource,
182
+ spanProcessors: [processor],
183
+ });
184
+
185
+ const tracer = provider.getTracer(TRACER_NAME, cfg.sdkVersion);
186
+
187
+ const handle: TracingHandle = {
188
+ tracer,
189
+ shutdown: async () => {
190
+ try {
191
+ await provider.shutdown();
192
+ } catch {
193
+ // Worker exit is best-effort — never block on export.
194
+ }
195
+ },
196
+ contextFromEnvelope: (traceparent: string, runId: string) => {
197
+ let ctx = context.active();
198
+ // runId is deliberately not propagated through OTel baggage.
199
+ // Instead it is stamped on each span at its creation site,
200
+ // because the trace store reads it from span attributes
201
+ // rather than from baggage.
202
+ if (!traceparent) {
203
+ return ctx;
204
+ }
205
+ const parsed = parseTraceparent(traceparent);
206
+ if (!parsed) {
207
+ return ctx;
208
+ }
209
+ ctx = trace.setSpanContext(ctx, parsed);
210
+ // runId is intentionally unused here. The caller carries it in
211
+ // the job context (withJobContext) and stampJobAttributes()
212
+ // copies it onto each span explicitly, since OTel has no built-in
213
+ // way for child spans to inherit attributes from a parent.
214
+ void runId;
215
+ return ctx;
216
+ },
217
+ };
218
+
219
+ activeHandle = handle;
220
+ return handle;
221
+ }
222
+
223
+ export function getTracingHandle(): TracingHandle | null {
224
+ return activeHandle;
225
+ }
226
+
227
+ export async function shutdownTracing(): Promise<void> {
228
+ if (activeHandle) {
229
+ await activeHandle.shutdown();
230
+ activeHandle = null;
231
+ }
232
+ }
233
+
234
+ // --- traceparent helpers --------------------------------------------------
235
+
236
+ /**
237
+ * Parse a W3C `traceparent` header into an OTel SpanContext.
238
+ *
239
+ * Format: `00-<32 hex trace id>-<16 hex span id>-<2 hex flags>`. Returns
240
+ * null on malformed input. Used by the SDK to root iq.job spans under
241
+ * whatever pipeline activity dispatched the job.
242
+ */
243
+ export function parseTraceparent(value: string): SpanContext | null {
244
+ if (!value) return null;
245
+ const parts = value.trim().split("-");
246
+ if (parts.length !== 4) return null;
247
+ const [version, traceId, spanId, flagsHex] = parts;
248
+ if (version !== "00") return null;
249
+ if (!/^[0-9a-f]{32}$/.test(traceId)) return null;
250
+ if (!/^[0-9a-f]{16}$/.test(spanId)) return null;
251
+ if (!/^[0-9a-f]{2}$/.test(flagsHex)) return null;
252
+ const flags = Number.parseInt(flagsHex, 16);
253
+ return {
254
+ traceId,
255
+ spanId,
256
+ traceFlags: flags as TraceFlags,
257
+ isRemote: true,
258
+ };
259
+ }
260
+
261
+ /**
262
+ * Serialise a SpanContext as a W3C traceparent header value.
263
+ */
264
+ export function spanContextToTraceparent(sc: SpanContext): string {
265
+ const flags = sc.traceFlags.toString(16).padStart(2, "0");
266
+ return `00-${sc.traceId}-${sc.spanId}-${flags}`;
267
+ }
268
+
269
+ // --- body sanitization (req/resp on RPC span events) ----------------------
270
+
271
+ const MAX_BODY_EVENT_BYTES = 4 * 1024;
272
+ const REDACTED_KEY_PATTERN =
273
+ /(token|secret|password|authorization|api[_-]?key|signature|^sig$|cookie)/i;
274
+ const REDACTED_VALUE = "<redacted>";
275
+
276
+ /**
277
+ * Sanitize an arbitrary RPC request/response object for inclusion as a
278
+ * span event attribute. Same shape as the pipeline-api's tracing/sanitize
279
+ * (4 KB cap, key-pattern redaction) so the two ends produce consistent
280
+ * payloads in the admin UI.
281
+ */
282
+ export function sanitizeRpcBody(value: unknown): string {
283
+ try {
284
+ const cloned = redactDeep(value, 0);
285
+ const serialised = JSON.stringify(cloned);
286
+ if (serialised.length <= MAX_BODY_EVENT_BYTES) {
287
+ return serialised;
288
+ }
289
+ return `${serialised.slice(0, MAX_BODY_EVENT_BYTES)}...<truncated>`;
290
+ } catch {
291
+ return "<unserialisable>";
292
+ }
293
+ }
294
+
295
+ function redactDeep(value: unknown, depth: number): unknown {
296
+ if (depth > 6) return "<max-depth>";
297
+ if (value === null || value === undefined) return value;
298
+ if (value instanceof Uint8Array) {
299
+ return `<bytes:${value.byteLength}>`;
300
+ }
301
+ if (Array.isArray(value)) {
302
+ return value.slice(0, 50).map((v) => redactDeep(v, depth + 1));
303
+ }
304
+ if (typeof value === "object") {
305
+ const out: Record<string, unknown> = {};
306
+ const obj = value as Record<string, unknown>;
307
+ const keys = Object.keys(obj).slice(0, 50);
308
+ for (const k of keys) {
309
+ if (REDACTED_KEY_PATTERN.test(k)) {
310
+ out[k] = REDACTED_VALUE;
311
+ } else {
312
+ out[k] = redactDeep(obj[k], depth + 1);
313
+ }
314
+ }
315
+ return out;
316
+ }
317
+ if (typeof value === "string") {
318
+ // Heuristic: very long strings are likely embedded payloads; clamp.
319
+ return value.length > 1024 ? `${value.slice(0, 1024)}...` : value;
320
+ }
321
+ return value;
322
+ }
323
+
324
+ // --- span helpers --------------------------------------------------------
325
+
326
+ /**
327
+ * Set common job-scoped attributes on a span. Called by every site that
328
+ * creates a per-job span so RUN_ID / JOB_ID / UPLOAD_ID / URN are
329
+ * consistently present (the trace-store receiver requires
330
+ * `pipeline.run_id` to route the span into the right workflow run).
331
+ *
332
+ * Pulls the attribute values from the active OTel context if not
333
+ * passed explicitly, so RPC / ctx.* spans created mid-handler inherit
334
+ * them without needing to know about the envelope. The Pool root sets
335
+ * the context via `withJobContext` at the start of handleDispatch.
336
+ */
337
+ export function stampJobAttributes(
338
+ span: Span,
339
+ attrs?: Partial<JobAttrs>,
340
+ ): void {
341
+ const fromCtx = getJobAttrs();
342
+ const runId = attrs?.runId ?? fromCtx?.runId ?? "";
343
+ const jobId = attrs?.jobId ?? fromCtx?.jobId ?? "";
344
+ const uploadId = attrs?.uploadId ?? fromCtx?.uploadId;
345
+ const urn = attrs?.urn ?? fromCtx?.urn;
346
+ const workerId = attrs?.workerId ?? fromCtx?.workerId;
347
+ if (runId) span.setAttribute(SPAN_ATTR.RUN_ID, runId);
348
+ if (jobId) span.setAttribute(SPAN_ATTR.JOB_ID, jobId);
349
+ if (uploadId) span.setAttribute(SPAN_ATTR.UPLOAD_ID, uploadId);
350
+ if (urn) span.setAttribute(SPAN_ATTR.URN, urn);
351
+ if (workerId) span.setAttribute(SPAN_ATTR.WORKER_ID, workerId);
352
+ }
353
+
354
+ export function recordSpanError(span: Span, err: unknown): void {
355
+ if (err instanceof Error) {
356
+ span.recordException(err);
357
+ span.setStatus({ code: SpanStatusCode.ERROR, message: err.message });
358
+ } else {
359
+ span.setStatus({ code: SpanStatusCode.ERROR, message: String(err) });
360
+ }
361
+ }
362
+
363
+ export { SpanKind, SpanStatusCode };
package/src/types.ts CHANGED
@@ -60,6 +60,14 @@ export interface PoolConfig {
60
60
  logger?: import("./logger.js").Logger;
61
61
  /** Max concurrent in-flight jobs. Defaults to 1. */
62
62
  concurrency?: number;
63
+ /**
64
+ * Tracing config. The SDK ships spans to pipeline-api's /v1/traces
65
+ * receiver for the admin UI. Disable with `{ enabled: false }` to
66
+ * skip the OTel SDK setup entirely (e.g. in tests).
67
+ */
68
+ tracing?: {
69
+ enabled?: boolean;
70
+ };
63
71
  }
64
72
 
65
73
  export type JobHandler = (ctx: JobContext) => Promise<void>;