@xeonr/upload-pool-sdk 1.3.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/tracing.ts ADDED
@@ -0,0 +1,333 @@
1
+ /**
2
+ * OpenTelemetry tracing setup for the worker SDK.
3
+ *
4
+ * Spans are exported via OTLP/HTTP-JSON to pipeline-api's `/v1/traces`
5
+ * receiver, authenticated with the pool's queue token. The pipeline-api
6
+ * receiver decodes the OTLP envelope and writes spans to its MySQL trace
7
+ * store, which backs the `/admin/workflow-run` admin UI — so worker
8
+ * callbacks nest naturally under the dispatching parseUpload trace.
9
+ *
10
+ * Design notes (see also: discussion in epic plan):
11
+ * - The TracerProvider is private to the SDK. It does *not* register
12
+ * globally — calling `trace.getTracer(...)` outside the SDK won't pick
13
+ * it up. This avoids interfering with any OTel setup the host
14
+ * application already has (e.g. its own collector → Grafana
15
+ * pipeline). The exporter ships SDK-emitted spans only.
16
+ * - `pipeline.run_id` (SPAN_ATTR.RUN_ID) is stamped on every emitted span as an attribute,
17
+ * mirrored from the job envelope's `wf_run_id` field. The pipeline-api
18
+ * receiver uses this attribute to associate the span with the right
19
+ * workflow run in `workflow_run_spans`.
20
+ * - The W3C traceparent propagator runs on outbound RPC calls *only when
21
+ * targeting the pipeline endpoint*. The uploads-api side has its own
22
+ * OTel pipeline and we don't want to inject our trace_id into its
23
+ * trees. See the interceptor in rpc-clients.ts.
24
+ *
25
+ * Body sanitization for `rpc.request` / `rpc.response` span events lives
26
+ * in the interceptor — keys matching /(token|secret|password|authorization
27
+ * |api[_-]?key|signature|^sig$|cookie)/i are redacted; payloads truncated
28
+ * to MAX_BODY_EVENT_BYTES.
29
+ */
30
+ import {
31
+ context,
32
+ trace,
33
+ type Context,
34
+ type Span,
35
+ type SpanContext,
36
+ type Tracer,
37
+ TraceFlags,
38
+ SpanKind,
39
+ SpanStatusCode,
40
+ } from "@opentelemetry/api";
41
+ import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http";
42
+ import { Resource } from "@opentelemetry/resources";
43
+ import {
44
+ BatchSpanProcessor,
45
+ BasicTracerProvider,
46
+ } from "@opentelemetry/sdk-trace-base";
47
+ import {
48
+ ATTR_SERVICE_NAME,
49
+ ATTR_SERVICE_VERSION,
50
+ } from "@opentelemetry/semantic-conventions";
51
+
52
/**
 * Name handed to `getTracer(...)` for every tracer this module creates; the
 * same string is used as the ATTR_SERVICE_NAME resource attribute.
 */
const TRACER_NAME = "@xeonr/upload-pool-sdk";

/**
 * Attribute keys written onto SDK-emitted spans.
 *
 * Kept as a single `as const` table so call sites reference keys by name and
 * the literal string types are preserved.
 */
export const SPAN_ATTR = {
  // --- job identity ---
  JOB_ID: "iq.job_id",
  UPLOAD_ID: "iq.upload_id",
  URN: "iq.content_type_urn",
  WORKER_ID: "iq.worker_id",
  POOL_TOKEN_PREFIX: "iq.pool_token_prefix",
  // Same key as pipeline-api's TRACE_ATTRIBUTE.RUN_ID. The receiving
  // trace store uses it to link a span to its workflow run
  // (workflow_run_spans.run_id); a worker span without it is discarded
  // by the pipeline-api receiver, so every per-job span must set it.
  RUN_ID: "pipeline.run_id",
  // --- outbound RPC ---
  RPC_METHOD: "rpc.method",
  RPC_SERVICE: "rpc.service",
  RPC_TARGET: "rpc.target",
  RPC_CODE: "rpc.code",
  RPC_DURATION_MS: "rpc.duration_ms",
  // --- S3 transfers ---
  S3_BYTES: "s3.bytes",
  S3_URL_ORIGIN: "s3.url_origin",
  S3_OUTCOME: "s3.outcome",
} as const;
75
+
76
/**
 * Options accepted by `initTracing`.
 */
export interface TracingInitConfig {
  /**
   * Base URL of pipeline-api. Trailing slashes are stripped and
   * `/v1/traces` is appended to form the exporter URL.
   */
  pipelineEndpoint: string;
  /** Pool token; sent as `authorization: Bearer <token>` on each export. */
  poolToken: string;
  /** Worker identifier, attached to the tracer resource as `iq.worker_id`. */
  workerId: string;
  /** SDK version; used as the tracer version and ATTR_SERVICE_VERSION. */
  sdkVersion: string;
  /**
   * Telemetry switch, on unless explicitly `false`. When on, the SDK
   * builds its own TracerProvider and exports spans to pipeline-api.
   * When off, no provider or exporter is created: the handle's tracer
   * comes from the global API (`trace.getTracer(...)`), which is a no-op
   * unless the host registered a provider, so nothing is flushed by the
   * SDK.
   */
  enabled?: boolean;
}
89
+
90
/**
 * What `initTracing` hands back: the tracer to create spans with, a
 * best-effort shutdown hook, and a helper for parenting spans under the
 * trace that dispatched a job.
 */
export interface TracingHandle {
  /** Tracer used for all SDK-emitted spans. */
  tracer: Tracer;
  /** Flushes and stops the underlying provider (no-op when tracing is disabled). */
  shutdown: () => Promise<void>;
  /**
   * Build a Context from a job envelope's W3C `traceparent`.
   *
   * Pass the returned Context to `tracer.startActiveSpan(..., ctx, fn)` so
   * the new span is rooted under the dispatching parseUpload activity. An
   * empty or malformed `traceparent` yields the currently active context
   * unchanged. `runId` is accepted for call-site symmetry; it is not stored
   * in the Context and must be stamped on spans explicitly.
   */
  contextFromEnvelope: (traceparent: string, runId: string) => Context;
}
100
+
101
/** Module-wide singleton; populated by the first `initTracing` call. */
let activeHandle: TracingHandle | null = null;

/**
 * Create (or return the already-created) tracing handle for this process.
 *
 * The first call wins: later calls return the existing handle and ignore
 * their `cfg`. With `cfg.enabled === false` no provider or exporter is
 * built and the handle wraps the global API tracer. Otherwise a private
 * `BasicTracerProvider` is built with a batching OTLP/HTTP exporter that
 * targets `<pipelineEndpoint>/v1/traces`; the provider is not registered
 * globally by this function.
 *
 * @param cfg endpoint, credentials and identity used to build the exporter
 * @returns the process-wide tracing handle
 */
export function initTracing(cfg: TracingInitConfig): TracingHandle {
  if (activeHandle !== null) {
    return activeHandle;
  }

  if (cfg.enabled === false) {
    // Telemetry off: hand out the global API tracer and inert helpers.
    activeHandle = {
      tracer: trace.getTracer(TRACER_NAME, cfg.sdkVersion),
      shutdown: async () => {},
      contextFromEnvelope: () => context.active(),
    };
    return activeHandle;
  }

  const resource = new Resource({
    [ATTR_SERVICE_NAME]: TRACER_NAME,
    [ATTR_SERVICE_VERSION]: cfg.sdkVersion,
    [SPAN_ATTR.WORKER_ID]: cfg.workerId,
  });

  // OTLP over HTTP with a JSON body, so the receiver can decode without a
  // protobuf runtime. The extra wire size is acceptable for the SDK's
  // volume (roughly 10 spans / 40 KB per job).
  const tracesUrl = `${cfg.pipelineEndpoint.replace(/\/+$/, "")}/v1/traces`;
  const exporter = new OTLPTraceExporter({
    url: tracesUrl,
    headers: {
      authorization: `Bearer ${cfg.poolToken}`,
    },
    // Long timeout on purpose: the receiver writes to MySQL synchronously,
    // and waiting on a slow ack is preferable to losing the span.
    timeoutMillis: 10_000,
  });

  // The stock batch settings flush every 5s or 512 spans; flush sooner so
  // the admin UI shows per-job activity within a couple of seconds.
  const processor = new BatchSpanProcessor(exporter, {
    scheduledDelayMillis: 1_000,
    maxQueueSize: 1024,
    maxExportBatchSize: 128,
  });

  const provider = new BasicTracerProvider({
    resource,
    spanProcessors: [processor],
  });

  const handle: TracingHandle = {
    tracer: provider.getTracer(TRACER_NAME, cfg.sdkVersion),
    shutdown: async () => {
      try {
        await provider.shutdown();
      } catch {
        // Best-effort on worker exit: a failed flush must not surface.
      }
    },
    contextFromEnvelope: (traceparent: string, runId: string) => {
      // runId is deliberately unused here. The trace store reads it from
      // span attributes, so callers stamp it on each span via
      // stampJobAttributes() rather than carrying it in the Context.
      void runId;
      const current = context.active();
      const remoteParent = traceparent ? parseTraceparent(traceparent) : null;
      return remoteParent === null
        ? current
        : trace.setSpanContext(current, remoteParent);
    },
  };

  activeHandle = handle;
  return handle;
}
191
+
192
/**
 * Return the handle created by `initTracing`, or `null` when tracing has
 * not been initialised (or has since been shut down).
 */
export function getTracingHandle(): TracingHandle | null {
  return activeHandle;
}
195
+
196
/**
 * Shut down the active tracing handle, if any, and clear the singleton.
 *
 * The handle is detached *before* its shutdown is awaited. That way a
 * concurrent `initTracing` call cannot be handed a provider that is in the
 * middle of being torn down, a second `shutdownTracing` call does not shut
 * the same provider down twice, and the singleton is cleared even if the
 * handle's `shutdown` rejects.
 */
export async function shutdownTracing(): Promise<void> {
  const handle = activeHandle;
  if (handle === null) {
    return;
  }
  activeHandle = null;
  await handle.shutdown();
}
201
+ }
202
+
203
+ // --- traceparent helpers --------------------------------------------------
204
+
205
/**
 * Parse a W3C `traceparent` header into an OTel SpanContext.
 *
 * Accepted format: `00-<32 hex trace id>-<16 hex span id>-<2 hex flags>`,
 * lowercase hex only, surrounding whitespace ignored. Used by the SDK to
 * root iq.job spans under whatever pipeline activity dispatched the job.
 *
 * @param value raw header value
 * @returns the remote SpanContext, or `null` when the value is empty, is
 *   not version `00`, is malformed, or carries an all-zero trace id or
 *   span id (both are defined as invalid by the W3C Trace Context spec)
 */
export function parseTraceparent(value: string): SpanContext | null {
  if (!value) return null;
  const parts = value.trim().split("-");
  if (parts.length !== 4) return null;
  const [version, traceId, spanId, flagsHex] = parts;
  if (version !== "00") return null;
  if (!/^[0-9a-f]{32}$/.test(traceId)) return null;
  if (!/^[0-9a-f]{16}$/.test(spanId)) return null;
  if (!/^[0-9a-f]{2}$/.test(flagsHex)) return null;
  // All-zero ids are the spec's "invalid" sentinel; treating them as a
  // real parent would attach spans to a trace that cannot exist.
  if (/^0+$/.test(traceId) || /^0+$/.test(spanId)) return null;
  const flags = Number.parseInt(flagsHex, 16);
  return {
    traceId,
    spanId,
    traceFlags: flags as TraceFlags,
    isRemote: true,
  };
}
229
+
230
/**
 * Serialise a SpanContext as a W3C traceparent header value
 * (`00-<trace id>-<span id>-<flags>`).
 *
 * The flags field is exactly one byte on the wire, so `traceFlags` is
 * masked to its low 8 bits before formatting. Without the mask a value
 * above 0xff would produce a three-digit flags field, which
 * `parseTraceparent` rejects.
 *
 * @param sc span context to serialise; `traceId` / `spanId` are emitted as-is
 * @returns the header value
 */
export function spanContextToTraceparent(sc: SpanContext): string {
  const flags = (sc.traceFlags & 0xff).toString(16).padStart(2, "0");
  return `00-${sc.traceId}-${sc.spanId}-${flags}`;
}
237
+
238
+ // --- body sanitization (req/resp on RPC span events) ----------------------
239
+
240
/**
 * Cap on the serialised body length. NOTE: the comparison below uses
 * `String.prototype.length`, i.e. UTF-16 code units rather than encoded
 * bytes, so the limit is approximate for non-ASCII payloads.
 */
const MAX_BODY_EVENT_BYTES = 4 * 1024;
/** Object keys matching this pattern have their value replaced wholesale. */
const REDACTED_KEY_PATTERN =
  /(token|secret|password|authorization|api[_-]?key|signature|^sig$|cookie)/i;
const REDACTED_VALUE = "<redacted>";

/**
 * Sanitize an arbitrary RPC request/response object for inclusion as a
 * span event attribute. Same shape as the pipeline-api's tracing/sanitize
 * (4 KB cap, key-pattern redaction) so the two ends produce consistent
 * payloads in the admin UI.
 *
 * Never throws: anything that cannot be rendered as JSON (including a
 * top-level `undefined`, function or symbol, for which `JSON.stringify`
 * yields `undefined`) results in the literal `"<unserialisable>"`.
 *
 * @param value request or response payload
 * @returns redacted JSON text, suffixed with `...<truncated>` when cut
 */
export function sanitizeRpcBody(value: unknown): string {
  try {
    const serialised: string | undefined = JSON.stringify(redactDeep(value, 0));
    if (serialised === undefined) {
      return "<unserialisable>";
    }
    if (serialised.length <= MAX_BODY_EVENT_BYTES) {
      return serialised;
    }
    return `${serialised.slice(0, MAX_BODY_EVENT_BYTES)}...<truncated>`;
  } catch {
    return "<unserialisable>";
  }
}

/**
 * Produce a JSON-safe, redacted copy of `value`.
 *
 * - nesting deeper than 6 levels is replaced by `"<max-depth>"`, which
 *   also bounds cyclic structures;
 * - `Uint8Array`s become a `<bytes:N>` placeholder;
 * - arrays and objects are limited to their first 50 entries;
 * - object keys matching REDACTED_KEY_PATTERN are replaced by
 *   REDACTED_VALUE without descending into the value;
 * - strings longer than 1024 characters are clamped;
 * - `bigint`s are rendered as decimal strings. `JSON.stringify` throws on
 *   bigint, so without this a single bigint field anywhere in the payload
 *   would discard the entire body.
 */
function redactDeep(value: unknown, depth: number): unknown {
  if (depth > 6) return "<max-depth>";
  if (value === null || value === undefined) return value;
  if (typeof value === "bigint") return value.toString();
  if (value instanceof Uint8Array) {
    return `<bytes:${value.byteLength}>`;
  }
  if (Array.isArray(value)) {
    return value.slice(0, 50).map((v) => redactDeep(v, depth + 1));
  }
  if (typeof value === "object") {
    const out: Record<string, unknown> = {};
    const obj = value as Record<string, unknown>;
    for (const k of Object.keys(obj).slice(0, 50)) {
      out[k] = REDACTED_KEY_PATTERN.test(k)
        ? REDACTED_VALUE
        : redactDeep(obj[k], depth + 1);
    }
    return out;
  }
  if (typeof value === "string") {
    // Heuristic: very long strings are likely embedded payloads; clamp.
    return value.length > 1024 ? `${value.slice(0, 1024)}...` : value;
  }
  return value;
}
292
+
293
+ // --- span helpers --------------------------------------------------------
294
+
295
/**
 * Write the common job-scoped attributes onto `span`.
 *
 * Every call site that opens a per-job span goes through here so that
 * RUN_ID / JOB_ID (always) and UPLOAD_ID / URN / WORKER_ID (when provided
 * and non-empty) are set consistently. RUN_ID is written under
 * `SPAN_ATTR.RUN_ID` (`pipeline.run_id`), the key the receiving side uses
 * to route the span into the right workflow run.
 *
 * @param span span to annotate
 * @param attrs job identity; optional fields are skipped when falsy
 */
export function stampJobAttributes(
  span: Span,
  attrs: {
    runId: string;
    jobId: string;
    uploadId?: string;
    urn?: string;
    workerId?: string;
  },
): void {
  span.setAttribute(SPAN_ATTR.RUN_ID, attrs.runId);
  span.setAttribute(SPAN_ATTR.JOB_ID, attrs.jobId);

  // Optional attributes, applied in a fixed order and only when truthy.
  const optional = [
    [SPAN_ATTR.UPLOAD_ID, attrs.uploadId],
    [SPAN_ATTR.URN, attrs.urn],
    [SPAN_ATTR.WORKER_ID, attrs.workerId],
  ] as const;
  for (const [key, val] of optional) {
    if (val) {
      span.setAttribute(key, val);
    }
  }
}
323
+
324
/**
 * Mark `span` as failed and attach the failure as an exception event.
 *
 * `Error` instances are recorded as-is. Any other thrown value is
 * stringified and recorded too, so a `throw "boom"` still leaves an
 * exception event on the span instead of only an error status.
 *
 * @param span span to mark as errored
 * @param err the caught value (may be anything)
 */
export function recordSpanError(span: Span, err: unknown): void {
  const message = err instanceof Error ? err.message : String(err);
  span.recordException(err instanceof Error ? err : message);
  span.setStatus({ code: SpanStatusCode.ERROR, message });
}
332
+
333
+ export { SpanKind, SpanStatusCode };
package/src/types.ts CHANGED
@@ -60,6 +60,14 @@ export interface PoolConfig {
60
60
  logger?: import("./logger.js").Logger;
61
61
  /** Max concurrent in-flight jobs. Defaults to 1. */
62
62
  concurrency?: number;
63
+ /**
64
+ * Tracing config. The SDK ships spans to pipeline-api's /v1/traces
65
+ * receiver for the admin UI. Disable with `{ enabled: false }` to
66
+ * skip the OTel SDK setup entirely (e.g. in tests).
67
+ */
68
+ tracing?: {
69
+ enabled?: boolean;
70
+ };
63
71
  }
64
72
 
65
73
  export type JobHandler = (ctx: JobContext) => Promise<void>;