@xeonr/upload-pool-sdk 1.3.0 → 1.4.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries, as they appear in their respective public registries. It is provided for informational purposes only.
package/src/pool.ts CHANGED
@@ -12,6 +12,7 @@
12
12
  import { hostname } from "node:os";
13
13
  import { randomBytes } from "node:crypto";
14
14
  import { create } from "@bufbuild/protobuf";
15
+ import { SpanStatusCode } from "@opentelemetry/api";
15
16
  import {
16
17
  AcceptJobRequestSchema,
17
18
  CompleteJobRequestSchema,
@@ -22,6 +23,13 @@ import { SseClient } from "./sse-client.js";
22
23
  import { createJobContext, type JobEnvelope } from "./job-context.js";
23
24
  import { NonRetryableError } from "./errors.js";
24
25
  import { JsonLogger, type Logger } from "./logger.js";
26
+ import {
27
+ initTracing,
28
+ shutdownTracing,
29
+ recordSpanError,
30
+ stampJobAttributes,
31
+ type TracingHandle,
32
+ } from "./tracing.js";
25
33
  import type {
26
34
  JobContext,
27
35
  JobHandler,
@@ -33,6 +41,7 @@ export class Pool {
33
41
  private readonly rpc: RpcClients;
34
42
  private readonly sse: SseClient;
35
43
  private readonly logger: Logger;
44
+ private readonly tracing: TracingHandle;
36
45
  private inFlight = 0;
37
46
  private readonly workerId: string;
38
47
  private readonly capabilities: string[];
@@ -50,6 +59,19 @@ export class Pool {
50
59
  this.logger = (config.logger ?? new JsonLogger()).child({
51
60
  workerId: this.workerId,
52
61
  });
62
+
63
+ // Initialise tracing *before* RPC clients so the interceptor in
64
+ // rpc-clients picks up an active TracerProvider on its first call.
65
+ // Spans go to pipeline-api's /v1/traces receiver; see tracing.ts
66
+ // for the isolation rationale wrt the host app's OTel pipeline.
67
+ this.tracing = initTracing({
68
+ pipelineEndpoint: config.pipelineEndpoint,
69
+ poolToken: config.token,
70
+ workerId: this.workerId,
71
+ sdkVersion: SDK_VERSION,
72
+ enabled: config.tracing?.enabled,
73
+ });
74
+
53
75
  this.rpc = createRpcClients({
54
76
  apiEndpoint: config.apiEndpoint,
55
77
  pipelineEndpoint: config.pipelineEndpoint,
@@ -129,6 +151,9 @@ export class Pool {
129
151
  clearInterval(this.keepAliveInterval);
130
152
  this.keepAliveInterval = null;
131
153
  }
154
+ // Flush any buffered spans before exit so the final job's traces
155
+ // land in the admin UI even on a clean SIGTERM.
156
+ await shutdownTracing();
132
157
  if (this.runningResolve) {
133
158
  this.runningResolve();
134
159
  this.runningResolve = null;
@@ -154,61 +179,131 @@ export class Pool {
154
179
  return;
155
180
  }
156
181
  this.inFlight++;
157
- jobLogger.info("job.dispatched", {
158
- filename: envelope.filename,
159
- mimeType: envelope.mimeType,
160
- inFlight: this.inFlight,
161
- });
162
182
 
163
- const handler = this.resolveHandler(envelope.contentTypeContext.urn);
164
- const ctx = createJobContext(envelope, this.rpc, jobLogger);
165
- const startedAt = Date.now();
183
+ // Root the entire job under iq.job, parented to the dispatching
184
+ // activity's span via the traceparent the pipeline-worker stamped
185
+ // onto the envelope. All ctx.* + rpc.* spans created inside the
186
+ // handler will nest under this via OTel's AsyncLocalStorage-backed
187
+ // context manager.
188
+ const parentCtx = this.tracing.contextFromEnvelope(
189
+ envelope.traceContext ?? "",
190
+ envelope.wfRunId ?? "",
191
+ );
166
192
 
167
- try {
168
- // AcceptJob — clears the pipeline's accept-timeout.
169
- await this.rpc.integrationQueue.acceptJob(
170
- create(AcceptJobRequestSchema, {
193
+ await this.tracing.tracer.startActiveSpan(
194
+ "iq.job",
195
+ {
196
+ attributes: {
197
+ "iq.filename": envelope.filename,
198
+ "iq.mime_type": envelope.mimeType,
199
+ },
200
+ },
201
+ parentCtx,
202
+ async (rootSpan) => {
203
+ stampJobAttributes(rootSpan, {
204
+ runId: envelope.wfRunId ?? "",
171
205
  jobId: envelope.jobId,
206
+ uploadId: envelope.uploadId,
207
+ urn: envelope.contentTypeContext.urn,
172
208
  workerId: this.workerId,
173
- queueToken: this.config.token,
174
- }),
175
- );
176
- jobLogger.info("job.accepted");
209
+ });
177
210
 
178
- if (!handler) {
179
- jobLogger.warn("job.unhandled", {
180
- availableHandlers: this.capabilities,
211
+ jobLogger.info("job.dispatched", {
212
+ filename: envelope.filename,
213
+ mimeType: envelope.mimeType,
214
+ inFlight: this.inFlight,
181
215
  });
182
- await this.reportError(
183
- envelope.jobId,
184
- ctx,
185
- jobLogger,
186
- new NonRetryableError(`no handler for URN ${envelope.contentTypeContext.urn}`),
187
- );
188
- return;
189
- }
190
-
191
- await handler(ctx);
192
-
193
- await this.rpc.integrationQueue.completeJob(
194
- create(CompleteJobRequestSchema, {
195
- jobId: envelope.jobId,
216
+
217
+ const handler = this.resolveHandler(envelope.contentTypeContext.urn);
218
+ const ctx = createJobContext(envelope, this.rpc, jobLogger, {
196
219
  workerId: this.workerId,
197
- queueToken: this.config.token,
198
- }),
199
- );
200
- jobLogger.info("job.completed", {
201
- durationMs: Date.now() - startedAt,
202
- });
203
- } catch (err) {
204
- jobLogger.error("job.failed", {
205
- err,
206
- durationMs: Date.now() - startedAt,
207
- });
208
- await this.reportError(envelope.jobId, ctx, jobLogger, err as Error);
209
- } finally {
210
- this.inFlight--;
211
- }
220
+ });
221
+ const startedAt = Date.now();
222
+
223
+ try {
224
+ // AcceptJob — clears the pipeline's accept-timeout.
225
+ await this.rpc.integrationQueue.acceptJob(
226
+ create(AcceptJobRequestSchema, {
227
+ jobId: envelope.jobId,
228
+ workerId: this.workerId,
229
+ queueToken: this.config.token,
230
+ }),
231
+ );
232
+ jobLogger.info("job.accepted");
233
+
234
+ if (!handler) {
235
+ jobLogger.warn("job.unhandled", {
236
+ availableHandlers: this.capabilities,
237
+ });
238
+ await this.reportError(
239
+ envelope.jobId,
240
+ ctx,
241
+ jobLogger,
242
+ new NonRetryableError(
243
+ `no handler for URN ${envelope.contentTypeContext.urn}`,
244
+ ),
245
+ );
246
+ rootSpan.setStatus({
247
+ code: SpanStatusCode.ERROR,
248
+ message: "no handler",
249
+ });
250
+ return;
251
+ }
252
+
253
+ // iq.handler wraps the customer's handler function so its
254
+ // own latency is visible as a single row (excluding the
255
+ // accept/complete bookends). Children appear nested.
256
+ await this.tracing.tracer.startActiveSpan(
257
+ "iq.handler",
258
+ async (handlerSpan) => {
259
+ stampJobAttributes(handlerSpan, {
260
+ runId: envelope.wfRunId ?? "",
261
+ jobId: envelope.jobId,
262
+ uploadId: envelope.uploadId,
263
+ urn: envelope.contentTypeContext.urn,
264
+ workerId: this.workerId,
265
+ });
266
+ try {
267
+ await handler(ctx);
268
+ handlerSpan.setStatus({ code: SpanStatusCode.OK });
269
+ } catch (err) {
270
+ recordSpanError(handlerSpan, err);
271
+ throw err;
272
+ } finally {
273
+ handlerSpan.end();
274
+ }
275
+ },
276
+ );
277
+
278
+ await this.rpc.integrationQueue.completeJob(
279
+ create(CompleteJobRequestSchema, {
280
+ jobId: envelope.jobId,
281
+ workerId: this.workerId,
282
+ queueToken: this.config.token,
283
+ }),
284
+ );
285
+ jobLogger.info("job.completed", {
286
+ durationMs: Date.now() - startedAt,
287
+ });
288
+ rootSpan.setStatus({ code: SpanStatusCode.OK });
289
+ } catch (err) {
290
+ jobLogger.error("job.failed", {
291
+ err,
292
+ durationMs: Date.now() - startedAt,
293
+ });
294
+ recordSpanError(rootSpan, err);
295
+ await this.reportError(
296
+ envelope.jobId,
297
+ ctx,
298
+ jobLogger,
299
+ err as Error,
300
+ );
301
+ } finally {
302
+ rootSpan.end();
303
+ this.inFlight--;
304
+ }
305
+ },
306
+ );
212
307
  }
213
308
 
214
309
  private resolveHandler(urn: string): JobHandler | undefined {
@@ -13,10 +13,20 @@
13
13
  * pipeline-api) so we keep them on separate transports and don't try to
14
14
  * share a baseUrl.
15
15
  *
16
- * Each request is wrapped in a logging interceptor that emits one
17
- * `rpc.request` line on dispatch and one of `rpc.response` /
18
- * `rpc.error` on completion. Latency and connect-error codes are
19
- * captured so failed RPCs are diagnosable from worker logs alone.
16
+ * Each request is wrapped in a tracing-and-logging interceptor:
17
+ * - emits one `iq.callback.<Method>` span per call, child of the
18
+ * active job span (so the admin UI sees a row per RPC under the
19
+ * dispatching parseUpload run)
20
+ * - injects W3C `traceparent` on outbound headers *for the pipeline
21
+ * transport only* — calls to `apiEndpoint` ship without traceparent
22
+ * so upl-im-api's own OTel pipeline doesn't pick up our trace_id
23
+ * and conflate the two backends
24
+ * - captures sanitised request/response bodies as span events (4 KB
25
+ * cap, keys matching token/secret/auth/... redacted), so debugging
26
+ * a worker callback failure doesn't require correlating with
27
+ * server-side logs
28
+ * - logs `rpc.request` / `rpc.response` at debug and `rpc.error` at error
29
+ * for the legacy log-only consumer surface that predates tracing
20
30
  */
21
31
  import { createConnectTransport } from "@connectrpc/connect-node";
22
32
  import {
@@ -26,41 +36,167 @@ import {
26
36
  ConnectError,
27
37
  Code,
28
38
  } from "@connectrpc/connect";
39
+ import {
40
+ context as otelContext,
41
+ SpanStatusCode,
42
+ trace,
43
+ } from "@opentelemetry/api";
29
44
  import { InternalUploadsService } from "./protocol/uplim/api/v1/uploads_pb.js";
30
45
  import { IntegrationQueueService } from "./protocol/uplim/workflow/v1/integration_queue_pb.js";
31
46
  import type { Logger } from "./logger.js";
47
+ import {
48
+ SPAN_ATTR,
49
+ getTracingHandle,
50
+ recordSpanError,
51
+ sanitizeRpcBody,
52
+ spanContextToTraceparent,
53
+ } from "./tracing.js";
32
54
 
33
55
  export interface RpcClients {
34
56
  internalUploads: Client<typeof InternalUploadsService>;
35
57
  integrationQueue: Client<typeof IntegrationQueueService>;
36
58
  }
37
59
 
38
- function loggingInterceptor(logger: Logger, target: string): Interceptor {
60
+ type RpcTarget = "api" | "pipeline";
61
+
62
+ function tracingLoggingInterceptor(
63
+ logger: Logger,
64
+ target: RpcTarget,
65
+ ): Interceptor {
39
66
  return (next) => async (req) => {
40
- const startedAt = Date.now();
67
+ const handle = getTracingHandle();
41
68
  const method = `${req.service.typeName}/${req.method.name}`;
69
+
70
+ // Inject W3C traceparent on the pipeline target *only*. upl-im-api
71
+ // runs its own OTel pipeline against a different backend; passing
72
+ // our trace_id over would graft our worker activity into its trees,
73
+ // muddying its admin dashboards. The pool token in the request
74
+ // body / queue token in headers remain the trust boundary either
75
+ // way.
76
+ if (target === "pipeline") {
77
+ const activeSpan = trace.getActiveSpan();
78
+ if (activeSpan) {
79
+ req.header.set(
80
+ "traceparent",
81
+ spanContextToTraceparent(activeSpan.spanContext()),
82
+ );
83
+ }
84
+ }
85
+
86
+ const startedAt = Date.now();
42
87
  logger.debug("rpc.request", { method, target });
43
- try {
44
- const res = await next(req);
45
- logger.debug("rpc.response", {
46
- method,
47
- target,
48
- durationMs: Date.now() - startedAt,
49
- });
50
- return res;
51
- } catch (err) {
52
- const code =
53
- err instanceof ConnectError ? Code[err.code] : undefined;
54
- const message = err instanceof Error ? err.message : String(err);
55
- logger.error("rpc.error", {
56
- method,
57
- target,
58
- code,
59
- message,
60
- durationMs: Date.now() - startedAt,
61
- });
62
- throw err;
88
+
89
+ // When tracing is uninitialised (e.g. tests, dry-runs), fall through
90
+ // to a logging-only path so we don't crash on a null tracer.
91
+ if (!handle) {
92
+ try {
93
+ const res = await next(req);
94
+ logger.debug("rpc.response", {
95
+ method,
96
+ target,
97
+ durationMs: Date.now() - startedAt,
98
+ });
99
+ return res;
100
+ } catch (err) {
101
+ const code =
102
+ err instanceof ConnectError ? Code[err.code] : undefined;
103
+ const message = err instanceof Error ? err.message : String(err);
104
+ logger.error("rpc.error", {
105
+ method,
106
+ target,
107
+ code,
108
+ message,
109
+ durationMs: Date.now() - startedAt,
110
+ });
111
+ throw err;
112
+ }
63
113
  }
114
+
115
+ return handle.tracer.startActiveSpan(
116
+ `iq.callback.${req.method.name}`,
117
+ async (span) => {
118
+ span.setAttribute(SPAN_ATTR.RPC_METHOD, method);
119
+ span.setAttribute(SPAN_ATTR.RPC_SERVICE, req.service.typeName);
120
+ span.setAttribute(SPAN_ATTR.RPC_TARGET, target);
121
+
122
+ // Inherit job-scoped attributes from the active parent span
123
+ // (set by Pool.handleDispatch via stampJobAttributes). Without
124
+ // this, the span lacks `pipeline.run_id` and gets dropped by
125
+ // the pipeline-api receiver. Reading them off the active span
126
+ // is the cheapest cross-span propagation we can do without a
127
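+ // custom OTel context manager. NOTE(review): inside this startActiveSpan callback, trace.getActiveSpan() presumably returns the span just started rather than its parent — confirm the parent is captured before the span is created.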
128
+ const parent = trace.getActiveSpan();
129
+ const parentAttrs = (parent as unknown as {
130
+ attributes?: Record<string, unknown>;
131
+ })?.attributes;
132
+ if (parentAttrs) {
133
+ for (const key of [
134
+ SPAN_ATTR.RUN_ID,
135
+ SPAN_ATTR.JOB_ID,
136
+ SPAN_ATTR.UPLOAD_ID,
137
+ SPAN_ATTR.URN,
138
+ SPAN_ATTR.WORKER_ID,
139
+ ]) {
140
+ const v = parentAttrs[key];
141
+ if (typeof v === "string" && v.length > 0) {
142
+ span.setAttribute(key, v);
143
+ }
144
+ }
145
+ }
146
+
147
+ // Body capture: request first, response on success, error
148
+ // payload on failure. Sanitiser truncates to 4 KB and redacts
149
+ // token-shaped keys (see tracing.ts).
150
+ try {
151
+ span.addEvent("rpc.request", {
152
+ body: sanitizeRpcBody(req.message),
153
+ });
154
+ } catch {
155
+ /* never let body capture crash the RPC */
156
+ }
157
+
158
+ try {
159
+ const res = await next(req);
160
+ const durationMs = Date.now() - startedAt;
161
+ span.setAttribute(SPAN_ATTR.RPC_DURATION_MS, durationMs);
162
+ span.setAttribute(SPAN_ATTR.RPC_CODE, "ok");
163
+ try {
164
+ span.addEvent("rpc.response", {
165
+ body: sanitizeRpcBody(res.message),
166
+ });
167
+ } catch {
168
+ /* body capture is best-effort */
169
+ }
170
+ span.setStatus({ code: SpanStatusCode.OK });
171
+ logger.debug("rpc.response", {
172
+ method,
173
+ target,
174
+ durationMs,
175
+ });
176
+ return res;
177
+ } catch (err) {
178
+ const durationMs = Date.now() - startedAt;
179
+ const code =
180
+ err instanceof ConnectError ? Code[err.code] : "internal";
181
+ const message =
182
+ err instanceof Error ? err.message : String(err);
183
+ span.setAttribute(SPAN_ATTR.RPC_DURATION_MS, durationMs);
184
+ span.setAttribute(SPAN_ATTR.RPC_CODE, String(code));
185
+ span.addEvent("rpc.error", { code: String(code), message });
186
+ recordSpanError(span, err);
187
+ logger.error("rpc.error", {
188
+ method,
189
+ target,
190
+ code,
191
+ message,
192
+ durationMs,
193
+ });
194
+ throw err;
195
+ } finally {
196
+ span.end();
197
+ }
198
+ },
199
+ );
64
200
  };
65
201
  }
66
202
 
@@ -76,15 +212,19 @@ export function createRpcClients(config: RpcClientsConfig): RpcClients {
76
212
  const apiTransport = createConnectTransport({
77
213
  baseUrl: config.apiEndpoint,
78
214
  httpVersion: "1.1",
79
- interceptors: [loggingInterceptor(rpcLogger, "api")],
215
+ interceptors: [tracingLoggingInterceptor(rpcLogger, "api")],
80
216
  });
81
217
  const pipelineTransport = createConnectTransport({
82
218
  baseUrl: config.pipelineEndpoint,
83
219
  httpVersion: "1.1",
84
- interceptors: [loggingInterceptor(rpcLogger, "pipeline")],
220
+ interceptors: [tracingLoggingInterceptor(rpcLogger, "pipeline")],
85
221
  });
86
222
  return {
87
223
  internalUploads: createClient(InternalUploadsService, apiTransport),
88
224
  integrationQueue: createClient(IntegrationQueueService, pipelineTransport),
89
225
  };
90
226
  }
227
+
228
+ // keep otelContext import alive for future extensions that may want to
229
+ // detach span scopes around streaming responses.
230
+ void otelContext;