@xeonr/upload-pool-sdk 1.3.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/pool.ts CHANGED
@@ -12,6 +12,7 @@
12
12
  import { hostname } from "node:os";
13
13
  import { randomBytes } from "node:crypto";
14
14
  import { create } from "@bufbuild/protobuf";
15
+ import { SpanStatusCode } from "@opentelemetry/api";
15
16
  import {
16
17
  AcceptJobRequestSchema,
17
18
  CompleteJobRequestSchema,
@@ -22,6 +23,14 @@ import { SseClient } from "./sse-client.js";
22
23
  import { createJobContext, type JobEnvelope } from "./job-context.js";
23
24
  import { NonRetryableError } from "./errors.js";
24
25
  import { JsonLogger, type Logger } from "./logger.js";
26
+ import {
27
+ initTracing,
28
+ shutdownTracing,
29
+ recordSpanError,
30
+ stampJobAttributes,
31
+ withJobContext,
32
+ type TracingHandle,
33
+ } from "./tracing.js";
25
34
  import type {
26
35
  JobContext,
27
36
  JobHandler,
@@ -33,6 +42,7 @@ export class Pool {
33
42
  private readonly rpc: RpcClients;
34
43
  private readonly sse: SseClient;
35
44
  private readonly logger: Logger;
45
+ private readonly tracing: TracingHandle;
36
46
  private inFlight = 0;
37
47
  private readonly workerId: string;
38
48
  private readonly capabilities: string[];
@@ -50,6 +60,19 @@ export class Pool {
50
60
  this.logger = (config.logger ?? new JsonLogger()).child({
51
61
  workerId: this.workerId,
52
62
  });
63
+
64
+ // Initialise tracing *before* RPC clients so the interceptor in
65
+ // rpc-clients picks up an active TracerProvider on its first call.
66
+ // Spans go to pipeline-api's /v1/traces receiver; see tracing.ts
67
+ // for the isolation rationale wrt the host app's OTel pipeline.
68
+ this.tracing = initTracing({
69
+ pipelineEndpoint: config.pipelineEndpoint,
70
+ poolToken: config.token,
71
+ workerId: this.workerId,
72
+ sdkVersion: SDK_VERSION,
73
+ enabled: config.tracing?.enabled,
74
+ });
75
+
53
76
  this.rpc = createRpcClients({
54
77
  apiEndpoint: config.apiEndpoint,
55
78
  pipelineEndpoint: config.pipelineEndpoint,
@@ -129,6 +152,9 @@ export class Pool {
129
152
  clearInterval(this.keepAliveInterval);
130
153
  this.keepAliveInterval = null;
131
154
  }
155
+ // Flush any buffered spans before exit so the final job's traces
156
+ // land in the admin UI even on a clean SIGTERM.
157
+ await shutdownTracing();
132
158
  if (this.runningResolve) {
133
159
  this.runningResolve();
134
160
  this.runningResolve = null;
@@ -154,61 +180,146 @@ export class Pool {
154
180
  return;
155
181
  }
156
182
  this.inFlight++;
157
- jobLogger.info("job.dispatched", {
158
- filename: envelope.filename,
159
- mimeType: envelope.mimeType,
160
- inFlight: this.inFlight,
161
- });
162
183
 
163
- const handler = this.resolveHandler(envelope.contentTypeContext.urn);
164
- const ctx = createJobContext(envelope, this.rpc, jobLogger);
165
- const startedAt = Date.now();
184
+ // Root the entire job under iq.job, parented to the dispatching
185
+ // activity's span via the traceparent the pipeline-worker stamped
186
+ // onto the envelope. All ctx.* + rpc.* spans created inside the
187
+ // handler will nest under this via OTel's AsyncLocalStorage-backed
188
+ // context manager.
189
+ const parentCtx = this.tracing.contextFromEnvelope(
190
+ envelope.traceContext ?? "",
191
+ envelope.wfRunId ?? "",
192
+ );
166
193
 
167
- try {
168
- // AcceptJob clears the pipeline's accept-timeout.
169
- await this.rpc.integrationQueue.acceptJob(
170
- create(AcceptJobRequestSchema, {
194
+ // Carry the job's identifiers in the active OTel context so
195
+ // stampJobAttributes() called by child spans (interceptors,
196
+ // ctx.* spans) can read them. OTel deliberately doesn't expose
197
+ // parent-span attributes back to child-creation sites — context
198
+ // values are the canonical cross-span carrier.
199
+ const jobAttrs = {
200
+ runId: envelope.wfRunId ?? "",
201
+ jobId: envelope.jobId,
202
+ uploadId: envelope.uploadId,
203
+ urn: envelope.contentTypeContext.urn,
204
+ workerId: this.workerId,
205
+ };
206
+
207
+ await withJobContext(jobAttrs, () =>
208
+ this.tracing.tracer.startActiveSpan(
209
+ "iq.job",
210
+ {
211
+ attributes: {
212
+ "iq.filename": envelope.filename,
213
+ "iq.mime_type": envelope.mimeType,
214
+ },
215
+ },
216
+ parentCtx,
217
+ async (rootSpan) => {
218
+ stampJobAttributes(rootSpan, {
219
+ runId: envelope.wfRunId ?? "",
171
220
  jobId: envelope.jobId,
221
+ uploadId: envelope.uploadId,
222
+ urn: envelope.contentTypeContext.urn,
172
223
  workerId: this.workerId,
173
- queueToken: this.config.token,
174
- }),
175
- );
176
- jobLogger.info("job.accepted");
224
+ });
177
225
 
178
- if (!handler) {
179
- jobLogger.warn("job.unhandled", {
180
- availableHandlers: this.capabilities,
226
+ jobLogger.info("job.dispatched", {
227
+ filename: envelope.filename,
228
+ mimeType: envelope.mimeType,
229
+ inFlight: this.inFlight,
181
230
  });
182
- await this.reportError(
183
- envelope.jobId,
184
- ctx,
185
- jobLogger,
186
- new NonRetryableError(`no handler for URN ${envelope.contentTypeContext.urn}`),
187
- );
188
- return;
189
- }
190
-
191
- await handler(ctx);
192
-
193
- await this.rpc.integrationQueue.completeJob(
194
- create(CompleteJobRequestSchema, {
195
- jobId: envelope.jobId,
231
+
232
+ const handler = this.resolveHandler(envelope.contentTypeContext.urn);
233
+ const ctx = createJobContext(envelope, this.rpc, jobLogger, {
196
234
  workerId: this.workerId,
197
- queueToken: this.config.token,
198
- }),
199
- );
200
- jobLogger.info("job.completed", {
201
- durationMs: Date.now() - startedAt,
202
- });
203
- } catch (err) {
204
- jobLogger.error("job.failed", {
205
- err,
206
- durationMs: Date.now() - startedAt,
207
- });
208
- await this.reportError(envelope.jobId, ctx, jobLogger, err as Error);
209
- } finally {
210
- this.inFlight--;
211
- }
235
+ });
236
+ const startedAt = Date.now();
237
+
238
+ try {
239
+ // AcceptJob — clears the pipeline's accept-timeout.
240
+ await this.rpc.integrationQueue.acceptJob(
241
+ create(AcceptJobRequestSchema, {
242
+ jobId: envelope.jobId,
243
+ workerId: this.workerId,
244
+ queueToken: this.config.token,
245
+ }),
246
+ );
247
+ jobLogger.info("job.accepted");
248
+
249
+ if (!handler) {
250
+ jobLogger.warn("job.unhandled", {
251
+ availableHandlers: this.capabilities,
252
+ });
253
+ await this.reportError(
254
+ envelope.jobId,
255
+ ctx,
256
+ jobLogger,
257
+ new NonRetryableError(
258
+ `no handler for URN ${envelope.contentTypeContext.urn}`,
259
+ ),
260
+ );
261
+ rootSpan.setStatus({
262
+ code: SpanStatusCode.ERROR,
263
+ message: "no handler",
264
+ });
265
+ return;
266
+ }
267
+
268
+ // iq.handler wraps the customer's handler function so its
269
+ // own latency is visible as a single row (excluding the
270
+ // accept/complete bookends). Children appear nested.
271
+ await this.tracing.tracer.startActiveSpan(
272
+ "iq.handler",
273
+ async (handlerSpan) => {
274
+ stampJobAttributes(handlerSpan, {
275
+ runId: envelope.wfRunId ?? "",
276
+ jobId: envelope.jobId,
277
+ uploadId: envelope.uploadId,
278
+ urn: envelope.contentTypeContext.urn,
279
+ workerId: this.workerId,
280
+ });
281
+ try {
282
+ await handler(ctx);
283
+ handlerSpan.setStatus({ code: SpanStatusCode.OK });
284
+ } catch (err) {
285
+ recordSpanError(handlerSpan, err);
286
+ throw err;
287
+ } finally {
288
+ handlerSpan.end();
289
+ }
290
+ },
291
+ );
292
+
293
+ await this.rpc.integrationQueue.completeJob(
294
+ create(CompleteJobRequestSchema, {
295
+ jobId: envelope.jobId,
296
+ workerId: this.workerId,
297
+ queueToken: this.config.token,
298
+ }),
299
+ );
300
+ jobLogger.info("job.completed", {
301
+ durationMs: Date.now() - startedAt,
302
+ });
303
+ rootSpan.setStatus({ code: SpanStatusCode.OK });
304
+ } catch (err) {
305
+ jobLogger.error("job.failed", {
306
+ err,
307
+ durationMs: Date.now() - startedAt,
308
+ });
309
+ recordSpanError(rootSpan, err);
310
+ await this.reportError(
311
+ envelope.jobId,
312
+ ctx,
313
+ jobLogger,
314
+ err as Error,
315
+ );
316
+ } finally {
317
+ rootSpan.end();
318
+ this.inFlight--;
319
+ }
320
+ },
321
+ ),
322
+ );
212
323
  }
213
324
 
214
325
  private resolveHandler(urn: string): JobHandler | undefined {
@@ -241,4 +352,4 @@ export class Pool {
241
352
  }
242
353
  }
243
354
 
244
- const SDK_VERSION = "1.3.0";
355
+ const SDK_VERSION = "1.4.1";
@@ -13,10 +13,20 @@
13
13
  * pipeline-api) so we keep them on separate transports and don't try to
14
14
  * share a baseUrl.
15
15
  *
16
- * Each request is wrapped in a logging interceptor that emits one
17
- * `rpc.request` line on dispatch and one of `rpc.response` /
18
- * `rpc.error` on completion. Latency and connect-error codes are
19
- * captured so failed RPCs are diagnosable from worker logs alone.
16
+ * Each request is wrapped in a tracing-and-logging interceptor:
17
+ * - emits one `iq.callback.<Method>` span per call, child of the
18
+ * active job span (so the admin UI sees a row per RPC under the
19
+ * dispatching parseUpload run)
20
+ * - injects W3C `traceparent` on outbound headers *for the pipeline
21
+ * transport only* — calls to `apiEndpoint` ship without traceparent
22
+ * so upl-im-api's own OTel pipeline doesn't pick up our trace_id
23
+ * and conflate the two backends
24
+ * - captures sanitised request/response bodies as span events (4 KB
25
+ * cap, keys matching token/secret/auth/... redacted), so debugging
26
+ * a worker callback failure doesn't require correlating with
27
+ * server-side logs
28
+ * - logs `rpc.request` / `rpc.response` at debug and `rpc.error` at error
29
+ * for the legacy log-only consumer surface that predates tracing
20
30
  */
21
31
  import { createConnectTransport } from "@connectrpc/connect-node";
22
32
  import {
@@ -26,41 +36,149 @@ import {
26
36
  ConnectError,
27
37
  Code,
28
38
  } from "@connectrpc/connect";
39
+ import {
40
+ context as otelContext,
41
+ SpanStatusCode,
42
+ trace,
43
+ } from "@opentelemetry/api";
29
44
  import { InternalUploadsService } from "./protocol/uplim/api/v1/uploads_pb.js";
30
45
  import { IntegrationQueueService } from "./protocol/uplim/workflow/v1/integration_queue_pb.js";
31
46
  import type { Logger } from "./logger.js";
47
+ import {
48
+ SPAN_ATTR,
49
+ getTracingHandle,
50
+ recordSpanError,
51
+ sanitizeRpcBody,
52
+ spanContextToTraceparent,
53
+ stampJobAttributes,
54
+ } from "./tracing.js";
32
55
 
33
56
  export interface RpcClients {
34
57
  internalUploads: Client<typeof InternalUploadsService>;
35
58
  integrationQueue: Client<typeof IntegrationQueueService>;
36
59
  }
37
60
 
38
- function loggingInterceptor(logger: Logger, target: string): Interceptor {
61
+ type RpcTarget = "api" | "pipeline";
62
+
63
+ function tracingLoggingInterceptor(
64
+ logger: Logger,
65
+ target: RpcTarget,
66
+ ): Interceptor {
39
67
  return (next) => async (req) => {
40
- const startedAt = Date.now();
68
+ const handle = getTracingHandle();
41
69
  const method = `${req.service.typeName}/${req.method.name}`;
70
+
71
+ // Inject W3C traceparent on the pipeline target *only*. upl-im-api
72
+ // runs its own OTel pipeline against a different backend; passing
73
+ // our trace_id over would graft our worker activity into its trees,
74
+ // muddying its admin dashboards. The pool token in the request
75
+ // body / queue token in headers remain the trust boundary either
76
+ // way.
77
+ if (target === "pipeline") {
78
+ const activeSpan = trace.getActiveSpan();
79
+ if (activeSpan) {
80
+ req.header.set(
81
+ "traceparent",
82
+ spanContextToTraceparent(activeSpan.spanContext()),
83
+ );
84
+ }
85
+ }
86
+
87
+ const startedAt = Date.now();
42
88
  logger.debug("rpc.request", { method, target });
43
- try {
44
- const res = await next(req);
45
- logger.debug("rpc.response", {
46
- method,
47
- target,
48
- durationMs: Date.now() - startedAt,
49
- });
50
- return res;
51
- } catch (err) {
52
- const code =
53
- err instanceof ConnectError ? Code[err.code] : undefined;
54
- const message = err instanceof Error ? err.message : String(err);
55
- logger.error("rpc.error", {
56
- method,
57
- target,
58
- code,
59
- message,
60
- durationMs: Date.now() - startedAt,
61
- });
62
- throw err;
89
+
90
+ // When tracing is uninitialised (e.g. tests, dry-runs), fall through
91
+ // to a logging-only path so we don't crash on a null tracer.
92
+ if (!handle) {
93
+ try {
94
+ const res = await next(req);
95
+ logger.debug("rpc.response", {
96
+ method,
97
+ target,
98
+ durationMs: Date.now() - startedAt,
99
+ });
100
+ return res;
101
+ } catch (err) {
102
+ const code =
103
+ err instanceof ConnectError ? Code[err.code] : undefined;
104
+ const message = err instanceof Error ? err.message : String(err);
105
+ logger.error("rpc.error", {
106
+ method,
107
+ target,
108
+ code,
109
+ message,
110
+ durationMs: Date.now() - startedAt,
111
+ });
112
+ throw err;
113
+ }
63
114
  }
115
+
116
+ return handle.tracer.startActiveSpan(
117
+ `iq.callback.${req.method.name}`,
118
+ async (span) => {
119
+ span.setAttribute(SPAN_ATTR.RPC_METHOD, method);
120
+ span.setAttribute(SPAN_ATTR.RPC_SERVICE, req.service.typeName);
121
+ span.setAttribute(SPAN_ATTR.RPC_TARGET, target);
122
+
123
+ // Job-scoped attrs (runId, jobId, ...) pulled from the
124
+ // active OTel context — set once by Pool.handleDispatch
125
+ // via withJobContext. The receiver requires pipeline.run_id
126
+ // or the span is dropped, so this stamp is load-bearing.
127
+ stampJobAttributes(span);
128
+
129
+ // Body capture: request first, response on success, error
130
+ // payload on failure. Sanitiser truncates to 4 KB and redacts
131
+ // token-shaped keys (see tracing.ts).
132
+ try {
133
+ span.addEvent("rpc.request", {
134
+ body: sanitizeRpcBody(req.message),
135
+ });
136
+ } catch {
137
+ /* never let body capture crash the RPC */
138
+ }
139
+
140
+ try {
141
+ const res = await next(req);
142
+ const durationMs = Date.now() - startedAt;
143
+ span.setAttribute(SPAN_ATTR.RPC_DURATION_MS, durationMs);
144
+ span.setAttribute(SPAN_ATTR.RPC_CODE, "ok");
145
+ try {
146
+ span.addEvent("rpc.response", {
147
+ body: sanitizeRpcBody(res.message),
148
+ });
149
+ } catch {
150
+ /* body capture is best-effort */
151
+ }
152
+ span.setStatus({ code: SpanStatusCode.OK });
153
+ logger.debug("rpc.response", {
154
+ method,
155
+ target,
156
+ durationMs,
157
+ });
158
+ return res;
159
+ } catch (err) {
160
+ const durationMs = Date.now() - startedAt;
161
+ const code =
162
+ err instanceof ConnectError ? Code[err.code] : "internal";
163
+ const message =
164
+ err instanceof Error ? err.message : String(err);
165
+ span.setAttribute(SPAN_ATTR.RPC_DURATION_MS, durationMs);
166
+ span.setAttribute(SPAN_ATTR.RPC_CODE, String(code));
167
+ span.addEvent("rpc.error", { code: String(code), message });
168
+ recordSpanError(span, err);
169
+ logger.error("rpc.error", {
170
+ method,
171
+ target,
172
+ code,
173
+ message,
174
+ durationMs,
175
+ });
176
+ throw err;
177
+ } finally {
178
+ span.end();
179
+ }
180
+ },
181
+ );
64
182
  };
65
183
  }
66
184
 
@@ -76,15 +194,19 @@ export function createRpcClients(config: RpcClientsConfig): RpcClients {
76
194
  const apiTransport = createConnectTransport({
77
195
  baseUrl: config.apiEndpoint,
78
196
  httpVersion: "1.1",
79
- interceptors: [loggingInterceptor(rpcLogger, "api")],
197
+ interceptors: [tracingLoggingInterceptor(rpcLogger, "api")],
80
198
  });
81
199
  const pipelineTransport = createConnectTransport({
82
200
  baseUrl: config.pipelineEndpoint,
83
201
  httpVersion: "1.1",
84
- interceptors: [loggingInterceptor(rpcLogger, "pipeline")],
202
+ interceptors: [tracingLoggingInterceptor(rpcLogger, "pipeline")],
85
203
  });
86
204
  return {
87
205
  internalUploads: createClient(InternalUploadsService, apiTransport),
88
206
  integrationQueue: createClient(IntegrationQueueService, pipelineTransport),
89
207
  };
90
208
  }
209
+
210
+ // keep otelContext import alive for future extensions that may want to
211
+ // detach span scopes around streaming responses.
212
+ void otelContext;