@drarzter/kafka-client 0.5.7 → 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/core.js CHANGED
@@ -79,7 +79,8 @@ function decodeHeaders(raw) {
  for (const [key, value] of Object.entries(raw)) {
  if (value === void 0) continue;
  if (Array.isArray(value)) {
- result[key] = value.map((v) => Buffer.isBuffer(v) ? v.toString() : v).join(",");
+ const items = value.map((v) => Buffer.isBuffer(v) ? v.toString() : v);
+ result[key] = items[items.length - 1] ?? "";
  } else {
  result[key] = Buffer.isBuffer(value) ? value.toString() : value;
  }
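
Note on the hunk above: array header values are no longer comma-joined; the last element wins (or "" for an empty array). A rough before/after sketch, assuming a raw header map as handed to the internal decodeHeaders:

    // illustrative only; decodeHeaders is internal to dist/core.js
    const raw = { "x-trace-id": [Buffer.from("a"), Buffer.from("b")] };
    // 0.5.7: decoded["x-trace-id"] === "a,b"  (all values joined with ",")
    // 0.6.3: decoded["x-trace-id"] === "b"    (last value wins, "" if the array is empty)
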
@@ -141,17 +142,23 @@ function resolveTopicName(topicOrDescriptor) {
  }
  return String(topicOrDescriptor);
  }
- function registerSchema(topicOrDesc, schemaRegistry) {
+ function registerSchema(topicOrDesc, schemaRegistry, logger) {
  if (topicOrDesc?.__schema) {
  const topic2 = resolveTopicName(topicOrDesc);
+ const existing = schemaRegistry.get(topic2);
+ if (existing && existing !== topicOrDesc.__schema) {
+ logger?.warn(
+ `Schema conflict for topic "${topic2}": a different schema is already registered. Using the new schema \u2014 ensure consistent schemas to avoid silent validation mismatches.`
+ );
+ }
  schemaRegistry.set(topic2, topicOrDesc.__schema);
  }
  }
- async function validateMessage(topicOrDesc, message, deps) {
+ async function validateMessage(topicOrDesc, message, deps, ctx) {
  const topicName = resolveTopicName(topicOrDesc);
  if (topicOrDesc?.__schema) {
  try {
- return await topicOrDesc.__schema.parse(message);
+ return await topicOrDesc.__schema.parse(message, ctx);
  } catch (error) {
  throw new KafkaValidationError(topicName, message, {
  cause: error instanceof Error ? error : new Error(String(error))
@@ -162,7 +169,7 @@ async function validateMessage(topicOrDesc, message, deps) {
  const schema = deps.schemaRegistry.get(topicOrDesc);
  if (schema) {
  try {
- return await schema.parse(message);
+ return await schema.parse(message, ctx);
  } catch (error) {
  throw new KafkaValidationError(topicName, message, {
  cause: error instanceof Error ? error : new Error(String(error))
@@ -185,9 +192,14 @@ async function buildSendPayload(topicOrDesc, messages, deps) {
  for (const inst of deps.instrumentation) {
  inst.beforeSend?.(topic2, envelopeHeaders);
  }
+ const sendCtx = {
+ topic: topic2,
+ headers: envelopeHeaders,
+ version: m.schemaVersion ?? 1
+ };
  return {
  value: JSON.stringify(
- await validateMessage(topicOrDesc, m.value, deps)
+ await validateMessage(topicOrDesc, m.value, deps, sendCtx)
  ),
  key: m.key ?? null,
  headers: envelopeHeaders
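
The hunks above thread a validation context into schema parsing: `parse(message, ctx)` now receives `{ topic, headers, version }`, with `version` taken from the message's `schemaVersion` (default 1) on the send path. A minimal sketch of a version-aware schema object; the exact descriptor shape around `__schema` is not shown in this diff, so treat the wiring as an assumption:

    // hypothetical schema object attached via __schema on a topic descriptor
    const schema = {
      async parse(value, ctx) {
        // ctx = { topic, headers, version } as built in buildSendPayload
        if ((ctx?.version ?? 1) >= 2 && value.email == null) {
          throw new Error(`v${ctx.version} messages on ${ctx.topic} require "email"`);
        }
        return value;
      }
    };
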
@@ -231,19 +243,26 @@ function getOrCreateConsumer(groupId, fromBeginning, autoCommit, deps) {
  consumers.set(groupId, consumer);
  return consumer;
  }
- function buildSchemaMap(topics, schemaRegistry, optionSchemas) {
+ function buildSchemaMap(topics, schemaRegistry, optionSchemas, logger) {
  const schemaMap = /* @__PURE__ */ new Map();
+ const registerChecked = (name, schema) => {
+ const existing = schemaRegistry.get(name);
+ if (existing && existing !== schema) {
+ logger?.warn(
+ `Schema conflict for topic "${name}": a different schema is already registered. Using the new schema \u2014 ensure consistent schemas to avoid silent validation mismatches.`
+ );
+ }
+ schemaMap.set(name, schema);
+ schemaRegistry.set(name, schema);
+ };
  for (const t of topics) {
  if (t?.__schema) {
- const name = resolveTopicName(t);
- schemaMap.set(name, t.__schema);
- schemaRegistry.set(name, t.__schema);
+ registerChecked(resolveTopicName(t), t.__schema);
  }
  }
  if (optionSchemas) {
  for (const [k, v] of optionSchemas) {
- schemaMap.set(k, v);
- schemaRegistry.set(k, v);
+ registerChecked(k, v);
  }
  }
  return schemaMap;
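
With `registerChecked`, registering a different schema object for an already-registered topic now logs a warning instead of silently overwriting (the comparison is by identity, so two structurally equal but distinct objects still trigger it). A rough sketch using the `optionSchemas` map form shown above; `buildSchemaMap` is internal, so this is illustrative only:

    const schemaV1 = { parse: async (v) => v };
    const schemaV2 = { parse: async (v) => v };
    const schemaRegistry = new Map();
    buildSchemaMap([], schemaRegistry, new Map([["user.created", schemaV1]]), console);
    buildSchemaMap([], schemaRegistry, new Map([["user.created", schemaV2]]), console);
    // -> console.warn('Schema conflict for topic "user.created": a different schema is already registered. ...')
    // The newer schema replaces the older one in both the returned schemaMap and the shared registry.
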
@@ -270,8 +289,13 @@ function parseJsonMessage(raw, topic2, logger) {
  async function validateWithSchema(message, raw, topic2, schemaMap, interceptors, dlq, deps) {
  const schema = schemaMap.get(topic2);
  if (!schema) return message;
+ const ctx = {
+ topic: topic2,
+ headers: deps.originalHeaders ?? {},
+ version: Number(deps.originalHeaders?.["x-schema-version"] ?? 1)
+ };
  try {
- return await schema.parse(message);
+ return await schema.parse(message, ctx);
  } catch (error) {
  const err = toError(error);
  const validationError = new KafkaValidationError(topic2, message, {
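
On the consume side the context's `version` is read from the `x-schema-version` header (defaulting to 1). The diff does not show the producer writing that header, so if your schemas branch on `ctx.version` you would need to set it yourself when sending; a hedged sketch using the raw payload shape this file passes to `producer.send`:

    // assumption: headers flow through to the consumer unchanged
    await producer.send({
      topic: "user.created",
      messages: [{
        value: JSON.stringify({ id: 1, email: "a@example.com" }),
        headers: { "x-schema-version": "2" } // surfaces as ctx.version === 2 in schema.parse(message, ctx)
      }]
    });
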
@@ -308,7 +332,7 @@ async function validateWithSchema(message, raw, topic2, schemaMap, interceptors,
  return null;
  }
  }
- async function sendToDlq(topic2, rawMessage, deps, meta) {
+ function buildDlqPayload(topic2, rawMessage, meta) {
  const dlqTopic = `${topic2}.dlq`;
  const headers = {
  ...meta?.originalHeaders ?? {},
@@ -318,54 +342,82 @@ async function sendToDlq(topic2, rawMessage, deps, meta) {
  "x-dlq-error-stack": meta?.error.stack?.slice(0, 2e3) ?? "",
  "x-dlq-attempt-count": String(meta?.attempt ?? 0)
  };
+ return { topic: dlqTopic, messages: [{ value: rawMessage, headers }] };
+ }
+ async function sendToDlq(topic2, rawMessage, deps, meta) {
+ const payload = buildDlqPayload(topic2, rawMessage, meta);
  try {
- await deps.producer.send({
- topic: dlqTopic,
- messages: [{ value: rawMessage, headers }]
- });
- deps.logger.warn(`Message sent to DLQ: ${dlqTopic}`);
+ await deps.producer.send(payload);
+ deps.logger.warn(`Message sent to DLQ: ${payload.topic}`);
  } catch (error) {
- deps.logger.error(
- `Failed to send message to DLQ ${dlqTopic}:`,
- toError(error).stack
- );
+ const err = toError(error);
+ deps.logger.error(`Failed to send message to DLQ ${payload.topic}:`, err.stack);
+ await deps.onMessageLost?.({
+ topic: topic2,
+ error: err,
+ attempt: meta?.attempt ?? 0,
+ headers: meta?.originalHeaders ?? {}
+ });
  }
  }
  var RETRY_HEADER_ATTEMPT = "x-retry-attempt";
  var RETRY_HEADER_AFTER = "x-retry-after";
  var RETRY_HEADER_MAX_RETRIES = "x-retry-max-retries";
  var RETRY_HEADER_ORIGINAL_TOPIC = "x-retry-original-topic";
- async function sendToRetryTopic(originalTopic, rawMessages, attempt, maxRetries, delayMs, originalHeaders, deps) {
+ function buildRetryTopicPayload(originalTopic, rawMessages, attempt, maxRetries, delayMs, originalHeaders) {
  const retryTopic = `${originalTopic}.retry.${attempt}`;
- const {
- [RETRY_HEADER_ATTEMPT]: _a,
- [RETRY_HEADER_AFTER]: _b,
- [RETRY_HEADER_MAX_RETRIES]: _c,
- [RETRY_HEADER_ORIGINAL_TOPIC]: _d,
- ...userHeaders
- } = originalHeaders;
- const headers = {
- ...userHeaders,
- [RETRY_HEADER_ATTEMPT]: String(attempt),
- [RETRY_HEADER_AFTER]: String(Date.now() + delayMs),
- [RETRY_HEADER_MAX_RETRIES]: String(maxRetries),
- [RETRY_HEADER_ORIGINAL_TOPIC]: originalTopic
+ function buildHeaders(hdr) {
+ const {
+ [RETRY_HEADER_ATTEMPT]: _a,
+ [RETRY_HEADER_AFTER]: _b,
+ [RETRY_HEADER_MAX_RETRIES]: _c,
+ [RETRY_HEADER_ORIGINAL_TOPIC]: _d,
+ ...userHeaders
+ } = hdr;
+ return {
+ ...userHeaders,
+ [RETRY_HEADER_ATTEMPT]: String(attempt),
+ [RETRY_HEADER_AFTER]: String(Date.now() + delayMs),
+ [RETRY_HEADER_MAX_RETRIES]: String(maxRetries),
+ [RETRY_HEADER_ORIGINAL_TOPIC]: originalTopic
+ };
+ }
+ return {
+ topic: retryTopic,
+ messages: rawMessages.map((value, i) => ({
+ value,
+ headers: buildHeaders(
+ Array.isArray(originalHeaders) ? originalHeaders[i] ?? {} : originalHeaders
+ )
+ }))
  };
+ }
+ async function sendToRetryTopic(originalTopic, rawMessages, attempt, maxRetries, delayMs, originalHeaders, deps) {
+ const payload = buildRetryTopicPayload(
+ originalTopic,
+ rawMessages,
+ attempt,
+ maxRetries,
+ delayMs,
+ originalHeaders
+ );
  try {
- for (const raw of rawMessages) {
- await deps.producer.send({
- topic: retryTopic,
- messages: [{ value: raw, headers }]
- });
- }
+ await deps.producer.send(payload);
  deps.logger.warn(
- `Message queued in retry topic ${retryTopic} (attempt ${attempt}/${maxRetries})`
+ `Message queued in retry topic ${payload.topic} (attempt ${attempt}/${maxRetries})`
  );
  } catch (error) {
+ const err = toError(error);
  deps.logger.error(
- `Failed to send message to retry topic ${retryTopic}:`,
- toError(error).stack
+ `Failed to send message to retry topic ${payload.topic}:`,
+ err.stack
  );
+ await deps.onMessageLost?.({
+ topic: originalTopic,
+ error: err,
+ attempt,
+ headers: Array.isArray(originalHeaders) ? originalHeaders[0] ?? {} : originalHeaders
+ });
  }
  }
  async function broadcastToInterceptors(envelopes, interceptors, cb) {
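
`sendToDlq` and `sendToRetryTopic` now call `deps.onMessageLost` when the re-publish itself fails, so the application gets a last-resort hook instead of only an error log. A hedged wiring sketch; the class further down has an `onMessageLost` field, but whether it is set via the constructor options exactly like this is an assumption, and `metrics`/`alerting` are placeholders:

    const client = new KafkaClient("orders-svc", "orders-group", ["localhost:9092"], {
      onMessageLost: async ({ topic, error, attempt, headers }) => {
        // invoked when a message could not be written to the DLQ or retry topic
        metrics.increment("kafka.message_lost", { topic, attempt });
        alerting.page(`Kafka message lost on ${topic}: ${error.message}`);
      }
    });
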
@@ -377,11 +429,17 @@ async function broadcastToInterceptors(envelopes, interceptors, cb) {
  }
  async function runHandlerWithPipeline(fn, envelopes, interceptors, instrumentation) {
  const cleanups = [];
+ const wraps = [];
  try {
  for (const env of envelopes) {
  for (const inst of instrumentation) {
- const cleanup = inst.beforeConsume?.(env);
- if (typeof cleanup === "function") cleanups.push(cleanup);
+ const result = inst.beforeConsume?.(env);
+ if (typeof result === "function") {
+ cleanups.push(result);
+ } else if (result) {
+ if (result.cleanup) cleanups.push(result.cleanup);
+ if (result.wrap) wraps.push(result.wrap);
+ }
  }
  }
  for (const env of envelopes) {
@@ -389,7 +447,13 @@ async function runHandlerWithPipeline(fn, envelopes, interceptors, instrumentati
  await interceptor.before?.(env);
  }
  }
- await fn();
+ let runFn = fn;
+ for (let i = wraps.length - 1; i >= 0; i--) {
+ const wrap = wraps[i];
+ const inner = runFn;
+ runFn = () => wrap(inner);
+ }
+ await runFn();
  for (const env of envelopes) {
  for (const interceptor of interceptors) {
  await interceptor.after?.(env);
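
`beforeConsume` instrumentation can now return either a plain cleanup function (as before) or an object with optional `cleanup` and `wrap` members; wraps are composed right to left around the handler call. A minimal sketch of a timing hook using the new shape (the envelope fields referenced here are assumptions):

    const timingInstrumentation = {
      beforeConsume(envelope) {
        const start = Date.now();
        return {
          // wrap receives the already-composed inner handler and must invoke it
          wrap: async (next) => {
            try {
              return await next();
            } finally {
              console.log(`handled ${envelope.topic ?? "message"} in ${Date.now() - start}ms`);
            }
          },
          cleanup: () => { /* release per-message resources here */ }
        };
      }
    };
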
@@ -459,7 +523,7 @@ async function executeWithRetry(fn, ctx, deps) {
  1,
  retry.maxRetries,
  delay,
- envelopes[0]?.headers ?? {},
+ isBatch ? envelopes.map((e) => e.headers) : envelopes[0]?.headers ?? {},
  deps
  );
  } else if (isLastAttempt) {
@@ -561,6 +625,7 @@ async function handleEachBatch(payload, opts, deps) {
  interceptors,
  dlq,
  retry,
+ retryTopics,
  timeoutMs,
  wrapWithTimeout
  } = opts;
@@ -595,11 +660,12 @@ async function handleEachBatch(payload, opts, deps) {
  },
  {
  envelope: envelopes,
- rawMessages: batch.messages.filter((m) => m.value).map((m) => m.value.toString()),
+ rawMessages,
  interceptors,
  dlq,
  retry,
- isBatch: true
+ isBatch: true,
+ retryTopics
  },
  deps
  );
@@ -616,10 +682,11 @@ async function subscribeWithRetry(consumer, topics, logger, retryOpts) {
  } catch (error) {
  if (attempt === maxAttempts) throw error;
  const msg = toError(error).message;
+ const delay = Math.floor(Math.random() * backoffMs);
  logger.warn(
- `Failed to subscribe to [${topics.join(", ")}] (attempt ${attempt}/${maxAttempts}): ${msg}. Retrying in ${backoffMs}ms...`
+ `Failed to subscribe to [${topics.join(", ")}] (attempt ${attempt}/${maxAttempts}): ${msg}. Retrying in ${delay}ms...`
  );
- await sleep(backoffMs);
+ await sleep(delay);
  }
  }
  }
@@ -648,7 +715,8 @@ async function startLevelConsumer(level, levelTopics, levelGroupId, originalTopi
  onMessageLost,
  ensureTopic,
  getOrCreateConsumer: getOrCreateConsumer2,
- runningConsumers
+ runningConsumers,
+ createRetryTxProducer
  } = deps;
  const backoffMs = retry.backoffMs ?? 1e3;
  const maxBackoffMs = retry.maxBackoffMs ?? 3e4;
@@ -656,6 +724,7 @@ async function startLevelConsumer(level, levelTopics, levelGroupId, originalTopi
  for (const lt of levelTopics) {
  await ensureTopic(lt);
  }
+ const levelTxProducer = await createRetryTxProducer(`${levelGroupId}-tx`);
  const consumer = getOrCreateConsumer2(levelGroupId, false, false);
  await consumer.connect();
  await subscribeWithRetry(consumer, levelTopics, logger);
@@ -744,22 +813,67 @@ async function startLevelConsumer(level, levelTopics, levelGroupId, originalTopi
  const nextLevel = level + 1;
  const cap = Math.min(backoffMs * 2 ** level, maxBackoffMs);
  const delay = Math.floor(Math.random() * cap);
- await sendToRetryTopic(
+ const { topic: rtTopic, messages: rtMsgs } = buildRetryTopicPayload(
  originalTopic,
  [raw],
  nextLevel,
  currentMaxRetries,
  delay,
- headers,
- pipelineDeps
+ headers
  );
+ const tx = await levelTxProducer.transaction();
+ try {
+ await tx.send({ topic: rtTopic, messages: rtMsgs });
+ await tx.sendOffsets({
+ consumer,
+ topics: [{ topic: nextOffset.topic, partitions: [{ partition: nextOffset.partition, offset: nextOffset.offset }] }]
+ });
+ await tx.commit();
+ logger.warn(
+ `Message routed to ${rtTopic} (EOS, level ${nextLevel}/${currentMaxRetries})`
+ );
+ } catch (txErr) {
+ try {
+ await tx.abort();
+ } catch {
+ }
+ logger.error(
+ `EOS routing to ${rtTopic} failed \u2014 message will be redelivered:`,
+ toError(txErr).stack
+ );
+ return;
+ }
  } else if (dlq) {
- await sendToDlq(originalTopic, raw, pipelineDeps, {
- error,
- // +1 to account for the main consumer's initial attempt before routing.
- attempt: level + 1,
- originalHeaders: headers
- });
+ const { topic: dTopic, messages: dMsgs } = buildDlqPayload(
+ originalTopic,
+ raw,
+ {
+ error,
+ // +1 to account for the main consumer's initial attempt before routing.
+ attempt: level + 1,
+ originalHeaders: headers
+ }
+ );
+ const tx = await levelTxProducer.transaction();
+ try {
+ await tx.send({ topic: dTopic, messages: dMsgs });
+ await tx.sendOffsets({
+ consumer,
+ topics: [{ topic: nextOffset.topic, partitions: [{ partition: nextOffset.partition, offset: nextOffset.offset }] }]
+ });
+ await tx.commit();
+ logger.warn(`Message sent to DLQ: ${dTopic} (EOS)`);
+ } catch (txErr) {
+ try {
+ await tx.abort();
+ } catch {
+ }
+ logger.error(
+ `EOS DLQ routing to ${dTopic} failed \u2014 message will be redelivered:`,
+ toError(txErr).stack
+ );
+ return;
+ }
  } else {
  await onMessageLost?.({
  topic: originalTopic,
879
  topic: originalTopic,
@@ -767,8 +881,8 @@ async function startLevelConsumer(level, levelTopics, levelGroupId, originalTopi
767
881
  attempt: level,
768
882
  headers
769
883
  });
884
+ await consumer.commitOffsets([nextOffset]);
770
885
  }
771
- await consumer.commitOffsets([nextOffset]);
772
886
  }
773
887
  });
774
888
  runningConsumers.set(levelGroupId, "eachMessage");
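
Retry-level consumers now route failures with exactly-once semantics: the produce to the next retry topic (or DLQ) and the offset advance are committed in one Kafka transaction, and on failure the transaction is aborted so the message is redelivered instead of being dropped or duplicated. The core pattern, stripped of the library plumbing (variable names here are placeholders):

    // consume-transform-produce with offsets in the same transaction
    const tx = await txProducer.transaction();
    try {
      await tx.send({ topic: "orders.retry.2", messages: [{ value: raw, headers }] });
      await tx.sendOffsets({
        consumer,
        topics: [{ topic: "orders.retry.1", partitions: [{ partition, offset: nextOffset }] }]
      });
      await tx.commit(); // produce and offset advance become visible atomically
    } catch (err) {
      await tx.abort().catch(() => {}); // uncommitted offset => the message is redelivered
      throw err;
    }
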
@@ -806,6 +920,7 @@ var KafkaClient = class {
  kafka;
  producer;
  txProducer;
+ retryTxProducers = /* @__PURE__ */ new Set();
  consumers = /* @__PURE__ */ new Map();
  admin;
  logger;
@@ -823,6 +938,8 @@ var KafkaClient = class {
  onMessageLost;
  onRebalance;
  isAdminConnected = false;
+ inFlightTotal = 0;
+ drainResolvers = [];
  clientId;
  constructor(clientId, groupId, brokers, options) {
  this.clientId = clientId;
@@ -830,7 +947,8 @@ var KafkaClient = class {
  this.logger = options?.logger ?? {
  log: (msg) => console.log(`[KafkaClient:${clientId}] ${msg}`),
  warn: (msg, ...args) => console.warn(`[KafkaClient:${clientId}] ${msg}`, ...args),
- error: (msg, ...args) => console.error(`[KafkaClient:${clientId}] ${msg}`, ...args)
+ error: (msg, ...args) => console.error(`[KafkaClient:${clientId}] ${msg}`, ...args),
+ debug: (msg, ...args) => console.debug(`[KafkaClient:${clientId}] ${msg}`, ...args)
  };
  this.autoCreateTopicsEnabled = options?.autoCreateTopics ?? false;
  this.strictSchemasEnabled = options?.strictSchemas ?? true;
@@ -874,7 +992,7 @@ var KafkaClient = class {
  /** Execute multiple sends atomically. Commits on success, aborts on error. */
  async transaction(fn) {
  if (!this.txProducer) {
- this.txProducer = this.kafka.producer({
+ const p = this.kafka.producer({
  kafkaJS: {
  acks: -1,
  idempotent: true,
@@ -882,7 +1000,8 @@ var KafkaClient = class {
  maxInFlightRequests: 1
  }
  });
- await this.txProducer.connect();
+ await p.connect();
+ this.txProducer = p;
  }
  const tx = await this.txProducer.transaction();
  try {
@@ -899,9 +1018,12 @@ var KafkaClient = class {
  }
  ]);
  await tx.send(payload);
+ this.notifyAfterSend(payload.topic, payload.messages.length);
  },
  sendBatch: async (topicOrDesc, messages) => {
- await tx.send(await this.preparePayload(topicOrDesc, messages));
+ const payload = await this.preparePayload(topicOrDesc, messages);
+ await tx.send(payload);
+ this.notifyAfterSend(payload.topic, payload.messages.length);
  }
  };
  await fn(ctx);
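
The transactional producer is now assigned only after `connect()` succeeds, so a failed connect no longer leaves a half-initialized `txProducer` behind, and transaction-context sends now go through `notifyAfterSend`. A hedged usage sketch of the transaction context; `sendBatch(topicOrDesc, messages)` is taken from this hunk, while the topic descriptors are placeholders:

    await client.transaction(async (ctx) => {
      await ctx.sendBatch(ordersTopic, [{ key: "42", value: { id: 42, status: "paid" } }]);
      await ctx.sendBatch(auditTopic, [{ value: { event: "order.paid", orderId: 42 } }]);
    });
    // Both sends commit together; if the callback throws, the transaction is aborted.
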
@@ -938,23 +1060,28 @@ var KafkaClient = class {
  const deps = this.messageDeps;
  const timeoutMs = options.handlerTimeoutMs;
  await consumer.run({
- eachMessage: (payload) => handleEachMessage(
- payload,
- {
- schemaMap,
- handleMessage,
- interceptors,
- dlq,
- retry,
- retryTopics: options.retryTopics,
- timeoutMs,
- wrapWithTimeout: this.wrapWithTimeoutWarning.bind(this)
- },
- deps
+ eachMessage: (payload) => this.trackInFlight(
+ () => handleEachMessage(
+ payload,
+ {
+ schemaMap,
+ handleMessage,
+ interceptors,
+ dlq,
+ retry,
+ retryTopics: options.retryTopics,
+ timeoutMs,
+ wrapWithTimeout: this.wrapWithTimeoutWarning.bind(this)
+ },
+ deps
+ )
  )
  });
  this.runningConsumers.set(gid, "eachMessage");
  if (options.retryTopics && retry) {
+ if (!this.autoCreateTopicsEnabled) {
+ await this.validateRetryTopicsExist(topicNames, retry.maxRetries);
+ }
  const companions = await startRetryTopicConsumers(
  topicNames,
  gid,
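
Per-message consumers now wrap handler execution in `trackInFlight` (used by the drain logic added further down), and when `retryTopics` is enabled with `autoCreateTopics: false` the required `<topic>.retry.<level>` topics are checked at startup. A hedged options sketch; the method name and exact handler signature for the per-message consumer are not visible in this hunk, so they are assumptions:

    // With autoCreateTopics: false these topics must already exist, or startup throws:
    //   orders.created.retry.1, orders.created.retry.2, and orders.created.dlq (when dlq is on)
    await client.startConsumer(["orders.created"], handleOrderCreated, {
      retry: { maxRetries: 2 },
      retryTopics: true,
      dlq: true
    });
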
@@ -971,25 +1098,65 @@ var KafkaClient = class {
  return { groupId: gid, stop: () => this.stopConsumer(gid) };
  }
  async startBatchConsumer(topics, handleBatch, options = {}) {
- const { consumer, schemaMap, gid, dlq, interceptors, retry } = await this.setupConsumer(topics, "eachBatch", options);
+ if (options.retryTopics && !options.retry) {
+ throw new Error(
+ "retryTopics requires retry to be configured \u2014 set retry.maxRetries to enable the retry topic chain"
+ );
+ }
+ if (options.autoCommit !== false) {
+ this.logger.debug?.(
+ `startBatchConsumer: autoCommit is enabled (default true). If your handler calls resolveOffset() or commitOffsetsIfNecessary(), set autoCommit: false to avoid offset conflicts.`
+ );
+ }
+ const { consumer, schemaMap, topicNames, gid, dlq, interceptors, retry } = await this.setupConsumer(topics, "eachBatch", options);
  const deps = this.messageDeps;
  const timeoutMs = options.handlerTimeoutMs;
  await consumer.run({
- eachBatch: (payload) => handleEachBatch(
- payload,
- {
- schemaMap,
- handleBatch,
- interceptors,
- dlq,
- retry,
- timeoutMs,
- wrapWithTimeout: this.wrapWithTimeoutWarning.bind(this)
- },
- deps
+ eachBatch: (payload) => this.trackInFlight(
+ () => handleEachBatch(
+ payload,
+ {
+ schemaMap,
+ handleBatch,
+ interceptors,
+ dlq,
+ retry,
+ retryTopics: options.retryTopics,
+ timeoutMs,
+ wrapWithTimeout: this.wrapWithTimeoutWarning.bind(this)
+ },
+ deps
+ )
  )
  });
  this.runningConsumers.set(gid, "eachBatch");
+ if (options.retryTopics && retry) {
+ if (!this.autoCreateTopicsEnabled) {
+ await this.validateRetryTopicsExist(topicNames, retry.maxRetries);
+ }
+ const handleMessageForRetry = (env) => handleBatch([env], {
+ partition: env.partition,
+ highWatermark: env.offset,
+ heartbeat: async () => {
+ },
+ resolveOffset: () => {
+ },
+ commitOffsetsIfNecessary: async () => {
+ }
+ });
+ const companions = await startRetryTopicConsumers(
+ topicNames,
+ gid,
+ handleMessageForRetry,
+ retry,
+ dlq,
+ interceptors,
+ schemaMap,
+ this.retryTopicDeps,
+ options.retryTopicAssignmentTimeoutMs
+ );
+ this.companionGroupIds.set(gid, companions);
+ }
  return { groupId: gid, stop: () => this.stopConsumer(gid) };
  }
  // ── Consumer lifecycle ───────────────────────────────────────────
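
`startBatchConsumer` now fails fast when `retryTopics` is set without a `retry` config, emits a debug hint when `autoCommit` is left on, and gains the same retry-topic companion consumers as the per-message path, replaying failed messages into the batch handler one envelope at a time. A hedged usage sketch with the option names from this hunk (the handler's second argument mirrors the shim above; `processOrder` is a placeholder):

    await client.startBatchConsumer(["orders.created"], async (envelopes, batchCtx) => {
      for (const env of envelopes) {
        await processOrder(env);              // hypothetical application handler
        batchCtx.resolveOffset?.(env.offset); // safe because autoCommit is off
      }
    }, {
      autoCommit: false,        // avoids the offset-conflict debug warning above
      retry: { maxRetries: 3 },
      retryTopics: true
    });
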
@@ -1041,14 +1208,15 @@ var KafkaClient = class {
  */
  async getConsumerLag(groupId) {
  const gid = groupId ?? this.defaultGroupId;
- if (!this.isAdminConnected) {
- await this.admin.connect();
- this.isAdminConnected = true;
- }
+ await this.ensureAdminConnected();
  const committedByTopic = await this.admin.fetchOffsets({ groupId: gid });
+ const brokerOffsetsAll = await Promise.all(
+ committedByTopic.map(({ topic: topic2 }) => this.admin.fetchTopicOffsets(topic2))
+ );
  const result = [];
- for (const { topic: topic2, partitions } of committedByTopic) {
- const brokerOffsets = await this.admin.fetchTopicOffsets(topic2);
+ for (let i = 0; i < committedByTopic.length; i++) {
+ const { topic: topic2, partitions } = committedByTopic[i];
+ const brokerOffsets = brokerOffsetsAll[i];
  for (const { partition, offset } of partitions) {
  const broker = brokerOffsets.find((o) => o.partition === partition);
  if (!broker) continue;
@@ -1063,10 +1231,7 @@ var KafkaClient = class {
  /** Check broker connectivity. Never throws — returns a discriminated union. */
  async checkStatus() {
  try {
- if (!this.isAdminConnected) {
- await this.admin.connect();
- this.isAdminConnected = true;
- }
+ await this.ensureAdminConnected();
  const topics = await this.admin.listTopics();
  return { status: "up", clientId: this.clientId, topics };
  } catch (error) {
@@ -1081,12 +1246,17 @@ var KafkaClient = class {
  return this.clientId;
  }
  /** Gracefully disconnect producer, all consumers, and admin. */
- async disconnect() {
+ async disconnect(drainTimeoutMs = 3e4) {
+ await this.waitForDrain(drainTimeoutMs);
  const tasks = [this.producer.disconnect()];
  if (this.txProducer) {
  tasks.push(this.txProducer.disconnect());
  this.txProducer = void 0;
  }
+ for (const p of this.retryTxProducers) {
+ tasks.push(p.disconnect());
+ }
+ this.retryTxProducers.clear();
  for (const consumer of this.consumers.values()) {
  tasks.push(consumer.disconnect());
  }
@@ -1101,9 +1271,59 @@ var KafkaClient = class {
  this.companionGroupIds.clear();
  this.logger.log("All connections closed");
  }
+ // ── Graceful shutdown ────────────────────────────────────────────
+ /**
+ * Register SIGTERM / SIGINT handlers that drain in-flight messages before
+ * disconnecting. Call this once after constructing the client in non-NestJS apps.
+ * NestJS apps get drain for free via `onModuleDestroy` → `disconnect()`.
+ */
+ enableGracefulShutdown(signals = ["SIGTERM", "SIGINT"], drainTimeoutMs = 3e4) {
+ const handler = () => {
+ this.logger.log(
+ "Shutdown signal received \u2014 draining in-flight handlers..."
+ );
+ this.disconnect(drainTimeoutMs).catch(
+ (err) => this.logger.error(
+ "Error during graceful shutdown:",
+ toError(err).message
+ )
+ );
+ };
+ for (const signal of signals) {
+ process.once(signal, handler);
+ }
+ }
+ trackInFlight(fn) {
+ this.inFlightTotal++;
+ return fn().finally(() => {
+ this.inFlightTotal--;
+ if (this.inFlightTotal === 0) {
+ this.drainResolvers.splice(0).forEach((r) => r());
+ }
+ });
+ }
+ waitForDrain(timeoutMs) {
+ if (this.inFlightTotal === 0) return Promise.resolve();
+ return new Promise((resolve) => {
+ let handle;
+ const onDrain = () => {
+ clearTimeout(handle);
+ resolve();
+ };
+ this.drainResolvers.push(onDrain);
+ handle = setTimeout(() => {
+ const idx = this.drainResolvers.indexOf(onDrain);
+ if (idx !== -1) this.drainResolvers.splice(idx, 1);
+ this.logger.warn(
+ `Drain timed out after ${timeoutMs}ms \u2014 ${this.inFlightTotal} handler(s) still in flight`
+ );
+ resolve();
+ }, timeoutMs);
+ });
+ }
  // ── Private helpers ──────────────────────────────────────────────
  async preparePayload(topicOrDesc, messages) {
- registerSchema(topicOrDesc, this.schemaRegistry);
+ registerSchema(topicOrDesc, this.schemaRegistry, this.logger);
  const payload = await buildSendPayload(
  topicOrDesc,
  messages,
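
The hunk above also introduces the drain machinery: `disconnect(drainTimeoutMs)` waits (default 30 000 ms) for handlers tracked by `trackInFlight` before tearing connections down, and `enableGracefulShutdown()` hooks that into SIGTERM/SIGINT. Minimal usage sketch for a plain Node service, using the constructor shape shown earlier in this diff:

    const client = new KafkaClient("orders-svc", "orders-group", ["localhost:9092"]);
    // Drain in-flight handlers for up to 10s on SIGTERM/SIGINT, then disconnect.
    client.enableGracefulShutdown(["SIGTERM", "SIGINT"], 10_000);

    // Or drain explicitly from your own shutdown hook:
    await client.disconnect(10_000); // resolves immediately if nothing is in flight
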
@@ -1136,12 +1356,78 @@ var KafkaClient = class {
  }, timeoutMs);
  return promise;
  }
- async ensureTopic(topic2) {
- if (!this.autoCreateTopicsEnabled || this.ensuredTopics.has(topic2)) return;
- if (!this.isAdminConnected) {
+ /**
+ * When `retryTopics: true` and `autoCreateTopics: false`, verify that every
+ * `<topic>.retry.<level>` topic already exists. Throws a clear error at startup
+ * rather than silently discovering missing topics on the first handler failure.
+ */
+ async validateRetryTopicsExist(topicNames, maxRetries) {
+ await this.ensureAdminConnected();
+ const existing = new Set(await this.admin.listTopics());
+ const missing = [];
+ for (const t of topicNames) {
+ for (let level = 1; level <= maxRetries; level++) {
+ const retryTopic = `${t}.retry.${level}`;
+ if (!existing.has(retryTopic)) missing.push(retryTopic);
+ }
+ }
+ if (missing.length > 0) {
+ throw new Error(
+ `retryTopics: true but the following retry topics do not exist: ${missing.join(", ")}. Create them manually or set autoCreateTopics: true.`
+ );
+ }
+ }
+ /**
+ * When `autoCreateTopics` is disabled, verify that `<topic>.dlq` exists for every
+ * consumed topic. Throws a clear error at startup rather than silently discovering
+ * missing DLQ topics on the first handler failure.
+ */
+ async validateDlqTopicsExist(topicNames) {
+ await this.ensureAdminConnected();
+ const existing = new Set(await this.admin.listTopics());
+ const missing = topicNames.filter((t) => !existing.has(`${t}.dlq`)).map((t) => `${t}.dlq`);
+ if (missing.length > 0) {
+ throw new Error(
+ `dlq: true but the following DLQ topics do not exist: ${missing.join(", ")}. Create them manually or set autoCreateTopics: true.`
+ );
+ }
+ }
+ /**
+ * Connect the admin client if not already connected.
+ * The flag is only set to `true` after a successful connect — if `admin.connect()`
+ * throws the flag remains `false` so the next call will retry the connection.
+ */
+ async ensureAdminConnected() {
+ if (this.isAdminConnected) return;
+ try {
  await this.admin.connect();
  this.isAdminConnected = true;
+ } catch (err) {
+ this.isAdminConnected = false;
+ throw err;
  }
+ }
+ /**
+ * Create and connect a transactional producer for EOS retry routing.
+ * Each retry level consumer gets its own producer with a unique `transactionalId`
+ * so Kafka can fence stale producers on restart without affecting other levels.
+ */
+ async createRetryTxProducer(transactionalId) {
+ const p = this.kafka.producer({
+ kafkaJS: {
+ acks: -1,
+ idempotent: true,
+ transactionalId,
+ maxInFlightRequests: 1
+ }
+ });
+ await p.connect();
+ this.retryTxProducers.add(p);
+ return p;
+ }
+ async ensureTopic(topic2) {
+ if (!this.autoCreateTopicsEnabled || this.ensuredTopics.has(topic2)) return;
+ await this.ensureAdminConnected();
  await this.admin.createTopics({
  topics: [{ topic: topic2, numPartitions: this.numPartitions }]
  });
@@ -1174,7 +1460,8 @@ var KafkaClient = class {
  const schemaMap = buildSchemaMap(
  topics,
  this.schemaRegistry,
- optionSchemas
+ optionSchemas,
+ this.logger
  );
  const topicNames = topics.map((t) => resolveTopicName(t));
  for (const t of topicNames) {
@@ -1184,6 +1471,9 @@ var KafkaClient = class {
  for (const t of topicNames) {
  await this.ensureTopic(`${t}.dlq`);
  }
+ if (!this.autoCreateTopicsEnabled) {
+ await this.validateDlqTopicsExist(topicNames);
+ }
  }
  await consumer.connect();
  await subscribeWithRetry(
@@ -1202,7 +1492,8 @@ var KafkaClient = class {
  return {
  schemaRegistry: this.schemaRegistry,
  strictSchemasEnabled: this.strictSchemasEnabled,
- instrumentation: this.instrumentation
+ instrumentation: this.instrumentation,
+ logger: this.logger
  };
  }
  get consumerOpsDeps() {
@@ -1230,7 +1521,8 @@ var KafkaClient = class {
  onMessageLost: this.onMessageLost,
  ensureTopic: (t) => this.ensureTopic(t),
  getOrCreateConsumer: (gid, fb, ac) => getOrCreateConsumer(gid, fb, ac, this.consumerOpsDeps),
- runningConsumers: this.runningConsumers
+ runningConsumers: this.runningConsumers,
+ createRetryTxProducer: (txId) => this.createRetryTxProducer(txId)
  };
  }
  };