@eventferry/kafka 3.4.0 → 3.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -7,6 +7,7 @@ function classifyKafkajsError(err) {
7
7
  if (e.name === "KafkaJSNonRetriableError") return "fatal";
8
8
  const type = typeof e.type === "string" ? e.type : void 0;
9
9
  if (type) {
10
+ if (FENCED_TYPES.has(type)) return "fenced";
10
11
  if (RETRIABLE_TYPES.has(type)) return "retriable";
11
12
  if (POISON_TYPES.has(type)) return "poison";
12
13
  if (FATAL_TYPES.has(type)) return "fatal";
@@ -40,9 +41,11 @@ var POISON_TYPES = /* @__PURE__ */ new Set([
40
41
  "INVALID_REQUIRED_ACKS",
41
42
  "INVALID_PARTITIONS"
42
43
  ]);
43
- var FATAL_TYPES = /* @__PURE__ */ new Set([
44
+ var FENCED_TYPES = /* @__PURE__ */ new Set([
44
45
  "INVALID_PRODUCER_EPOCH",
45
- "PRODUCER_FENCED",
46
+ "PRODUCER_FENCED"
47
+ ]);
48
+ var FATAL_TYPES = /* @__PURE__ */ new Set([
46
49
  "TOPIC_AUTHORIZATION_FAILED",
47
50
  "CLUSTER_AUTHORIZATION_FAILED",
48
51
  "TRANSACTIONAL_ID_AUTHORIZATION_FAILED",
@@ -73,8 +76,8 @@ var CODE_TO_KIND = /* @__PURE__ */ new Map([
73
76
  // TOPIC_AUTHORIZATION_FAILED
74
77
  [31, "fatal"],
75
78
  // CLUSTER_AUTHORIZATION_FAILED
76
- [47, "fatal"],
77
- // INVALID_PRODUCER_EPOCH
79
+ [47, "fenced"],
80
+ // INVALID_PRODUCER_EPOCH — retryable once via publisher reconnect
78
81
  [58, "fatal"],
79
82
  // SASL_AUTHENTICATION_FAILED
80
83
  [74, "retriable"],
@@ -107,7 +110,10 @@ var UNSUPPORTED_BY_KAFKAJS = [
107
110
  "maxRequestSize",
108
111
  // Confluent-only escape hatches; ignored on kafkajs.
109
112
  "compressionLevel",
110
- "rawProducerConfig"
113
+ "rawProducerConfig",
114
+ // librdkafka stats — kafkajs has no equivalent surface.
115
+ "onStats",
116
+ "statsIntervalMs"
111
117
  ];
112
118
  var KafkaJsDriver = class {
113
119
  transactional;
@@ -393,8 +399,8 @@ var CODE_TO_KIND2 = /* @__PURE__ */ new Map([
393
399
  // ERR__TRANSPORT
394
400
  [-198, "poison"],
395
401
  // ERR__BAD_COMPRESSION
396
- [-144, "fatal"],
397
- // ERR__FENCED — producer fenced by another with same txn id
402
+ [-144, "fenced"],
403
+ // ERR__FENCED — producer fenced; publisher reconnect attempts a transparent recovery once
398
404
  [-150, "fatal"],
399
405
  // ERR__FATAL — unrecoverable librdkafka error
400
406
  [-169, "fatal"],
@@ -426,8 +432,8 @@ var CODE_TO_KIND2 = /* @__PURE__ */ new Map([
426
432
  // TOPIC_AUTHORIZATION_FAILED
427
433
  [31, "fatal"],
428
434
  // CLUSTER_AUTHORIZATION_FAILED
429
- [47, "fatal"],
430
- // INVALID_PRODUCER_EPOCH
435
+ [47, "fenced"],
436
+ // INVALID_PRODUCER_EPOCH — retryable once via publisher reconnect
431
437
  [58, "fatal"],
432
438
  // SASL_AUTHENTICATION_FAILED
433
439
  [74, "retriable"],
@@ -441,7 +447,7 @@ var CODE_TO_KIND2 = /* @__PURE__ */ new Map([
441
447
  ]);
442
448
  var NAME_TO_KIND = /* @__PURE__ */ new Map([
443
449
  ["ERR__QUEUE_FULL", "backpressure"],
444
- ["ERR__FENCED", "fatal"],
450
+ ["ERR__FENCED", "fenced"],
445
451
  ["ERR__FATAL", "fatal"],
446
452
  ["ERR__AUTHENTICATION", "fatal"],
447
453
  ["ERR__SSL", "fatal"],
@@ -450,7 +456,7 @@ var NAME_TO_KIND = /* @__PURE__ */ new Map([
450
456
  ["ERR__BAD_COMPRESSION", "poison"],
451
457
  ["ERR_TOPIC_AUTHORIZATION_FAILED", "fatal"],
452
458
  ["ERR_CLUSTER_AUTHORIZATION_FAILED", "fatal"],
453
- ["ERR_INVALID_PRODUCER_EPOCH", "fatal"],
459
+ ["ERR_INVALID_PRODUCER_EPOCH", "fenced"],
454
460
  ["ERR_SASL_AUTHENTICATION_FAILED", "fatal"],
455
461
  ["ERR_CORRUPT_MESSAGE", "poison"],
456
462
  ["ERR_MSG_SIZE_TOO_LARGE", "poison"],
@@ -486,6 +492,12 @@ function buildConfluentClientConfig(opts) {
486
492
  if (opts.compressionLevel !== void 0) {
487
493
  librdkafka["compression.level"] = opts.compressionLevel;
488
494
  }
495
+ if (opts.onStats) {
496
+ librdkafka["stats_cb"] = wrapStatsCallback(opts.onStats);
497
+ librdkafka["statistics.interval.ms"] = opts.statsIntervalMs ?? 3e4;
498
+ } else if (opts.statsIntervalMs !== void 0) {
499
+ librdkafka["statistics.interval.ms"] = opts.statsIntervalMs;
500
+ }
489
501
  const tlsRequested = opts.ssl === true || isTlsConfig(opts.ssl);
490
502
  const saslRequested = !!opts.sasl;
491
503
  if (saslRequested && tlsRequested) {
@@ -523,6 +535,20 @@ function buildConfluentClientConfig(opts) {
/**
 * Tell whether `v` is an object-shaped value (any non-null value whose
 * `typeof` is "object").
 *
 * @param {unknown} v - Candidate value.
 * @returns {boolean} `true` for non-null objects, `false` for everything else.
 */
function isTlsConfig(v) {
  // `typeof null` is "object", so rule null out before the typeof test.
  if (v === null) {
    return false;
  }
  return typeof v === "object";
}
/**
 * Adapt a user-supplied `onStats` handler into the callback that is
 * installed as `stats_cb`.
 *
 * The returned callback accepts either a JSON string (parsed before
 * delivery) or an already-decoded value (delivered unchanged). It never
 * throws and never leaves a rejected promise unhandled: a malformed
 * payload is dropped, and a failing handler — synchronous or async — is
 * contained so a buggy stats consumer cannot disturb whoever invokes the
 * callback.
 *
 * @param {(stats: unknown) => unknown} onStats - Handler invoked with the
 *   decoded stats payload. May be synchronous or return a promise.
 * @returns {(raw: unknown) => void} Callback that decodes `raw` and
 *   forwards it to `onStats`.
 */
function wrapStatsCallback(onStats) {
  return (raw) => {
    let parsed;
    try {
      parsed = typeof raw === "string" ? JSON.parse(raw) : raw;
    } catch {
      // Malformed JSON: skip this sample instead of throwing into the caller.
      return;
    }
    try {
      const outcome = onStats(parsed);
      // An async handler reports failure through a rejected promise, which
      // the surrounding try/catch cannot observe. Attach a no-op rejection
      // handler so it does not surface as an unhandled rejection.
      if (typeof outcome?.then === "function") {
        outcome.then(void 0, () => {
        });
      }
    } catch {
      // Handler errors are deliberately contained (best-effort delivery).
    }
  };
}
526
552
  function stringifyPem(input) {
527
553
  if (Array.isArray(input)) {
528
554
  return input.map((x) => typeof x === "string" ? x : x.toString("utf8")).join("\n");
@@ -764,11 +790,17 @@ var KafkaPublisher = class {
764
790
  hooks;
765
791
  tracer;
766
792
  validateTopicsOnConnect;
793
+ autoRecoverFromFence;
794
+ // Serialize reconnects so concurrent publish() calls hitting a fence
795
+ // all observe the same single reconnect attempt — the second publish
796
+ // doesn't try to disconnect a producer the first is still re-initing.
797
+ fenceRecovery = null;
767
798
  constructor(opts) {
768
799
  this.logger = opts.logger;
769
800
  this.hooks = opts.hooks ?? {};
770
801
  this.tracer = opts.tracer ?? new NoopKafkaTracer();
771
802
  this.validateTopicsOnConnect = opts.validateTopicsOnConnect ? Object.freeze([...opts.validateTopicsOnConnect]) : void 0;
803
+ this.autoRecoverFromFence = opts.autoRecoverFromFence ?? false;
772
804
  const onTransactionAbort = this.hooks.onTransactionAbort ? (error) => {
773
805
  void safeHook(
774
806
  this.logger,
@@ -891,6 +923,20 @@ var KafkaPublisher = class {
891
923
  await safeHook(this.logger, "onError", () => this.hooks.onError?.(error));
892
924
  throw err;
893
925
  }
926
+ const firstFenced = results.find(
927
+ (r) => !r.ok && r.errorKind === "fenced"
928
+ );
929
+ if (firstFenced) {
930
+ const fenceErr = firstFenced.error ?? new Error("producer fenced");
931
+ await safeHook(
932
+ this.logger,
933
+ "onProducerFenced",
934
+ () => this.hooks.onProducerFenced?.(fenceErr)
935
+ );
936
+ if (this.autoRecoverFromFence) {
937
+ results = await this.recoverAndRetry(outgoing, results);
938
+ }
939
+ }
894
940
  const byId = new Map(messages.map((m) => [m.recordId, m]));
895
941
  let allOk = true;
896
942
  for (const r of results) {
@@ -941,6 +987,110 @@ var KafkaPublisher = class {
941
987
  get transactional() {
942
988
  return this.driver.transactional;
943
989
  }
990
+ /**
991
+ * Cheap reachability probe. Borrows a fresh admin client, calls
992
+ * `listTopics`, and returns timing + outcome. Useful as the body of a
993
+ * `/healthz` or `/readyz` endpoint — proves the broker is reachable
994
+ * AND that the configured credentials still authenticate against it,
995
+ * without writing a record.
996
+ *
997
+ * Does NOT exercise the producer's send path — a healthy admin
998
+ * connection doesn't guarantee `publish()` will succeed (a fenced
999
+ * transactional producer would still answer healthy here). Treat this
1000
+ * as "broker reachable + auth still good", not "publisher is fully
1001
+ * operational".
1002
+ *
1003
+ * Default timeout 5_000 ms — long enough to ride out a single broker
1004
+ * leader election, short enough to fail a liveness probe meaningfully.
1005
+ * Set `timeoutMs: 0` to disable the timer entirely.
1006
+ *
1007
+ * The driver must implement `admin()` (the built-ins do); custom
1008
+ * drivers without admin get `{ ok: false, error: ... }` instead of
1009
+ * the throw `publisher.admin()` would surface — health checks are
1010
+ * not the place to crash.
1011
+ */
1012
+ async healthCheck(opts = {}) {
1013
+ const timeoutMs = opts.timeoutMs ?? 5e3;
1014
+ const startedAt = Date.now();
1015
+ if (!this.driver.admin) {
1016
+ return {
1017
+ ok: false,
1018
+ latencyMs: 0,
1019
+ timestamp: startedAt,
1020
+ error: new Error(
1021
+ "KafkaPublisher.healthCheck: configured driver does not implement admin()"
1022
+ )
1023
+ };
1024
+ }
1025
+ let admin = null;
1026
+ try {
1027
+ admin = await this.driver.admin();
1028
+ await admin.connect();
1029
+ const probe = admin.listTopics();
1030
+ if (timeoutMs > 0) {
1031
+ await raceWithTimeout(probe, timeoutMs, "healthCheck");
1032
+ } else {
1033
+ await probe;
1034
+ }
1035
+ return {
1036
+ ok: true,
1037
+ latencyMs: Date.now() - startedAt,
1038
+ timestamp: startedAt
1039
+ };
1040
+ } catch (err) {
1041
+ const error = err instanceof Error ? err : new Error(String(err));
1042
+ return {
1043
+ ok: false,
1044
+ latencyMs: Date.now() - startedAt,
1045
+ timestamp: startedAt,
1046
+ error
1047
+ };
1048
+ } finally {
1049
+ try {
1050
+ await admin?.close();
1051
+ } catch {
1052
+ }
1053
+ }
1054
+ }
1055
+ /**
1056
+ * Disconnect + re-connect the driver and re-send the batch ONCE. Used
1057
+ * by the fence-recovery path. Concurrent fence recoveries dedupe on a
1058
+ * shared in-flight promise (`fenceRecovery`) so we don't tear the
1059
+ * producer down while another batch is mid-restart.
1060
+ *
1061
+ * If the second send STILL reports any fenced records, those failures
1062
+ * are returned unchanged — another instance has almost certainly taken
1063
+ * the same `transactionalId` and silently retrying again would mask
1064
+ * the misconfiguration.
1065
+ */
1066
+ async recoverAndRetry(outgoing, firstResults) {
1067
+ if (!this.fenceRecovery) {
1068
+ this.fenceRecovery = (async () => {
1069
+ try {
1070
+ await this.driver.disconnect();
1071
+ await this.driver.connect();
1072
+ } finally {
1073
+ this.fenceRecovery = null;
1074
+ }
1075
+ })();
1076
+ }
1077
+ try {
1078
+ await this.fenceRecovery;
1079
+ } catch (err) {
1080
+ const reconnectErr = err instanceof Error ? err : new Error(String(err));
1081
+ await safeHook(
1082
+ this.logger,
1083
+ "onError",
1084
+ () => this.hooks.onError?.(reconnectErr)
1085
+ );
1086
+ return firstResults;
1087
+ }
1088
+ try {
1089
+ return await this.driver.sendBatch(outgoing);
1090
+ } catch {
1091
+ return firstResults;
1092
+ }
1093
+ }
944
1094
  /**
945
1095
  * Start a span for the batch following the OTel messaging conventions.
946
1096
  *
@@ -959,6 +1109,26 @@ var KafkaPublisher = class {
959
1109
  });
960
1110
  }
961
1111
  };
/**
 * Settle with the outcome of `p`, or reject if `p` has not settled within
 * `ms` milliseconds.
 *
 * `p` may be a promise, a thenable, or a plain value (normalised with
 * `Promise.resolve`). The timer is cleared as soon as either side wins and
 * is `unref`'d where supported, so a pending timeout never keeps the
 * process alive by itself.
 *
 * @param {PromiseLike<T> | T} p - Operation to wait for.
 * @param {number} ms - Timeout in milliseconds.
 * @param {string} label - Name used in the timeout error message.
 * @returns {Promise<T>} Resolves/rejects like `p`, or rejects with
 *   `Error("<label> timed out after <ms>ms")` on timeout.
 * @template T
 */
function raceWithTimeout(p, ms, label) {
  let timer;
  const deadline = new Promise((_resolve, reject) => {
    timer = setTimeout(() => {
      reject(new Error(`${label} timed out after ${ms}ms`));
    }, ms);
    if (typeof timer.unref === "function") {
      timer.unref();
    }
  });
  // `Promise.race` subscribes to both sides, so a late rejection of `p`
  // after the deadline fired is still handled (no unhandled rejection).
  return Promise.race([Promise.resolve(p), deadline]).finally(() => {
    clearTimeout(timer);
  });
}
962
1132
  function selectDriver(opts) {
963
1133
  const kind = opts.driver ?? "kafkajs";
964
1134
  switch (kind) {