@eventferry/kafka 3.4.0 → 3.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -127,7 +127,20 @@ interface KafkaDriver {
127
127
  * TLS using the driver's default trust store).
128
128
  *
129
129
  * `rejectUnauthorized` is intentionally NOT a knob here — TLS verification is
130
- * non-negotiable. Dev clusters with self-signed certs pass their CA via `ca`.
130
+ * non-negotiable. Dev clusters with self-signed certs MUST pass their CA via
131
+ * `ca` (validation still happens, against your CA instead of the system
132
+ * trust store). If the broker is addressed by an IP literal or a hostname
133
+ * that doesn't match the cert SAN, set `servername` to the hostname the
134
+ * cert was issued for so that SNI and certificate verification align.
135
+ *
136
+ * **Driver parity:**
137
+ * - `ca`, `cert`, `key`, `passphrase` work on both kafkajs and confluent.
138
+ * - `servername` is honored by **kafkajs** (Node `tls.connect` reads
139
+ * `servername` directly). On the **confluent** driver it's a documented
140
+ * no-op — librdkafka derives SNI from the broker address and v1.x's
141
+ * kafkaJS-compat layer does not surface an override. Use the kafkajs
142
+ * driver for clusters where you need the SNI lever, or wait for
143
+ * librdkafka to expose it.
131
144
  */
132
145
  interface TlsConfig {
133
146
  /** PEM-encoded CA bundle. Buffers and strings both accepted. */
@@ -138,7 +151,13 @@ interface TlsConfig {
138
151
  key?: string | Buffer;
139
152
  /** Passphrase for an encrypted private key. */
140
153
  passphrase?: string;
141
- /** SNI host. Useful when broker address doesn't match the cert SAN. */
154
+ /**
155
+ * SNI host. Set this when the broker address (e.g. an IP literal or an
156
+ * internal DNS name) does NOT match the certificate's Subject
157
+ * Alternative Names. Honored on the kafkajs driver; no-op on the
158
+ * confluent driver (neither librdkafka nor the v1.x kafkaJS-compat
159
+ * layer exposes an SNI override).
160
+ */
142
161
  servername?: string;
143
162
  }
144
163
  /**
@@ -337,7 +356,40 @@ interface ProducerBehaviorConfig {
337
356
  * extension point, not a JS callback.
338
357
  */
339
358
  customPartitioner?: () => (args: unknown) => number;
359
+ /**
360
+ * (confluent only) Periodic librdkafka statistics callback. When set,
361
+ * eventferry wires `stats_cb` on the underlying producer and parses the
362
+ * JSON payload librdkafka emits every {@link statsIntervalMs} ms.
363
+ *
364
+ * The shape is intentionally opaque — librdkafka's stats schema is huge
365
+ * (txmsgs, rxbytes, queue depth, broker timeouts, per-topic / per-partition
366
+ * counters…) and evolves across versions. Documented at
367
+ * https://github.com/confluentinc/librdkafka/blob/master/STATISTICS.md.
368
+ * Cast to your own narrower type if you're consuming a known subset.
369
+ *
370
+ * No-op on the kafkajs driver — kafkajs has no equivalent surface.
371
+ * Pair with {@link statsIntervalMs} (defaults to 30000 ms when this hook
372
+ * is set but `rawProducerConfig['statistics.interval.ms']` isn't).
373
+ */
374
+ onStats?: (stats: LibrdkafkaStats) => void;
375
+ /**
376
+ * (confluent only) Override the polling interval the librdkafka stats
377
+ * callback fires at. Maps to `statistics.interval.ms`. Defaults to
378
+ * 30000 ms when {@link onStats} is set; defaults to 0 (disabled)
379
+ * otherwise — librdkafka spends CPU on this and we don't want to enable
380
+ * it silently. Set to 0 to suppress emission while keeping the hook
381
+ * defined (useful for tests).
382
+ */
383
+ statsIntervalMs?: number;
340
384
  }
385
+ /**
386
+ * Opaque envelope for librdkafka's stats JSON. The schema is
387
+ * version-specific and large; eventferry surfaces it untyped so you can
388
+ * cast to whatever subset you care about.
389
+ *
390
+ * Reference: https://github.com/confluentinc/librdkafka/blob/master/STATISTICS.md
391
+ */
392
+ type LibrdkafkaStats = Record<string, unknown>;
341
393
  type DriverKind = "kafkajs" | "confluent";
342
394
 
343
395
  interface KjsProducer {
@@ -522,6 +574,18 @@ interface KafkaPublisherHooks {
522
574
  * Useful for observability dashboards that track EOS failure rates.
523
575
  */
524
576
  onTransactionAbort?(error: Error): void | Promise<void>;
577
+ /**
578
+ * Fires when the broker fences this producer — the previous publish
579
+ * batch reported at least one `errorKind: "fenced"` result. Receives
580
+ * the first fenced `Error` so dashboards can attribute the incident.
581
+ *
582
+ * - When `autoRecoverFromFence` is on, this hook fires BEFORE the
583
+ * transparent reconnect attempt. Use it to decrement a transactional
584
+ * producer's leader-election counter or warn the operator.
585
+ * - When `autoRecoverFromFence` is off, the publisher surfaces the
586
+ * fenced result unchanged and the hook is still fired for visibility.
587
+ */
588
+ onProducerFenced?(error: Error): void | Promise<void>;
525
589
  }
526
590
  /**
527
591
  * Invoke a hook safely. Never throws back into the caller — logs the hook's
@@ -643,6 +707,32 @@ interface KafkaPublisherOptions extends KafkaConnectionConfig, ProducerBehaviorC
643
707
  * fire. Driver must implement `admin()` (the built-ins do).
644
708
  */
645
709
  validateTopicsOnConnect?: string[];
710
+ /**
711
+ * Transparently recover from a producer-fence error. When set to `true`,
712
+ * a `publish()` call whose batch comes back with at least one
713
+ * `errorKind: "fenced"` result triggers ONE round of:
714
+ *
715
+ * 1. disconnect the driver
716
+ * 2. connect it again (re-running `initTransactions` for transactional producers)
717
+ * 3. re-send the same batch
718
+ *
719
+ * If the second send still produces a fenced result, the publisher gives
720
+ * up and surfaces the failures unchanged — at that point the fence is
721
+ * almost certainly caused by another instance taking the same
722
+ * `transactionalId`, and silently retrying again would mask the
723
+ * misconfiguration.
724
+ *
725
+ * Default `false` to preserve the previous "fenced → fatal" semantics.
726
+ * Turn it on when running a single producer instance against transient
727
+ * brokers (rolling restarts, network blips) where a fence is usually
728
+ * just a transient epoch mismatch.
729
+ *
730
+ * For MULTI-INSTANCE EOS, leave this OFF and use a callable
731
+ * `transactionalId` derived from per-instance context (pod name, k8s
732
+ * ordinal, AZ + replica index) so each instance has a stable, unique
733
+ * id — fences will then correctly stop the loser instance.
734
+ */
735
+ autoRecoverFromFence?: boolean;
646
736
  }
647
737
  /**
648
738
  * The Publisher the Relay talks to. Wraps a pluggable KafkaDriver and adds
@@ -656,6 +746,8 @@ declare class KafkaPublisher implements Publisher {
656
746
  private readonly hooks;
657
747
  private readonly tracer;
658
748
  private readonly validateTopicsOnConnect;
749
+ private readonly autoRecoverFromFence;
750
+ private fenceRecovery;
659
751
  constructor(opts: KafkaPublisherOptions);
660
752
  connect(): Promise<void>;
661
753
  /**
@@ -697,6 +789,43 @@ declare class KafkaPublisher implements Publisher {
697
789
  publishToDlq(message: PublishableMessage, error: Error): Promise<void>;
698
790
  /** Whether the configured driver provides atomic (EOS) batch sends. */
699
791
  get transactional(): boolean;
792
+ /**
793
+ * Cheap reachability probe. Borrows a fresh admin client, calls
794
+ * `listTopics`, and returns timing + outcome. Useful as the body of a
795
+ * `/healthz` or `/readyz` endpoint — proves the broker is reachable
796
+ * AND that the configured credentials still authenticate against it,
797
+ * without writing a record.
798
+ *
799
+ * Does NOT exercise the producer's send path — a healthy admin
800
+ * connection doesn't guarantee `publish()` will succeed (a fenced
801
+ * transactional producer would still answer healthy here). Treat this
802
+ * as "broker reachable + auth still good", not "publisher is fully
803
+ * operational".
804
+ *
805
+ * Default timeout 5_000 ms — long enough to ride out a single broker
806
+ * leader election, short enough to fail a liveness probe meaningfully.
807
+ * Set `timeoutMs: 0` to disable the timer entirely.
808
+ *
809
+ * The driver must implement `admin()` (the built-ins do); custom
810
+ * drivers without admin get `{ ok: false, error: ... }` instead of
811
+ * the error `publisher.admin()` would throw — health checks are
812
+ * not the place to crash.
813
+ */
814
+ healthCheck(opts?: {
815
+ timeoutMs?: number;
816
+ }): Promise<HealthStatus>;
817
+ /**
818
+ * Disconnect + re-connect the driver and re-send the batch ONCE. Used
819
+ * by the fence-recovery path. Concurrent fence recoveries dedupe on a
820
+ * shared in-flight promise (`fenceRecovery`) so we don't tear the
821
+ * producer down while another batch is mid-restart.
822
+ *
823
+ * If the second send STILL reports any fenced records, those failures
824
+ * are returned unchanged — another instance has almost certainly taken
825
+ * the same `transactionalId` and silently retrying again would mask
826
+ * the misconfiguration.
827
+ */
828
+ private recoverAndRetry;
700
829
  /**
701
830
  * Start a span for the batch following the OTel messaging conventions.
702
831
  *
@@ -707,5 +836,20 @@ declare class KafkaPublisher implements Publisher {
707
836
  */
708
837
  private startBatchSpan;
709
838
  }
839
+ /**
840
+ * Outcome of a {@link KafkaPublisher.healthCheck} call. Shape is stable
841
+ * and small so consumers (HTTP /healthz, k8s probes, Datadog) can
842
+ * marshal it without a translation layer.
843
+ */
844
+ interface HealthStatus {
845
+ /** True when the broker answered within the timeout window. */
846
+ ok: boolean;
847
+ /** Wall-clock milliseconds spent on the probe (admin connect + listTopics). */
848
+ latencyMs: number;
849
+ /** Epoch ms when the probe started — handy for log correlation. */
850
+ timestamp: number;
851
+ /** Present only when `ok === false`. The classified error, untouched. */
852
+ error?: Error;
853
+ }
710
854
 
711
- export { type ConfluentClientConfig, ConfluentDriver, type ConfluentDriverOptions, type DriverKind, type KafkaAdmin, type KafkaConnectionConfig, type KafkaDriver, type KafkaDriverAdmin, KafkaJsDriver, type KafkaJsDriverOptions, type KafkaJsPartitionerChoice, KafkaPublisher, type KafkaPublisherHooks, type KafkaPublisherOptions, type KafkaTracer, NoopKafkaTracer, type OauthBearerToken, type PartitionGrowSpec, type PartitionMetadata, type ProducerBehaviorConfig, type SaslConfig, type SaslOauthbearerConfig, type SaslPasswordConfig, type SpanAttributeValue, type SpanLike, type TlsConfig, type TopicCreateSpec, type TopicMetadata, _resetKafkajsWarnDedup, buildConfluentClientConfig, classifyConfluentError, classifyKafkajsError, safeHook };
855
+ export { type ConfluentClientConfig, ConfluentDriver, type ConfluentDriverOptions, type DriverKind, type HealthStatus, type KafkaAdmin, type KafkaConnectionConfig, type KafkaDriver, type KafkaDriverAdmin, KafkaJsDriver, type KafkaJsDriverOptions, type KafkaJsPartitionerChoice, KafkaPublisher, type KafkaPublisherHooks, type KafkaPublisherOptions, type KafkaTracer, type LibrdkafkaStats, NoopKafkaTracer, type OauthBearerToken, type PartitionGrowSpec, type PartitionMetadata, type ProducerBehaviorConfig, type SaslConfig, type SaslOauthbearerConfig, type SaslPasswordConfig, type SpanAttributeValue, type SpanLike, type TlsConfig, type TopicCreateSpec, type TopicMetadata, _resetKafkajsWarnDedup, buildConfluentClientConfig, classifyConfluentError, classifyKafkajsError, safeHook };
package/dist/index.d.ts CHANGED
@@ -127,7 +127,20 @@ interface KafkaDriver {
127
127
  * TLS using the driver's default trust store).
128
128
  *
129
129
  * `rejectUnauthorized` is intentionally NOT a knob here — TLS verification is
130
- * non-negotiable. Dev clusters with self-signed certs pass their CA via `ca`.
130
+ * non-negotiable. Dev clusters with self-signed certs MUST pass their CA via
131
+ * `ca` (validation still happens, against your CA instead of the system
132
+ * trust store). If the broker is addressed by an IP literal or a hostname
133
+ * that doesn't match the cert SAN, set `servername` to the hostname the
134
+ * cert was issued for so that SNI and certificate verification align.
135
+ *
136
+ * **Driver parity:**
137
+ * - `ca`, `cert`, `key`, `passphrase` work on both kafkajs and confluent.
138
+ * - `servername` is honored by **kafkajs** (Node `tls.connect` reads
139
+ * `servername` directly). On the **confluent** driver it's a documented
140
+ * no-op — librdkafka derives SNI from the broker address and v1.x's
141
+ * kafkaJS-compat layer does not surface an override. Use the kafkajs
142
+ * driver for clusters where you need the SNI lever, or wait for
143
+ * librdkafka to expose it.
131
144
  */
132
145
  interface TlsConfig {
133
146
  /** PEM-encoded CA bundle. Buffers and strings both accepted. */
@@ -138,7 +151,13 @@ interface TlsConfig {
138
151
  key?: string | Buffer;
139
152
  /** Passphrase for an encrypted private key. */
140
153
  passphrase?: string;
141
- /** SNI host. Useful when broker address doesn't match the cert SAN. */
154
+ /**
155
+ * SNI host. Set this when the broker address (e.g. an IP literal or an
156
+ * internal DNS name) does NOT match the certificate's Subject
157
+ * Alternative Names. Honored on the kafkajs driver; no-op on the
158
+ * confluent driver (neither librdkafka nor the v1.x kafkaJS-compat
159
+ * layer exposes an SNI override).
160
+ */
142
161
  servername?: string;
143
162
  }
144
163
  /**
@@ -337,7 +356,40 @@ interface ProducerBehaviorConfig {
337
356
  * extension point, not a JS callback.
338
357
  */
339
358
  customPartitioner?: () => (args: unknown) => number;
359
+ /**
360
+ * (confluent only) Periodic librdkafka statistics callback. When set,
361
+ * eventferry wires `stats_cb` on the underlying producer and parses the
362
+ * JSON payload librdkafka emits every {@link statsIntervalMs} ms.
363
+ *
364
+ * The shape is intentionally opaque — librdkafka's stats schema is huge
365
+ * (txmsgs, rxbytes, queue depth, broker timeouts, per-topic / per-partition
366
+ * counters…) and evolves across versions. Documented at
367
+ * https://github.com/confluentinc/librdkafka/blob/master/STATISTICS.md.
368
+ * Cast to your own narrower type if you're consuming a known subset.
369
+ *
370
+ * No-op on the kafkajs driver — kafkajs has no equivalent surface.
371
+ * Pair with {@link statsIntervalMs} (defaults to 30000 ms when this hook
372
+ * is set but `rawProducerConfig['statistics.interval.ms']` isn't).
373
+ */
374
+ onStats?: (stats: LibrdkafkaStats) => void;
375
+ /**
376
+ * (confluent only) Override the polling interval the librdkafka stats
377
+ * callback fires at. Maps to `statistics.interval.ms`. Defaults to
378
+ * 30000 ms when {@link onStats} is set; defaults to 0 (disabled)
379
+ * otherwise — librdkafka spends CPU on this and we don't want to enable
380
+ * it silently. Set to 0 to suppress emission while keeping the hook
381
+ * defined (useful for tests).
382
+ */
383
+ statsIntervalMs?: number;
340
384
  }
385
+ /**
386
+ * Opaque envelope for librdkafka's stats JSON. The schema is
387
+ * version-specific and large; eventferry surfaces it untyped so you can
388
+ * cast to whatever subset you care about.
389
+ *
390
+ * Reference: https://github.com/confluentinc/librdkafka/blob/master/STATISTICS.md
391
+ */
392
+ type LibrdkafkaStats = Record<string, unknown>;
341
393
  type DriverKind = "kafkajs" | "confluent";
342
394
 
343
395
  interface KjsProducer {
@@ -522,6 +574,18 @@ interface KafkaPublisherHooks {
522
574
  * Useful for observability dashboards that track EOS failure rates.
523
575
  */
524
576
  onTransactionAbort?(error: Error): void | Promise<void>;
577
+ /**
578
+ * Fires when the broker fences this producer — the previous publish
579
+ * batch reported at least one `errorKind: "fenced"` result. Receives
580
+ * the first fenced `Error` so dashboards can attribute the incident.
581
+ *
582
+ * - When `autoRecoverFromFence` is on, this hook fires BEFORE the
583
+ * transparent reconnect attempt. Use it to decrement a transactional
584
+ * producer's leader-election counter or warn the operator.
585
+ * - When `autoRecoverFromFence` is off, the publisher surfaces the
586
+ * fenced result unchanged and the hook is still fired for visibility.
587
+ */
588
+ onProducerFenced?(error: Error): void | Promise<void>;
525
589
  }
526
590
  /**
527
591
  * Invoke a hook safely. Never throws back into the caller — logs the hook's
@@ -643,6 +707,32 @@ interface KafkaPublisherOptions extends KafkaConnectionConfig, ProducerBehaviorC
643
707
  * fire. Driver must implement `admin()` (the built-ins do).
644
708
  */
645
709
  validateTopicsOnConnect?: string[];
710
+ /**
711
+ * Transparently recover from a producer-fence error. When set to `true`,
712
+ * a `publish()` call whose batch comes back with at least one
713
+ * `errorKind: "fenced"` result triggers ONE round of:
714
+ *
715
+ * 1. disconnect the driver
716
+ * 2. connect it again (re-running `initTransactions` for transactional producers)
717
+ * 3. re-send the same batch
718
+ *
719
+ * If the second send still produces a fenced result, the publisher gives
720
+ * up and surfaces the failures unchanged — at that point the fence is
721
+ * almost certainly caused by another instance taking the same
722
+ * `transactionalId`, and silently retrying again would mask the
723
+ * misconfiguration.
724
+ *
725
+ * Default `false` to preserve the previous "fenced → fatal" semantics.
726
+ * Turn it on when running a single producer instance against transient
727
+ * brokers (rolling restarts, network blips) where a fence is usually
728
+ * just a transient epoch mismatch.
729
+ *
730
+ * For MULTI-INSTANCE EOS, leave this OFF and use a callable
731
+ * `transactionalId` derived from per-instance context (pod name, k8s
732
+ * ordinal, AZ + replica index) so each instance has a stable, unique
733
+ * id — fences will then correctly stop the loser instance.
734
+ */
735
+ autoRecoverFromFence?: boolean;
646
736
  }
647
737
  /**
648
738
  * The Publisher the Relay talks to. Wraps a pluggable KafkaDriver and adds
@@ -656,6 +746,8 @@ declare class KafkaPublisher implements Publisher {
656
746
  private readonly hooks;
657
747
  private readonly tracer;
658
748
  private readonly validateTopicsOnConnect;
749
+ private readonly autoRecoverFromFence;
750
+ private fenceRecovery;
659
751
  constructor(opts: KafkaPublisherOptions);
660
752
  connect(): Promise<void>;
661
753
  /**
@@ -697,6 +789,43 @@ declare class KafkaPublisher implements Publisher {
697
789
  publishToDlq(message: PublishableMessage, error: Error): Promise<void>;
698
790
  /** Whether the configured driver provides atomic (EOS) batch sends. */
699
791
  get transactional(): boolean;
792
+ /**
793
+ * Cheap reachability probe. Borrows a fresh admin client, calls
794
+ * `listTopics`, and returns timing + outcome. Useful as the body of a
795
+ * `/healthz` or `/readyz` endpoint — proves the broker is reachable
796
+ * AND that the configured credentials still authenticate against it,
797
+ * without writing a record.
798
+ *
799
+ * Does NOT exercise the producer's send path — a healthy admin
800
+ * connection doesn't guarantee `publish()` will succeed (a fenced
801
+ * transactional producer would still answer healthy here). Treat this
802
+ * as "broker reachable + auth still good", not "publisher is fully
803
+ * operational".
804
+ *
805
+ * Default timeout 5_000 ms — long enough to ride out a single broker
806
+ * leader election, short enough to fail a liveness probe meaningfully.
807
+ * Set `timeoutMs: 0` to disable the timer entirely.
808
+ *
809
+ * The driver must implement `admin()` (the built-ins do); custom
810
+ * drivers without admin get `{ ok: false, error: ... }` instead of
811
+ * the error `publisher.admin()` would throw — health checks are
812
+ * not the place to crash.
813
+ */
814
+ healthCheck(opts?: {
815
+ timeoutMs?: number;
816
+ }): Promise<HealthStatus>;
817
+ /**
818
+ * Disconnect + re-connect the driver and re-send the batch ONCE. Used
819
+ * by the fence-recovery path. Concurrent fence recoveries dedupe on a
820
+ * shared in-flight promise (`fenceRecovery`) so we don't tear the
821
+ * producer down while another batch is mid-restart.
822
+ *
823
+ * If the second send STILL reports any fenced records, those failures
824
+ * are returned unchanged — another instance has almost certainly taken
825
+ * the same `transactionalId` and silently retrying again would mask
826
+ * the misconfiguration.
827
+ */
828
+ private recoverAndRetry;
700
829
  /**
701
830
  * Start a span for the batch following the OTel messaging conventions.
702
831
  *
@@ -707,5 +836,20 @@ declare class KafkaPublisher implements Publisher {
707
836
  */
708
837
  private startBatchSpan;
709
838
  }
839
+ /**
840
+ * Outcome of a {@link KafkaPublisher.healthCheck} call. Shape is stable
841
+ * and small so consumers (HTTP /healthz, k8s probes, Datadog) can
842
+ * marshal it without a translation layer.
843
+ */
844
+ interface HealthStatus {
845
+ /** True when the broker answered within the timeout window. */
846
+ ok: boolean;
847
+ /** Wall-clock milliseconds spent on the probe (admin connect + listTopics). */
848
+ latencyMs: number;
849
+ /** Epoch ms when the probe started — handy for log correlation. */
850
+ timestamp: number;
851
+ /** Present only when `ok === false`. The classified error, untouched. */
852
+ error?: Error;
853
+ }
710
854
 
711
- export { type ConfluentClientConfig, ConfluentDriver, type ConfluentDriverOptions, type DriverKind, type KafkaAdmin, type KafkaConnectionConfig, type KafkaDriver, type KafkaDriverAdmin, KafkaJsDriver, type KafkaJsDriverOptions, type KafkaJsPartitionerChoice, KafkaPublisher, type KafkaPublisherHooks, type KafkaPublisherOptions, type KafkaTracer, NoopKafkaTracer, type OauthBearerToken, type PartitionGrowSpec, type PartitionMetadata, type ProducerBehaviorConfig, type SaslConfig, type SaslOauthbearerConfig, type SaslPasswordConfig, type SpanAttributeValue, type SpanLike, type TlsConfig, type TopicCreateSpec, type TopicMetadata, _resetKafkajsWarnDedup, buildConfluentClientConfig, classifyConfluentError, classifyKafkajsError, safeHook };
855
+ export { type ConfluentClientConfig, ConfluentDriver, type ConfluentDriverOptions, type DriverKind, type HealthStatus, type KafkaAdmin, type KafkaConnectionConfig, type KafkaDriver, type KafkaDriverAdmin, KafkaJsDriver, type KafkaJsDriverOptions, type KafkaJsPartitionerChoice, KafkaPublisher, type KafkaPublisherHooks, type KafkaPublisherOptions, type KafkaTracer, type LibrdkafkaStats, NoopKafkaTracer, type OauthBearerToken, type PartitionGrowSpec, type PartitionMetadata, type ProducerBehaviorConfig, type SaslConfig, type SaslOauthbearerConfig, type SaslPasswordConfig, type SpanAttributeValue, type SpanLike, type TlsConfig, type TopicCreateSpec, type TopicMetadata, _resetKafkajsWarnDedup, buildConfluentClientConfig, classifyConfluentError, classifyKafkajsError, safeHook };