@databricks/zerobus-ingest-sdk 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "zerobus-sdk-ts"
3
- version = "0.1.0"
3
+ version = "0.2.0"
4
4
  authors = ["Databricks"]
5
5
  edition = "2021"
6
6
  license-file = "LICENSE"
@@ -15,8 +15,8 @@ crate-type = ["cdylib"]
15
15
  napi = { version = "2", features = ["async", "tokio_rt", "serde-json"] }
16
16
  napi-derive = "2"
17
17
 
18
- # The Rust SDK we're wrapping (using local path for development)
19
- databricks-zerobus-ingest-sdk = { git = "https://github.com/databricks/zerobus-sdk-rs", tag = "v0.2.0" }
18
+ # The Rust SDK we're wrapping
19
+ databricks-zerobus-ingest-sdk = { git = "https://github.com/databricks/zerobus-sdk-rs", tag = "v0.4.0" }
20
20
 
21
21
  # Async runtime (same as the Rust SDK)
22
22
  tokio = { version = "1.42", features = ["macros", "rt-multi-thread"] }
@@ -38,9 +38,24 @@ base64 = "0.21"
38
38
  # Async trait support
39
39
  async-trait = "0.1"
40
40
 
41
+ # Arrow dependencies (only used when arrow-flight feature is enabled)
42
+ arrow-array = { version = "56.2.0", optional = true }
43
+ arrow-schema = { version = "56.2.0", optional = true }
44
+ arrow-ipc = { version = "56.2.0", features = ["lz4", "zstd"], optional = true }
45
+
41
46
  [build-dependencies]
42
47
  napi-build = "2"
43
48
 
49
+ [features]
50
+ default = []
51
+ # Arrow Flight is experimental/unsupported - enable with: npm run build:arrow
52
+ arrow-flight = [
53
+ "databricks-zerobus-ingest-sdk/arrow-flight",
54
+ "dep:arrow-array",
55
+ "dep:arrow-schema",
56
+ "dep:arrow-ipc"
57
+ ]
58
+
44
59
  [profile.release]
45
60
  lto = true
46
61
  strip = true
package/README.md CHANGED
@@ -288,35 +288,24 @@ const stream = await sdk.createStream(
288
288
  );
289
289
 
290
290
  try {
291
- let lastAckPromise;
291
+ let lastOffset: bigint = 0n;
292
292
 
293
293
  // Send all records
294
294
  for (let i = 0; i < 100; i++) {
295
- // Create JSON record
296
295
  const record = {
297
296
  device_name: `sensor-${i % 10}`,
298
297
  temp: 20 + (i % 15),
299
298
  humidity: 50 + (i % 40)
300
299
  };
301
300
 
302
- // JSON supports 2 types:
303
- // 1. object (high-level) - SDK auto-stringifies
304
- lastAckPromise = stream.ingestRecord(record);
305
- // 2. string (low-level) - pre-serialized JSON
306
- // lastAckPromise = stream.ingestRecord(JSON.stringify(record));
301
+ // ingestRecordOffset returns immediately after queuing
302
+ lastOffset = await stream.ingestRecordOffset(record);
307
303
  }
308
304
 
309
- console.log('All records sent. Waiting for last acknowledgment...');
310
-
311
- // Wait for the last record's acknowledgment
312
- const lastOffset = await lastAckPromise;
313
- console.log(`Last record offset: ${lastOffset}`);
314
-
315
- // Flush to ensure all records are acknowledged
316
- await stream.flush();
305
+ // Wait for all records to be acknowledged
306
+ await stream.waitForOffset(lastOffset);
317
307
  console.log('Successfully ingested 100 records!');
318
308
  } finally {
319
- // Always close the stream
320
309
  await stream.close();
321
310
  }
322
311
  ```
@@ -464,7 +453,7 @@ const stream = await sdk.createStream(tableProperties, clientId, clientSecret, o
464
453
 
465
454
  try {
466
455
  const AirQuality = airQuality.examples.AirQuality;
467
- let lastAckPromise;
456
+ let lastOffset: bigint = 0n;
468
457
 
469
458
  // Send all records
470
459
  for (let i = 0; i < 100; i++) {
@@ -474,22 +463,12 @@ try {
474
463
  humidity: 50 + i
475
464
  });
476
465
 
477
- // Protobuf supports 2 types:
478
- // 1. Message object (high-level) - SDK calls .encode().finish()
479
- lastAckPromise = stream.ingestRecord(record);
480
- // 2. Buffer (low-level) - pre-serialized bytes
481
- // const buffer = Buffer.from(AirQuality.encode(record).finish());
482
- // lastAckPromise = stream.ingestRecord(buffer);
466
+ // ingestRecordOffset returns immediately after queuing
467
+ lastOffset = await stream.ingestRecordOffset(record);
483
468
  }
484
469
 
485
- console.log('All records sent. Waiting for last acknowledgment...');
486
-
487
- // Wait for the last record's acknowledgment
488
- const lastOffset = await lastAckPromise;
489
- console.log(`Last record offset: ${lastOffset}`);
490
-
491
- // Flush to ensure all records are acknowledged
492
- await stream.flush();
470
+ // Wait for all records to be acknowledged
471
+ await stream.waitForOffset(lastOffset);
493
472
  console.log('Successfully ingested 100 records!');
494
473
  } finally {
495
474
  await stream.close();
@@ -610,7 +589,7 @@ npm run build:proto
610
589
  protoc --descriptor_set_out=schemas/air_quality_descriptor.pb --include_imports schemas/air_quality.proto
611
590
 
612
591
  # Run example
613
- npx tsx examples/proto.ts
592
+ npm run example:proto:single
614
593
  ```
615
594
 
616
595
  #### Why Two Steps (TypeScript + Descriptor)?
@@ -641,15 +620,17 @@ export DATABRICKS_CLIENT_ID="your-client-id"
641
620
  export DATABRICKS_CLIENT_SECRET="your-client-secret"
642
621
  export ZEROBUS_TABLE_NAME="main.default.air_quality"
643
622
 
644
- # Run JSON example
645
- npx tsx examples/json.ts
623
+ # Run JSON examples
624
+ npm run example:json:single
625
+ npm run example:json:batch
646
626
 
647
627
  # For Protocol Buffers, generate TypeScript code and descriptor
648
628
  npm run build:proto
649
629
  protoc --descriptor_set_out=schemas/air_quality_descriptor.pb --include_imports schemas/air_quality.proto
650
630
 
651
- # Run Protocol Buffers example
652
- npx tsx examples/proto.ts
631
+ # Run Protocol Buffers examples
632
+ npm run example:proto:single
633
+ npm run example:proto:batch
653
634
  ```
654
635
 
655
636
  ### Batch Ingestion
@@ -664,13 +645,14 @@ const records = Array.from({ length: 1000 }, (_, i) =>
664
645
  );
665
646
 
666
647
  // Protobuf Type 1: Message objects (high-level) - SDK auto-serializes
667
- const offsetId = await stream.ingestRecords(records);
648
+ const offsetId = await stream.ingestRecordsOffset(records);
668
649
 
669
650
  // Protobuf Type 2: Buffers (low-level) - pre-serialized bytes
670
651
  // const buffers = records.map(r => Buffer.from(AirQuality.encode(r).finish()));
671
- // const offsetId = await stream.ingestRecords(buffers);
652
+ // const offsetId = await stream.ingestRecordsOffset(buffers);
672
653
 
673
654
  if (offsetId !== null) {
655
+ await stream.waitForOffset(offsetId);
674
656
  console.log(`Batch acknowledged at offset ${offsetId}`);
675
657
  }
676
658
  ```
@@ -685,11 +667,15 @@ const records = Array.from({ length: 1000 }, (_, i) => ({
685
667
  }));
686
668
 
687
669
  // JSON Type 1: objects (high-level) - SDK auto-stringifies
688
- const offsetId = await stream.ingestRecords(records);
670
+ const offsetId = await stream.ingestRecordsOffset(records);
689
671
 
690
672
  // JSON Type 2: strings (low-level) - pre-serialized JSON
691
673
  // const jsonRecords = records.map(r => JSON.stringify(r));
692
- // const offsetId = await stream.ingestRecords(jsonRecords);
674
+ // const offsetId = await stream.ingestRecordsOffset(jsonRecords);
675
+
676
+ if (offsetId !== null) {
677
+ await stream.waitForOffset(offsetId);
678
+ }
693
679
  ```
694
680
 
695
681
  **Type Widening Support:**
@@ -703,7 +689,7 @@ const offsetId = await stream.ingestRecords(records);
703
689
  - Use `recreateStream()` for recovery - it automatically handles unacknowledged batches
704
690
 
705
691
  **Examples:**
706
- Both `json.ts` and `proto.ts` examples demonstrate batch ingestion.
692
+ See `examples/json/batch.ts` and `examples/proto/batch.ts` for batch ingestion examples.
707
693
 
708
694
  ## Authentication
709
695
 
@@ -732,29 +718,25 @@ The SDK automatically fetches access tokens and includes these headers:
732
718
  Beyond OAuth, you can use custom headers for Personal Access Tokens (PAT) or other auth methods:
733
719
 
734
720
  ```typescript
735
- import { ZerobusSdk } from '@databricks/zerobus-ingest-sdk';
736
- import { HeadersProvider } from '@databricks/zerobus-ingest-sdk/src/headers_provider';
737
-
738
- class CustomHeadersProvider implements HeadersProvider {
739
- async getHeaders(): Promise<Array<[string, string]>> {
740
- return [
741
- ["authorization", `Bearer ${myToken}`],
742
- ["x-databricks-zerobus-table-name", tableName]
743
- ];
744
- }
745
- }
746
-
747
- const headersProvider = new CustomHeadersProvider();
748
721
  const stream = await sdk.createStream(
749
722
  tableProperties,
750
723
  '', // client_id (ignored when headers_provider is provided)
751
724
  '', // client_secret (ignored when headers_provider is provided)
752
725
  options,
753
- { getHeadersCallback: headersProvider.getHeaders.bind(headersProvider) }
726
+ {
727
+ getHeadersCallback: async () => [
728
+ ["authorization", `Bearer ${myToken}`],
729
+ ["x-databricks-zerobus-table-name", tableName]
730
+ ]
731
+ }
754
732
  );
755
733
  ```
756
734
 
757
- **Note:** Custom authentication is integrated into the main `createStream()` method. See the API Reference for details.
735
+ **Required headers:**
736
+ - `authorization` - Bearer token or other auth header
737
+ - `x-databricks-zerobus-table-name` - The fully qualified table name
738
+
739
+ **Note:** The SDK automatically adds the `user-agent` header if not provided.
758
740
 
759
741
  ## Configuration
760
742
 
@@ -770,6 +752,7 @@ const stream = await sdk.createStream(
770
752
  | `recoveryRetries` | 4 | Maximum number of recovery attempts |
771
753
  | `flushTimeoutMs` | 300,000 | Timeout for flush operations (ms) |
772
754
  | `serverLackOfAckTimeoutMs` | 60,000 | Server acknowledgment timeout (ms) |
755
+ | `streamPausedMaxWaitTimeMs` | undefined | Max wait time during graceful stream close (ms) |
773
756
 
774
757
  ### Example Configuration
775
758
 
@@ -839,7 +822,8 @@ The SDK includes automatic recovery for transient failures (enabled by default w
839
822
 
840
823
  ```typescript
841
824
  try {
842
- const offset = await stream.ingestRecord(JSON.stringify(record));
825
+ const offset = await stream.ingestRecordOffset(record);
826
+ await stream.waitForOffset(offset);
843
827
  console.log(`Success: offset ${offset}`);
844
828
  } catch (error) {
845
829
  console.error('Ingestion failed:', error);
@@ -952,11 +936,44 @@ Represents an active ingestion stream.
952
936
 
953
937
  **Methods:**
954
938
 
939
+ ```typescript
940
+ async ingestRecordOffset(payload: Buffer | string | object): Promise<bigint>
941
+ ```
942
+
943
+ **(Recommended)** Ingests a single record. The Promise resolves immediately after the record is queued (before server acknowledgment). Use `waitForOffset()` to wait for acknowledgment when needed.
944
+
945
+ ```typescript
946
+ // High-throughput pattern: send many, wait once
947
+ const offset1 = await stream.ingestRecordOffset(record1); // Resolves immediately
948
+ const offset2 = await stream.ingestRecordOffset(record2); // Resolves immediately
949
+ await stream.waitForOffset(offset2); // Waits for server to acknowledge all records up to offset2
950
+ ```
951
+
952
+ ---
953
+
954
+ ```typescript
955
+ async ingestRecordsOffset(payloads: Array<Buffer | string | object>): Promise<bigint | null>
956
+ ```
957
+
958
+ **(Recommended)** Ingests multiple records as a batch. The Promise resolves immediately after the batch is queued (before server acknowledgment). Returns `null` for empty batches.
959
+
960
+ ---
961
+
962
+ ```typescript
963
+ async waitForOffset(offsetId: bigint): Promise<void>
964
+ ```
965
+
966
+ Waits for the server to acknowledge all records up to and including the specified offset ID.
967
+
968
+ ---
969
+
955
970
  ```typescript
956
971
  async ingestRecord(payload: Buffer | string | object): Promise<bigint>
957
972
  ```
958
973
 
959
- Ingests a single record. This method **blocks** until the record is sent to the SDK's internal landing zone, then returns a Promise for the server acknowledgment. This allows you to send many records without waiting for individual acknowledgments.
974
+ **@deprecated** Use `ingestRecordOffset()` instead.
975
+
976
+ Ingests a single record. Unlike `ingestRecordOffset()`, the Promise only resolves **after the server acknowledges** the record. This is slower for high-throughput scenarios.
960
977
 
961
978
  **Parameters:**
962
979
  - `payload` - Record data. The SDK supports 4 input types for flexibility:
@@ -994,7 +1011,9 @@ await stream.ingestRecord(buffer);
994
1011
  async ingestRecords(payloads: Array<Buffer | string | object>): Promise<bigint | null>
995
1012
  ```
996
1013
 
997
- Ingests multiple records as a batch. All records in a batch are acknowledged together atomically. This method **blocks** until all records are sent to the SDK's internal landing zone, then returns a Promise for the server acknowledgment.
1014
+ **@deprecated** Use `ingestRecordsOffset()` instead.
1015
+
1016
+ Ingests multiple records as a batch. Unlike `ingestRecordsOffset()`, the Promise only resolves **after the server acknowledges** the batch. This is slower for high-throughput scenarios.
998
1017
 
999
1018
  **Parameters:**
1000
1019
  - `payloads` - Array of record data. Supports the same 4 types as `ingestRecord()`:
@@ -1135,14 +1154,15 @@ Configuration options for stream behavior.
1135
1154
 
1136
1155
  ```typescript
1137
1156
  interface StreamConfigurationOptions {
1138
- recordType?: RecordType; // RecordType.Json or RecordType.Proto. Default: RecordType.Proto
1139
- maxInflightRequests?: number; // Default: 10,000
1140
- recovery?: boolean; // Default: true
1141
- recoveryTimeoutMs?: number; // Default: 15,000
1142
- recoveryBackoffMs?: number; // Default: 2,000
1143
- recoveryRetries?: number; // Default: 4
1144
- flushTimeoutMs?: number; // Default: 300,000
1145
- serverLackOfAckTimeoutMs?: number; // Default: 60,000
1157
+ recordType?: RecordType; // RecordType.Json or RecordType.Proto. Default: RecordType.Proto
1158
+ maxInflightRequests?: number; // Default: 10,000
1159
+ recovery?: boolean; // Default: true
1160
+ recoveryTimeoutMs?: number; // Default: 15,000
1161
+ recoveryBackoffMs?: number; // Default: 2,000
1162
+ recoveryRetries?: number; // Default: 4
1163
+ flushTimeoutMs?: number; // Default: 300,000
1164
+ serverLackOfAckTimeoutMs?: number; // Default: 60,000
1165
+ streamPausedMaxWaitTimeMs?: number; // Default: undefined (wait for full server duration)
1146
1166
  }
1147
1167
 
1148
1168
  enum RecordType {
@@ -1159,7 +1179,7 @@ enum RecordType {
1159
1179
  4. **Error handling**: The stream handles errors internally with automatic retry. Only use `recreateStream()` for persistent failures after internal retries are exhausted.
1160
1180
  5. **Use Protocol Buffers for production**: Protocol Buffers (the default) provides better performance and schema validation. Use JSON only when you need schema flexibility or for quick prototyping.
1161
1181
  6. **Store credentials securely**: Use environment variables, never hardcode credentials
1162
- 7. **Use batch ingestion**: For high-throughput scenarios, use `ingestRecords()` instead of individual `ingestRecord()` calls
1182
+ 7. **Use batch ingestion**: For high-throughput scenarios, use `ingestRecordsOffset()` instead of individual `ingestRecordOffset()` calls
1163
1183
 
1164
1184
  ## Platform Support
1165
1185
 
@@ -1203,7 +1223,7 @@ This SDK wraps the high-performance [Rust Zerobus SDK](https://github.com/databr
1203
1223
  ```
1204
1224
 
1205
1225
  **Benefits:**
1206
- - **Zero-copy data transfer** between JavaScript and Rust
1226
+ - **Native performance** - Rust implementation for high-throughput ingestion
1207
1227
  - **Native async/await support** - Rust futures become JavaScript Promises
1208
1228
  - **Automatic memory management** - No manual cleanup required
1209
1229
  - **Type safety** - Compile-time checks on both sides