@databricks/zerobus-ingest-sdk 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/Cargo.lock +707 -205
- package/Cargo.toml +18 -3
- package/README.md +89 -69
- package/index.d.ts +538 -0
- package/index.js +318 -0
- package/package.json +23 -13
- package/schemas/air_quality_descriptor.pb +9 -0
- package/schemas/edge_cases_descriptor.pb +41 -0
- package/src/lib.rs +887 -38
- package/zerobus-ingest-sdk.linux-arm64-gnu.node +0 -0
- package/zerobus-ingest-sdk.linux-x64-gnu.node +0 -0
- package/zerobus-ingest-sdk.win32-x64-msvc.node +0 -0
package/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "zerobus-sdk-ts"
|
|
3
|
-
version = "0.1.1"
|
|
3
|
+
version = "0.2.0"
|
|
4
4
|
authors = ["Databricks"]
|
|
5
5
|
edition = "2021"
|
|
6
6
|
license-file = "LICENSE"
|
|
@@ -15,8 +15,8 @@ crate-type = ["cdylib"]
|
|
|
15
15
|
napi = { version = "2", features = ["async", "tokio_rt", "serde-json"] }
|
|
16
16
|
napi-derive = "2"
|
|
17
17
|
|
|
18
|
-
# The Rust SDK we're wrapping
|
|
19
|
-
databricks-zerobus-ingest-sdk = { git = "https://github.com/databricks/zerobus-sdk-rs", tag = "v0.
|
|
18
|
+
# The Rust SDK we're wrapping
|
|
19
|
+
databricks-zerobus-ingest-sdk = { git = "https://github.com/databricks/zerobus-sdk-rs", tag = "v0.4.0" }
|
|
20
20
|
|
|
21
21
|
# Async runtime (same as the Rust SDK)
|
|
22
22
|
tokio = { version = "1.42", features = ["macros", "rt-multi-thread"] }
|
|
@@ -38,9 +38,24 @@ base64 = "0.21"
|
|
|
38
38
|
# Async trait support
|
|
39
39
|
async-trait = "0.1"
|
|
40
40
|
|
|
41
|
+
# Arrow dependencies (only used when arrow-flight feature is enabled)
|
|
42
|
+
arrow-array = { version = "56.2.0", optional = true }
|
|
43
|
+
arrow-schema = { version = "56.2.0", optional = true }
|
|
44
|
+
arrow-ipc = { version = "56.2.0", features = ["lz4", "zstd"], optional = true }
|
|
45
|
+
|
|
41
46
|
[build-dependencies]
|
|
42
47
|
napi-build = "2"
|
|
43
48
|
|
|
49
|
+
[features]
|
|
50
|
+
default = []
|
|
51
|
+
# Arrow Flight is experimental/unsupported - enable with: npm run build:arrow
|
|
52
|
+
arrow-flight = [
|
|
53
|
+
"databricks-zerobus-ingest-sdk/arrow-flight",
|
|
54
|
+
"dep:arrow-array",
|
|
55
|
+
"dep:arrow-schema",
|
|
56
|
+
"dep:arrow-ipc"
|
|
57
|
+
]
|
|
58
|
+
|
|
44
59
|
[profile.release]
|
|
45
60
|
lto = true
|
|
46
61
|
strip = true
|
package/README.md
CHANGED
|
@@ -288,35 +288,24 @@ const stream = await sdk.createStream(
|
|
|
288
288
|
);
|
|
289
289
|
|
|
290
290
|
try {
|
|
291
|
-
let lastAckPromise: Promise<bigint>;
|
|
291
|
+
let lastOffset: bigint;
|
|
292
292
|
|
|
293
293
|
// Send all records
|
|
294
294
|
for (let i = 0; i < 100; i++) {
|
|
295
|
-
// Create JSON record
|
|
296
295
|
const record = {
|
|
297
296
|
device_name: `sensor-${i % 10}`,
|
|
298
297
|
temp: 20 + (i % 15),
|
|
299
298
|
humidity: 50 + (i % 40)
|
|
300
299
|
};
|
|
301
300
|
|
|
302
|
-
//
|
|
303
|
-
|
|
304
|
-
lastAckPromise = stream.ingestRecord(record);
|
|
305
|
-
// 2. string (low-level) - pre-serialized JSON
|
|
306
|
-
// lastAckPromise = stream.ingestRecord(JSON.stringify(record));
|
|
301
|
+
// ingestRecordOffset returns immediately after queuing
|
|
302
|
+
lastOffset = await stream.ingestRecordOffset(record);
|
|
307
303
|
}
|
|
308
304
|
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
// Wait for the last record's acknowledgment
|
|
312
|
-
const lastOffset = await lastAckPromise;
|
|
313
|
-
console.log(`Last record offset: ${lastOffset}`);
|
|
314
|
-
|
|
315
|
-
// Flush to ensure all records are acknowledged
|
|
316
|
-
await stream.flush();
|
|
305
|
+
// Wait for all records to be acknowledged
|
|
306
|
+
await stream.waitForOffset(lastOffset);
|
|
317
307
|
console.log('Successfully ingested 100 records!');
|
|
318
308
|
} finally {
|
|
319
|
-
// Always close the stream
|
|
320
309
|
await stream.close();
|
|
321
310
|
}
|
|
322
311
|
```
|
|
@@ -464,7 +453,7 @@ const stream = await sdk.createStream(tableProperties, clientId, clientSecret, o
|
|
|
464
453
|
|
|
465
454
|
try {
|
|
466
455
|
const AirQuality = airQuality.examples.AirQuality;
|
|
467
|
-
let lastAckPromise: Promise<bigint>;
|
|
456
|
+
let lastOffset: bigint;
|
|
468
457
|
|
|
469
458
|
// Send all records
|
|
470
459
|
for (let i = 0; i < 100; i++) {
|
|
@@ -474,22 +463,12 @@ try {
|
|
|
474
463
|
humidity: 50 + i
|
|
475
464
|
});
|
|
476
465
|
|
|
477
|
-
//
|
|
478
|
-
|
|
479
|
-
lastAckPromise = stream.ingestRecord(record);
|
|
480
|
-
// 2. Buffer (low-level) - pre-serialized bytes
|
|
481
|
-
// const buffer = Buffer.from(AirQuality.encode(record).finish());
|
|
482
|
-
// lastAckPromise = stream.ingestRecord(buffer);
|
|
466
|
+
// ingestRecordOffset returns immediately after queuing
|
|
467
|
+
lastOffset = await stream.ingestRecordOffset(record);
|
|
483
468
|
}
|
|
484
469
|
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
// Wait for the last record's acknowledgment
|
|
488
|
-
const lastOffset = await lastAckPromise;
|
|
489
|
-
console.log(`Last record offset: ${lastOffset}`);
|
|
490
|
-
|
|
491
|
-
// Flush to ensure all records are acknowledged
|
|
492
|
-
await stream.flush();
|
|
470
|
+
// Wait for all records to be acknowledged
|
|
471
|
+
await stream.waitForOffset(lastOffset);
|
|
493
472
|
console.log('Successfully ingested 100 records!');
|
|
494
473
|
} finally {
|
|
495
474
|
await stream.close();
|
|
@@ -610,7 +589,7 @@ npm run build:proto
|
|
|
610
589
|
protoc --descriptor_set_out=schemas/air_quality_descriptor.pb --include_imports schemas/air_quality.proto
|
|
611
590
|
|
|
612
591
|
# Run example
|
|
613
|
-
|
|
592
|
+
npm run example:proto:single
|
|
614
593
|
```
|
|
615
594
|
|
|
616
595
|
#### Why Two Steps (TypeScript + Descriptor)?
|
|
@@ -641,15 +620,17 @@ export DATABRICKS_CLIENT_ID="your-client-id"
|
|
|
641
620
|
export DATABRICKS_CLIENT_SECRET="your-client-secret"
|
|
642
621
|
export ZEROBUS_TABLE_NAME="main.default.air_quality"
|
|
643
622
|
|
|
644
|
-
# Run JSON
|
|
645
|
-
|
|
623
|
+
# Run JSON examples
|
|
624
|
+
npm run example:json:single
|
|
625
|
+
npm run example:json:batch
|
|
646
626
|
|
|
647
627
|
# For Protocol Buffers, generate TypeScript code and descriptor
|
|
648
628
|
npm run build:proto
|
|
649
629
|
protoc --descriptor_set_out=schemas/air_quality_descriptor.pb --include_imports schemas/air_quality.proto
|
|
650
630
|
|
|
651
|
-
# Run Protocol Buffers
|
|
652
|
-
|
|
631
|
+
# Run Protocol Buffers examples
|
|
632
|
+
npm run example:proto:single
|
|
633
|
+
npm run example:proto:batch
|
|
653
634
|
```
|
|
654
635
|
|
|
655
636
|
### Batch Ingestion
|
|
@@ -664,13 +645,14 @@ const records = Array.from({ length: 1000 }, (_, i) =>
|
|
|
664
645
|
);
|
|
665
646
|
|
|
666
647
|
// Protobuf Type 1: Message objects (high-level) - SDK auto-serializes
|
|
667
|
-
const offsetId = await stream.ingestRecords(records);
|
|
648
|
+
const offsetId = await stream.ingestRecordsOffset(records);
|
|
668
649
|
|
|
669
650
|
// Protobuf Type 2: Buffers (low-level) - pre-serialized bytes
|
|
670
651
|
// const buffers = records.map(r => Buffer.from(AirQuality.encode(r).finish()));
|
|
671
|
-
// const offsetId = await stream.ingestRecords(buffers);
|
|
652
|
+
// const offsetId = await stream.ingestRecordsOffset(buffers);
|
|
672
653
|
|
|
673
654
|
if (offsetId !== null) {
|
|
655
|
+
await stream.waitForOffset(offsetId);
|
|
674
656
|
console.log(`Batch acknowledged at offset ${offsetId}`);
|
|
675
657
|
}
|
|
676
658
|
```
|
|
@@ -685,11 +667,15 @@ const records = Array.from({ length: 1000 }, (_, i) => ({
|
|
|
685
667
|
}));
|
|
686
668
|
|
|
687
669
|
// JSON Type 1: objects (high-level) - SDK auto-stringifies
|
|
688
|
-
const offsetId = await stream.ingestRecords(records);
|
|
670
|
+
const offsetId = await stream.ingestRecordsOffset(records);
|
|
689
671
|
|
|
690
672
|
// JSON Type 2: strings (low-level) - pre-serialized JSON
|
|
691
673
|
// const jsonRecords = records.map(r => JSON.stringify(r));
|
|
692
|
-
// const offsetId = await stream.ingestRecords(jsonRecords);
|
|
674
|
+
// const offsetId = await stream.ingestRecordsOffset(jsonRecords);
|
|
675
|
+
|
|
676
|
+
if (offsetId !== null) {
|
|
677
|
+
await stream.waitForOffset(offsetId);
|
|
678
|
+
}
|
|
693
679
|
```
|
|
694
680
|
|
|
695
681
|
**Type Widening Support:**
|
|
@@ -703,7 +689,7 @@ const offsetId = await stream.ingestRecords(records);
|
|
|
703
689
|
- Use `recreateStream()` for recovery - it automatically handles unacknowledged batches
|
|
704
690
|
|
|
705
691
|
**Examples:**
|
|
706
|
-
|
|
692
|
+
See `examples/json/batch.ts` and `examples/proto/batch.ts` for batch ingestion examples.
|
|
707
693
|
|
|
708
694
|
## Authentication
|
|
709
695
|
|
|
@@ -732,29 +718,25 @@ The SDK automatically fetches access tokens and includes these headers:
|
|
|
732
718
|
Beyond OAuth, you can use custom headers for Personal Access Tokens (PAT) or other auth methods:
|
|
733
719
|
|
|
734
720
|
```typescript
|
|
735
|
-
import { ZerobusSdk } from '@databricks/zerobus-ingest-sdk';
|
|
736
|
-
import { HeadersProvider } from '@databricks/zerobus-ingest-sdk/src/headers_provider';
|
|
737
|
-
|
|
738
|
-
class CustomHeadersProvider implements HeadersProvider {
|
|
739
|
-
async getHeaders(): Promise<Array<[string, string]>> {
|
|
740
|
-
return [
|
|
741
|
-
["authorization", `Bearer ${myToken}`],
|
|
742
|
-
["x-databricks-zerobus-table-name", tableName]
|
|
743
|
-
];
|
|
744
|
-
}
|
|
745
|
-
}
|
|
746
|
-
|
|
747
|
-
const headersProvider = new CustomHeadersProvider();
|
|
748
721
|
const stream = await sdk.createStream(
|
|
749
722
|
tableProperties,
|
|
750
723
|
'', // client_id (ignored when headers_provider is provided)
|
|
751
724
|
'', // client_secret (ignored when headers_provider is provided)
|
|
752
725
|
options,
|
|
753
|
-
{
|
|
726
|
+
{
|
|
727
|
+
getHeadersCallback: async () => [
|
|
728
|
+
["authorization", `Bearer ${myToken}`],
|
|
729
|
+
["x-databricks-zerobus-table-name", tableName]
|
|
730
|
+
]
|
|
731
|
+
}
|
|
754
732
|
);
|
|
755
733
|
```
|
|
756
734
|
|
|
757
|
-
**
|
|
735
|
+
**Required headers:**
|
|
736
|
+
- `authorization` - Bearer token or other auth header
|
|
737
|
+
- `x-databricks-zerobus-table-name` - The fully qualified table name
|
|
738
|
+
|
|
739
|
+
**Note:** The SDK automatically adds the `user-agent` header if not provided.
|
|
758
740
|
|
|
759
741
|
## Configuration
|
|
760
742
|
|
|
@@ -770,6 +752,7 @@ const stream = await sdk.createStream(
|
|
|
770
752
|
| `recoveryRetries` | 4 | Maximum number of recovery attempts |
|
|
771
753
|
| `flushTimeoutMs` | 300,000 | Timeout for flush operations (ms) |
|
|
772
754
|
| `serverLackOfAckTimeoutMs` | 60,000 | Server acknowledgment timeout (ms) |
|
|
755
|
+
| `streamPausedMaxWaitTimeMs` | undefined | Max wait time during graceful stream close (ms) |
|
|
773
756
|
|
|
774
757
|
### Example Configuration
|
|
775
758
|
|
|
@@ -839,7 +822,8 @@ The SDK includes automatic recovery for transient failures (enabled by default w
|
|
|
839
822
|
|
|
840
823
|
```typescript
|
|
841
824
|
try {
|
|
842
|
-
const offset = await stream.ingestRecord(record);
|
|
825
|
+
const offset = await stream.ingestRecordOffset(record);
|
|
826
|
+
await stream.waitForOffset(offset);
|
|
843
827
|
console.log(`Success: offset ${offset}`);
|
|
844
828
|
} catch (error) {
|
|
845
829
|
console.error('Ingestion failed:', error);
|
|
@@ -952,11 +936,44 @@ Represents an active ingestion stream.
|
|
|
952
936
|
|
|
953
937
|
**Methods:**
|
|
954
938
|
|
|
939
|
+
```typescript
|
|
940
|
+
async ingestRecordOffset(payload: Buffer | string | object): Promise<bigint>
|
|
941
|
+
```
|
|
942
|
+
|
|
943
|
+
**(Recommended)** Ingests a single record. The Promise resolves immediately after the record is queued (before server acknowledgment). Use `waitForOffset()` to wait for acknowledgment when needed.
|
|
944
|
+
|
|
945
|
+
```typescript
|
|
946
|
+
// High-throughput pattern: send many, wait once
|
|
947
|
+
const offset1 = await stream.ingestRecordOffset(record1); // Resolves immediately
|
|
948
|
+
const offset2 = await stream.ingestRecordOffset(record2); // Resolves immediately
|
|
949
|
+
await stream.waitForOffset(offset2); // Waits for server to acknowledge all records up to offset2
|
|
950
|
+
```
|
|
951
|
+
|
|
952
|
+
---
|
|
953
|
+
|
|
954
|
+
```typescript
|
|
955
|
+
async ingestRecordsOffset(payloads: Array<Buffer | string | object>): Promise<bigint | null>
|
|
956
|
+
```
|
|
957
|
+
|
|
958
|
+
**(Recommended)** Ingests multiple records as a batch. The Promise resolves immediately after the batch is queued (before server acknowledgment). Returns `null` for empty batches.
|
|
959
|
+
|
|
960
|
+
---
|
|
961
|
+
|
|
962
|
+
```typescript
|
|
963
|
+
async waitForOffset(offsetId: bigint): Promise<void>
|
|
964
|
+
```
|
|
965
|
+
|
|
966
|
+
Waits for the server to acknowledge all records up to and including the specified offset ID.
|
|
967
|
+
|
|
968
|
+
---
|
|
969
|
+
|
|
955
970
|
```typescript
|
|
956
971
|
async ingestRecord(payload: Buffer | string | object): Promise<bigint>
|
|
957
972
|
```
|
|
958
973
|
|
|
959
|
-
|
|
974
|
+
**@deprecated** Use `ingestRecordOffset()` instead.
|
|
975
|
+
|
|
976
|
+
Ingests a single record. Unlike `ingestRecordOffset()`, the Promise only resolves **after the server acknowledges** the record. This is slower for high-throughput scenarios.
|
|
960
977
|
|
|
961
978
|
**Parameters:**
|
|
962
979
|
- `payload` - Record data. The SDK supports 4 input types for flexibility:
|
|
@@ -994,7 +1011,9 @@ await stream.ingestRecord(buffer);
|
|
|
994
1011
|
async ingestRecords(payloads: Array<Buffer | string | object>): Promise<bigint | null>
|
|
995
1012
|
```
|
|
996
1013
|
|
|
997
|
-
|
|
1014
|
+
**@deprecated** Use `ingestRecordsOffset()` instead.
|
|
1015
|
+
|
|
1016
|
+
Ingests multiple records as a batch. Unlike `ingestRecordsOffset()`, the Promise only resolves **after the server acknowledges** the batch. This is slower for high-throughput scenarios.
|
|
998
1017
|
|
|
999
1018
|
**Parameters:**
|
|
1000
1019
|
- `payloads` - Array of record data. Supports the same 4 types as `ingestRecord()`:
|
|
@@ -1135,14 +1154,15 @@ Configuration options for stream behavior.
|
|
|
1135
1154
|
|
|
1136
1155
|
```typescript
|
|
1137
1156
|
interface StreamConfigurationOptions {
|
|
1138
|
-
recordType?: RecordType;
|
|
1139
|
-
maxInflightRequests?: number;
|
|
1140
|
-
recovery?: boolean;
|
|
1141
|
-
recoveryTimeoutMs?: number;
|
|
1142
|
-
recoveryBackoffMs?: number;
|
|
1143
|
-
recoveryRetries?: number;
|
|
1144
|
-
flushTimeoutMs?: number;
|
|
1145
|
-
serverLackOfAckTimeoutMs?: number;
|
|
1157
|
+
recordType?: RecordType; // RecordType.Json or RecordType.Proto. Default: RecordType.Proto
|
|
1158
|
+
maxInflightRequests?: number; // Default: 10,000
|
|
1159
|
+
recovery?: boolean; // Default: true
|
|
1160
|
+
recoveryTimeoutMs?: number; // Default: 15,000
|
|
1161
|
+
recoveryBackoffMs?: number; // Default: 2,000
|
|
1162
|
+
recoveryRetries?: number; // Default: 4
|
|
1163
|
+
flushTimeoutMs?: number; // Default: 300,000
|
|
1164
|
+
serverLackOfAckTimeoutMs?: number; // Default: 60,000
|
|
1165
|
+
streamPausedMaxWaitTimeMs?: number; // Default: undefined (wait for full server duration)
|
|
1146
1166
|
}
|
|
1147
1167
|
|
|
1148
1168
|
enum RecordType {
|
|
@@ -1159,7 +1179,7 @@ enum RecordType {
|
|
|
1159
1179
|
4. **Error handling**: The stream handles errors internally with automatic retry. Only use `recreateStream()` for persistent failures after internal retries are exhausted.
|
|
1160
1180
|
5. **Use Protocol Buffers for production**: Protocol Buffers (the default) provides better performance and schema validation. Use JSON only when you need schema flexibility or for quick prototyping.
|
|
1161
1181
|
6. **Store credentials securely**: Use environment variables, never hardcode credentials
|
|
1162
|
-
7. **Use batch ingestion**: For high-throughput scenarios, use `ingestRecords()` instead of individual `ingestRecord()` calls
|
|
1182
|
+
7. **Use batch ingestion**: For high-throughput scenarios, use `ingestRecordsOffset()` instead of individual `ingestRecordOffset()` calls
|
|
1163
1183
|
|
|
1164
1184
|
## Platform Support
|
|
1165
1185
|
|
|
@@ -1203,7 +1223,7 @@ This SDK wraps the high-performance [Rust Zerobus SDK](https://github.com/databr
|
|
|
1203
1223
|
```
|
|
1204
1224
|
|
|
1205
1225
|
**Benefits:**
|
|
1206
|
-
- **
|
|
1226
|
+
- **Native performance** - Rust implementation for high-throughput ingestion
|
|
1207
1227
|
- **Native async/await support** - Rust futures become JavaScript Promises
|
|
1208
1228
|
- **Automatic memory management** - No manual cleanup required
|
|
1209
1229
|
- **Type safety** - Compile-time checks on both sides
|