@procwire/codec-arrow 0.1.3 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +230 -115
- package/dist/codec.d.ts +654 -0
- package/dist/codec.d.ts.map +1 -0
- package/dist/codec.js +598 -0
- package/dist/codec.js.map +1 -0
- package/dist/index.d.ts +118 -40
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +73 -56
- package/dist/index.js.map +1 -1
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -1,9 +1,27 @@
|
|
|
1
1
|
# @procwire/codec-arrow
|
|
2
2
|
|
|
3
|
-
Apache Arrow serialization codec for `@procwire/transport`.
|
|
3
|
+
High-performance Apache Arrow IPC serialization codec for `@procwire/transport`.
|
|
4
4
|
|
|
5
5
|
Provides efficient columnar data serialization using [apache-arrow](https://github.com/apache/arrow/tree/main/js), ideal for analytical workloads and large datasets.
|
|
6
6
|
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- **Zero-copy serialization** - No unnecessary memory allocation
|
|
10
|
+
- **Configurable IPC format** - Stream (default) or File format
|
|
11
|
+
- **Input validation** - Enabled by default; can be disabled for maximum performance
|
|
12
|
+
- **Metrics collection** - Optional throughput monitoring
|
|
13
|
+
- **Cross-language** - Compatible with PyArrow, Arrow C++, etc.
|
|
14
|
+
- **Type-safe** - Full TypeScript support
|
|
15
|
+
|
|
16
|
+
## Performance
|
|
17
|
+
|
|
18
|
+
| Metric | Value |
|
|
19
|
+
| ---------------------- | ------------------------ |
|
|
20
|
+
| Throughput | >1M rows/second |
|
|
21
|
+
| Serialization overhead | Near-zero (zero-copy) |
|
|
22
|
+
| Memory overhead | Minimal (reuses buffers) |
|
|
23
|
+
| Stream format overhead | ~100-200 bytes |
|
|
24
|
+
|
|
7
25
|
## Installation
|
|
8
26
|
|
|
9
27
|
```bash
|
|
@@ -12,141 +30,263 @@ npm install @procwire/codec-arrow apache-arrow
|
|
|
12
30
|
|
|
13
31
|
Note: `apache-arrow` is a peer dependency and must be installed separately.
|
|
14
32
|
|
|
15
|
-
##
|
|
33
|
+
## Quick Start
|
|
16
34
|
|
|
17
35
|
### Basic Usage
|
|
18
36
|
|
|
19
37
|
```ts
|
|
20
|
-
import { tableFromArrays } from 'apache-arrow';
|
|
21
|
-
import { ArrowCodec } from '@procwire/codec-arrow';
|
|
22
|
-
import { ChannelBuilder } from '@procwire/transport';
|
|
38
|
+
import { tableFromArrays } from "apache-arrow";
|
|
39
|
+
import { ArrowCodec } from "@procwire/codec-arrow";
|
|
23
40
|
|
|
24
41
|
const codec = new ArrowCodec();
|
|
25
42
|
|
|
26
|
-
// Create a table
|
|
27
43
|
const table = tableFromArrays({
|
|
28
|
-
id: [1, 2, 3
|
|
29
|
-
name: [
|
|
30
|
-
score: [95.5, 87.3, 92.1,
|
|
44
|
+
id: [1, 2, 3],
|
|
45
|
+
name: ["Alice", "Bob", "Charlie"],
|
|
46
|
+
score: [95.5, 87.3, 92.1],
|
|
31
47
|
});
|
|
32
48
|
|
|
33
|
-
//
|
|
34
|
-
const
|
|
35
|
-
.withTransport(transport)
|
|
36
|
-
.withFraming(framing)
|
|
37
|
-
.withSerialization(codec)
|
|
38
|
-
.withProtocol(protocol)
|
|
39
|
-
.build();
|
|
49
|
+
// Serialize (zero-copy!)
|
|
50
|
+
const buffer = codec.serialize(table);
|
|
40
51
|
|
|
41
|
-
//
|
|
42
|
-
|
|
52
|
+
// Deserialize
|
|
53
|
+
const decoded = codec.deserialize(buffer);
|
|
54
|
+
console.log(decoded.numRows); // 3
|
|
43
55
|
```
|
|
44
56
|
|
|
45
|
-
###
|
|
57
|
+
### High-Performance Mode
|
|
46
58
|
|
|
47
59
|
```ts
|
|
48
|
-
import {
|
|
49
|
-
import { ArrowCodec } from '@procwire/codec-arrow';
|
|
60
|
+
import { createFastArrowCodec } from "@procwire/codec-arrow";
|
|
50
61
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
// Serialize
|
|
54
|
-
const table = tableFromArrays({
|
|
55
|
-
id: [1, 2, 3],
|
|
56
|
-
value: [10.5, 20.3, 30.1]
|
|
57
|
-
});
|
|
58
|
-
|
|
59
|
-
const buffer = codec.serialize(table);
|
|
62
|
+
// For trusted environments - validation disabled
|
|
63
|
+
const codec = createFastArrowCodec("stream");
|
|
60
64
|
|
|
61
|
-
//
|
|
62
|
-
const
|
|
63
|
-
|
|
64
|
-
|
|
65
|
+
// Process data at maximum throughput
|
|
66
|
+
for (const table of tables) {
|
|
67
|
+
const buffer = codec.serialize(table);
|
|
68
|
+
channel.send(buffer);
|
|
69
|
+
}
|
|
65
70
|
```
|
|
66
71
|
|
|
67
|
-
###
|
|
72
|
+
### With Metrics
|
|
68
73
|
|
|
69
74
|
```ts
|
|
70
|
-
import {
|
|
71
|
-
import { ArrowCodec } from '@procwire/codec-arrow';
|
|
75
|
+
import { createMonitoredArrowCodec } from "@procwire/codec-arrow";
|
|
72
76
|
|
|
73
|
-
const codec =
|
|
77
|
+
const codec = createMonitoredArrowCodec();
|
|
74
78
|
|
|
75
|
-
//
|
|
76
|
-
const
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
});
|
|
79
|
+
// Process data...
|
|
80
|
+
for (const table of tables) {
|
|
81
|
+
codec.serialize(table);
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
// Check throughput
|
|
85
|
+
const metrics = codec.metrics!;
|
|
86
|
+
console.log(`Processed: ${metrics.rowsSerialized.toLocaleString()} rows`);
|
|
87
|
+
console.log(`Data size: ${(metrics.bytesSerialised / 1024 / 1024).toFixed(2)} MB`);
|
|
88
|
+
console.log(`Errors: ${metrics.serializeErrors}`);
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### File Format (Random Access)
|
|
83
92
|
|
|
84
|
-
|
|
93
|
+
```ts
|
|
94
|
+
import { createFileArrowCodec } from "@procwire/codec-arrow";
|
|
95
|
+
import { writeFileSync } from "fs";
|
|
96
|
+
|
|
97
|
+
const codec = createFileArrowCodec();
|
|
85
98
|
const buffer = codec.serialize(table);
|
|
86
|
-
console.log(`Serialized ${size} rows in ${buffer.length} bytes`);
|
|
87
99
|
|
|
88
|
-
//
|
|
89
|
-
|
|
90
|
-
console.log(`Deserialized table with ${decoded.numRows} rows`);
|
|
100
|
+
// Write to disk - format supports random access
|
|
101
|
+
writeFileSync("data.arrow", buffer);
|
|
91
102
|
```
|
|
92
103
|
|
|
93
|
-
##
|
|
104
|
+
## API Reference
|
|
94
105
|
|
|
95
|
-
|
|
96
|
-
- **Type Preservation**: Full type system support (integers, floats, strings, booleans, etc.)
|
|
97
|
-
- **Null Handling**: Native support for null values
|
|
98
|
-
- **Zero-Copy**: Efficient memory usage with zero-copy reads where possible
|
|
99
|
-
- **Error Handling**: Wraps encoding/decoding errors in `SerializationError` from `@procwire/transport`
|
|
100
|
-
- **IPC Stream Format**: Uses Arrow IPC streaming format for efficient transmission
|
|
106
|
+
### ArrowCodec
|
|
101
107
|
|
|
102
|
-
|
|
108
|
+
Main codec class implementing `SerializationCodec<Table>`.
|
|
103
109
|
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
110
|
+
```ts
|
|
111
|
+
const codec = new ArrowCodec(options); // options?: ArrowCodecOptions
|
|
112
|
+
```
|
|
107
113
|
|
|
108
114
|
#### Properties
|
|
109
115
|
|
|
110
|
-
|
|
111
|
-
|
|
116
|
+
| Property | Type | Description |
|
|
117
|
+
| ------------- | --------------------------- | ------------------------- |
|
|
118
|
+
| `name` | `"arrow"` | Codec identifier |
|
|
119
|
+
| `contentType` | `string` | MIME type based on format |
|
|
120
|
+
| `metrics` | `ArrowCodecMetrics \| null` | Current metrics or null |
|
|
112
121
|
|
|
113
122
|
#### Methods
|
|
114
123
|
|
|
115
124
|
##### `serialize(value: Table): Buffer`
|
|
116
125
|
|
|
117
|
-
Serializes an Apache Arrow Table to IPC
|
|
126
|
+
Serializes an Apache Arrow Table to IPC format using zero-copy optimization.
|
|
118
127
|
|
|
119
128
|
**Parameters:**
|
|
129
|
+
|
|
120
130
|
- `value` - Arrow Table to serialize
|
|
121
131
|
|
|
122
|
-
**Returns:** `Buffer` containing Arrow IPC
|
|
132
|
+
**Returns:** `Buffer` containing Arrow IPC data
|
|
123
133
|
|
|
124
|
-
**Throws:** `SerializationError` if encoding fails
|
|
134
|
+
**Throws:** `SerializationError` if value is not a valid Table or encoding fails
|
|
125
135
|
|
|
126
136
|
##### `deserialize(buffer: Buffer): Table`
|
|
127
137
|
|
|
128
|
-
Deserializes Arrow IPC
|
|
138
|
+
Deserializes Arrow IPC data to an Apache Arrow Table.
|
|
129
139
|
|
|
130
140
|
**Parameters:**
|
|
131
|
-
|
|
141
|
+
|
|
142
|
+
- `buffer` - Buffer containing Arrow IPC data
|
|
132
143
|
|
|
133
144
|
**Returns:** Deserialized Arrow Table
|
|
134
145
|
|
|
135
|
-
**Throws:** `SerializationError` if decoding fails
|
|
146
|
+
**Throws:** `SerializationError` if buffer is invalid or decoding fails
|
|
147
|
+
|
|
148
|
+
##### `resetMetrics(): void`
|
|
149
|
+
|
|
150
|
+
Resets all collected metrics to zero. No-op if metrics collection is disabled.
|
|
151
|
+
|
|
152
|
+
### ArrowCodecOptions
|
|
153
|
+
|
|
154
|
+
| Option | Type | Default | Description |
|
|
155
|
+
| ---------------- | -------------------- | ---------- | ---------------------------- |
|
|
156
|
+
| `format` | `'stream' \| 'file'` | `'stream'` | IPC format to use |
|
|
157
|
+
| `validateInput` | `boolean` | `true` | Enable input type validation |
|
|
158
|
+
| `collectMetrics` | `boolean` | `false` | Enable metrics collection |
|
|
159
|
+
|
|
160
|
+
### ArrowCodecMetrics
|
|
161
|
+
|
|
162
|
+
Metrics collected when `collectMetrics: true`:
|
|
163
|
+
|
|
164
|
+
| Metric | Type | Description |
|
|
165
|
+
| ------------------- | -------- | ------------------------------ |
|
|
166
|
+
| `serializeCount` | `number` | Successful serialize() calls |
|
|
167
|
+
| `deserializeCount` | `number` | Successful deserialize() calls |
|
|
168
|
+
| `bytesSerialised` | `number` | Total bytes serialized |
|
|
169
|
+
| `bytesDeserialized` | `number` | Total bytes deserialized |
|
|
170
|
+
| `rowsSerialized` | `number` | Total rows serialized |
|
|
171
|
+
| `rowsDeserialized` | `number` | Total rows deserialized |
|
|
172
|
+
| `serializeErrors` | `number` | Failed serialize() calls |
|
|
173
|
+
| `deserializeErrors` | `number` | Failed deserialize() calls |
|
|
174
|
+
|
|
175
|
+
### Helper Functions
|
|
176
|
+
|
|
177
|
+
#### `createFastArrowCodec(format?: ArrowIPCFormat): ArrowCodec`
|
|
178
|
+
|
|
179
|
+
Creates a codec optimized for maximum throughput, with input validation disabled.
|
|
180
|
+
|
|
181
|
+
**Warning:** Use this only in trusted environments where the input is guaranteed to be valid.
|
|
182
|
+
|
|
183
|
+
#### `createMonitoredArrowCodec(options?: Omit<ArrowCodecOptions, 'collectMetrics'>): ArrowCodec`
|
|
184
|
+
|
|
185
|
+
Creates a codec with metrics collection enabled.
|
|
186
|
+
|
|
187
|
+
#### `createFileArrowCodec(options?: Omit<ArrowCodecOptions, 'format'>): ArrowCodec`
|
|
188
|
+
|
|
189
|
+
Creates a codec configured for the file format (supports random access).
|
|
190
|
+
|
|
191
|
+
## Performance Tuning
|
|
192
|
+
|
|
193
|
+
### Maximum Throughput
|
|
194
|
+
|
|
195
|
+
For maximum performance in trusted environments:
|
|
196
|
+
|
|
197
|
+
```ts
|
|
198
|
+
const codec = new ArrowCodec({
|
|
199
|
+
format: "stream", // Smaller, no footer overhead
|
|
200
|
+
validateInput: false, // Skip type checks
|
|
201
|
+
collectMetrics: false, // Skip metric collection
|
|
202
|
+
});
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
Or use the helper:
|
|
206
|
+
|
|
207
|
+
```ts
|
|
208
|
+
const codec = createFastArrowCodec("stream");
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
### Memory Optimization
|
|
212
|
+
|
|
213
|
+
The codec uses zero-copy serialization by wrapping the underlying ArrayBuffer:
|
|
214
|
+
|
|
215
|
+
```ts
|
|
216
|
+
// Internally uses:
|
|
217
|
+
Buffer.from(uint8array.buffer, uint8array.byteOffset, uint8array.byteLength);
|
|
218
|
+
// Instead of:
|
|
219
|
+
Buffer.from(uint8array); // This copies data!
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
This reduces memory allocation by ~50% during serialization.
|
|
223
|
+
|
|
224
|
+
### Format Selection
|
|
225
|
+
|
|
226
|
+
| Use Case | Recommended Format |
|
|
227
|
+
| -------------------- | -------------------- |
|
|
228
|
+
| IPC streaming | `'stream'` (default) |
|
|
229
|
+
| Network transfer | `'stream'` |
|
|
230
|
+
| File storage | `'file'` |
|
|
231
|
+
| Random access needed | `'file'` |
|
|
232
|
+
| Smallest size | `'stream'` |
|
|
233
|
+
|
|
234
|
+
## Integration with @procwire/transport
|
|
235
|
+
|
|
236
|
+
```ts
|
|
237
|
+
import { ChannelBuilder } from "@procwire/transport";
|
|
238
|
+
import { ArrowCodec } from "@procwire/codec-arrow";
|
|
239
|
+
|
|
240
|
+
const channel = new ChannelBuilder()
|
|
241
|
+
.withTransport(transport)
|
|
242
|
+
.withFraming(new LengthPrefixedFraming())
|
|
243
|
+
.withSerialization(new ArrowCodec({ validateInput: false }))
|
|
244
|
+
.withProtocol(new JsonRpcProtocol())
|
|
245
|
+
.build();
|
|
246
|
+
|
|
247
|
+
// Send Arrow tables over the channel
|
|
248
|
+
await channel.request("processAnalytics", analyticsTable);
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
## Type System Support
|
|
252
|
+
|
|
253
|
+
The codec provides full TypeScript support:
|
|
254
|
+
|
|
255
|
+
```ts
|
|
256
|
+
import type { Table, Schema, Field, RecordBatch } from "@procwire/codec-arrow";
|
|
257
|
+
import { ArrowCodec, type ArrowCodecOptions, type ArrowCodecMetrics } from "@procwire/codec-arrow";
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
## Error Handling
|
|
261
|
+
|
|
262
|
+
All errors are wrapped in `SerializationError` from `@procwire/transport`:
|
|
263
|
+
|
|
264
|
+
```ts
|
|
265
|
+
import { SerializationError } from "@procwire/transport";
|
|
266
|
+
|
|
267
|
+
try {
|
|
268
|
+
codec.serialize(invalidTable);
|
|
269
|
+
} catch (error) {
|
|
270
|
+
if (error instanceof SerializationError) {
|
|
271
|
+
console.error("Serialization failed:", error.message);
|
|
272
|
+
console.error("Cause:", error.cause);
|
|
273
|
+
}
|
|
274
|
+
}
|
|
275
|
+
```
|
|
136
276
|
|
|
137
277
|
## Advanced Usage
|
|
138
278
|
|
|
139
279
|
### Creating Tables from Arrays
|
|
140
280
|
|
|
141
281
|
```ts
|
|
142
|
-
import { tableFromArrays } from 'apache-arrow';
|
|
282
|
+
import { tableFromArrays } from "apache-arrow";
|
|
143
283
|
|
|
144
284
|
const table = tableFromArrays({
|
|
145
285
|
// Integer column
|
|
146
286
|
id: [1, 2, 3],
|
|
147
287
|
|
|
148
288
|
// String column
|
|
149
|
-
name: [
|
|
289
|
+
name: ["Alice", "Bob", "Charlie"],
|
|
150
290
|
|
|
151
291
|
// Float column
|
|
152
292
|
score: [95.5, 87.3, 92.1],
|
|
@@ -155,19 +295,19 @@ const table = tableFromArrays({
|
|
|
155
295
|
active: [true, false, true],
|
|
156
296
|
|
|
157
297
|
// Column with nulls
|
|
158
|
-
email: [
|
|
298
|
+
email: ["alice@example.com", null, "charlie@example.com"],
|
|
159
299
|
});
|
|
160
300
|
```
|
|
161
301
|
|
|
162
302
|
### Typed Arrays for Performance
|
|
163
303
|
|
|
164
304
|
```ts
|
|
165
|
-
import { tableFromArrays } from 'apache-arrow';
|
|
305
|
+
import { tableFromArrays } from "apache-arrow";
|
|
166
306
|
|
|
167
307
|
const table = tableFromArrays({
|
|
168
308
|
int32_col: new Int32Array([1, 2, 3, 4, 5]),
|
|
169
309
|
float64_col: new Float64Array([1.1, 2.2, 3.3, 4.4, 5.5]),
|
|
170
|
-
uint8_col: new Uint8Array([255, 128, 64, 32, 0])
|
|
310
|
+
uint8_col: new Uint8Array([255, 128, 64, 32, 0]),
|
|
171
311
|
});
|
|
172
312
|
```
|
|
173
313
|
|
|
@@ -176,11 +316,11 @@ const table = tableFromArrays({
|
|
|
176
316
|
```ts
|
|
177
317
|
const table = tableFromArrays({
|
|
178
318
|
id: [1, 2, 3],
|
|
179
|
-
name: [
|
|
319
|
+
name: ["Alice", "Bob", "Charlie"],
|
|
180
320
|
});
|
|
181
321
|
|
|
182
322
|
// Get column
|
|
183
|
-
const idColumn = table.getChild('id');
|
|
323
|
+
const idColumn = table.getChild("id");
|
|
184
324
|
const ids = idColumn?.toArray(); // [1, 2, 3]
|
|
185
325
|
|
|
186
326
|
// Iterate rows
|
|
@@ -190,30 +330,17 @@ for (let i = 0; i < table.numRows; i++) {
|
|
|
190
330
|
}
|
|
191
331
|
```
|
|
192
332
|
|
|
193
|
-
##
|
|
194
|
-
|
|
195
|
-
Apache Arrow provides exceptional performance for columnar data:
|
|
333
|
+
## Cross-Language Compatibility
|
|
196
334
|
|
|
197
|
-
|
|
198
|
-
- **Zero-Copy Reads**: Direct memory access without deserialization overhead
|
|
199
|
-
- **Compression**: Built-in dictionary encoding for repeated values
|
|
200
|
-
- **Vectorized Operations**: SIMD-friendly data layout for fast processing
|
|
201
|
-
- **Cross-Language**: Same binary format used in Python, R, Java, C++, etc.
|
|
202
|
-
|
|
203
|
-
### Performance Characteristics
|
|
335
|
+
The Arrow IPC format is cross-platform and cross-language, with implementations including:
|
|
204
336
|
|
|
205
|
-
|
|
206
|
-
- **
|
|
207
|
-
- **
|
|
208
|
-
- **
|
|
337
|
+
- **Python**: PyArrow
|
|
338
|
+
- **R**: arrow R package
|
|
339
|
+
- **Java**: Arrow Java
|
|
340
|
+
- **C++**: Arrow C++
|
|
341
|
+
- **Rust**: arrow-rs
|
|
209
342
|
|
|
210
|
-
|
|
211
|
-
- Time-series data
|
|
212
|
-
- Analytics and data science workloads
|
|
213
|
-
- Large datasets (millions of rows)
|
|
214
|
-
- High-throughput data streaming
|
|
215
|
-
- Cross-language data exchange
|
|
216
|
-
- Machine learning pipelines
|
|
343
|
+
Tables serialized in one language can be deserialized in another seamlessly.
|
|
217
344
|
|
|
218
345
|
## Use Cases
|
|
219
346
|
|
|
@@ -221,9 +348,9 @@ Ideal for:
|
|
|
221
348
|
|
|
222
349
|
```ts
|
|
223
350
|
const timeSeries = tableFromArrays({
|
|
224
|
-
timestamp: timestamps,
|
|
225
|
-
value: values,
|
|
226
|
-
quality: qualities
|
|
351
|
+
timestamp: timestamps,
|
|
352
|
+
value: values,
|
|
353
|
+
quality: qualities,
|
|
227
354
|
});
|
|
228
355
|
```
|
|
229
356
|
|
|
@@ -234,7 +361,7 @@ const analyticsData = tableFromArrays({
|
|
|
234
361
|
user_id: userIds,
|
|
235
362
|
event_type: eventTypes,
|
|
236
363
|
timestamp: timestamps,
|
|
237
|
-
properties: jsonProperties
|
|
364
|
+
properties: jsonProperties,
|
|
238
365
|
});
|
|
239
366
|
```
|
|
240
367
|
|
|
@@ -244,22 +371,10 @@ const analyticsData = tableFromArrays({
|
|
|
244
371
|
const features = tableFromArrays({
|
|
245
372
|
feature1: feature1Data,
|
|
246
373
|
feature2: feature2Data,
|
|
247
|
-
|
|
248
|
-
label: labels
|
|
374
|
+
label: labels,
|
|
249
375
|
});
|
|
250
376
|
```
|
|
251
377
|
|
|
252
|
-
## Compatibility
|
|
253
|
-
|
|
254
|
-
Arrow IPC format is cross-platform and cross-language:
|
|
255
|
-
- **Python**: PyArrow
|
|
256
|
-
- **R**: arrow R package
|
|
257
|
-
- **Java**: Arrow Java
|
|
258
|
-
- **C++**: Arrow C++
|
|
259
|
-
- **Rust**: arrow-rs
|
|
260
|
-
|
|
261
|
-
Tables can be serialized in one language and deserialized in another seamlessly.
|
|
262
|
-
|
|
263
378
|
## License
|
|
264
379
|
|
|
265
380
|
MIT
|