@procwire/codec-arrow 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +217 -104
- package/dist/codec.d.ts +654 -0
- package/dist/codec.d.ts.map +1 -0
- package/dist/codec.js +598 -0
- package/dist/codec.js.map +1 -0
- package/dist/index.d.ts +118 -40
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +73 -56
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1,9 +1,27 @@
|
|
|
1
1
|
# @procwire/codec-arrow
|
|
2
2
|
|
|
3
|
-
Apache Arrow serialization codec for `@procwire/transport`.
|
|
3
|
+
High-performance Apache Arrow IPC serialization codec for `@procwire/transport`.
|
|
4
4
|
|
|
5
5
|
Provides efficient columnar data serialization using [apache-arrow](https://github.com/apache/arrow/tree/main/js), ideal for analytical workloads and large datasets.
|
|
6
6
|
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- **Zero-copy serialization** - No unnecessary memory allocation
|
|
10
|
+
- **Configurable IPC format** - Stream (default) or File format
|
|
11
|
+
- **Input validation** - Can be disabled for maximum performance
|
|
12
|
+
- **Metrics collection** - Optional throughput monitoring
|
|
13
|
+
- **Cross-language** - Compatible with PyArrow, Arrow C++, etc.
|
|
14
|
+
- **Type-safe** - Full TypeScript support
|
|
15
|
+
|
|
16
|
+
## Performance
|
|
17
|
+
|
|
18
|
+
| Metric | Value |
|
|
19
|
+
|--------|-------|
|
|
20
|
+
| Throughput | >1M rows/second |
|
|
21
|
+
| Serialization overhead | Near-zero (zero-copy) |
|
|
22
|
+
| Memory overhead | Minimal (reuses buffers) |
|
|
23
|
+
| Stream format overhead | ~100-200 bytes |
|
|
24
|
+
|
|
7
25
|
## Installation
|
|
8
26
|
|
|
9
27
|
```bash
|
|
@@ -12,127 +30,247 @@ npm install @procwire/codec-arrow apache-arrow
|
|
|
12
30
|
|
|
13
31
|
Note: `apache-arrow` is a peer dependency and must be installed separately.
|
|
14
32
|
|
|
15
|
-
##
|
|
33
|
+
## Quick Start
|
|
16
34
|
|
|
17
35
|
### Basic Usage
|
|
18
36
|
|
|
19
37
|
```ts
|
|
20
38
|
import { tableFromArrays } from 'apache-arrow';
|
|
21
39
|
import { ArrowCodec } from '@procwire/codec-arrow';
|
|
22
|
-
import { ChannelBuilder } from '@procwire/transport';
|
|
23
40
|
|
|
24
41
|
const codec = new ArrowCodec();
|
|
25
42
|
|
|
26
|
-
// Create a table
|
|
27
43
|
const table = tableFromArrays({
|
|
28
|
-
id: [1, 2, 3
|
|
29
|
-
name: ['Alice', 'Bob', 'Charlie'
|
|
30
|
-
score: [95.5, 87.3, 92.1
|
|
44
|
+
id: [1, 2, 3],
|
|
45
|
+
name: ['Alice', 'Bob', 'Charlie'],
|
|
46
|
+
score: [95.5, 87.3, 92.1]
|
|
31
47
|
});
|
|
32
48
|
|
|
33
|
-
//
|
|
34
|
-
const
|
|
35
|
-
.withTransport(transport)
|
|
36
|
-
.withFraming(framing)
|
|
37
|
-
.withSerialization(codec)
|
|
38
|
-
.withProtocol(protocol)
|
|
39
|
-
.build();
|
|
49
|
+
// Serialize (zero-copy!)
|
|
50
|
+
const buffer = codec.serialize(table);
|
|
40
51
|
|
|
41
|
-
//
|
|
42
|
-
|
|
52
|
+
// Deserialize
|
|
53
|
+
const decoded = codec.deserialize(buffer);
|
|
54
|
+
console.log(decoded.numRows); // 3
|
|
43
55
|
```
|
|
44
56
|
|
|
45
|
-
###
|
|
57
|
+
### High-Performance Mode
|
|
46
58
|
|
|
47
59
|
```ts
|
|
48
|
-
import {
|
|
49
|
-
import { ArrowCodec } from '@procwire/codec-arrow';
|
|
50
|
-
|
|
51
|
-
const codec = new ArrowCodec();
|
|
60
|
+
import { createFastArrowCodec } from '@procwire/codec-arrow';
|
|
52
61
|
|
|
53
|
-
//
|
|
54
|
-
const
|
|
55
|
-
id: [1, 2, 3],
|
|
56
|
-
value: [10.5, 20.3, 30.1]
|
|
57
|
-
});
|
|
62
|
+
// For trusted environments - validation disabled
|
|
63
|
+
const codec = createFastArrowCodec('stream');
|
|
58
64
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
console.log(decoded.getChild('id')?.toArray()); // [1, 2, 3]
|
|
65
|
+
// Process data at maximum throughput
|
|
66
|
+
for (const table of tables) {
|
|
67
|
+
const buffer = codec.serialize(table);
|
|
68
|
+
channel.send(buffer);
|
|
69
|
+
}
|
|
65
70
|
```
|
|
66
71
|
|
|
67
|
-
###
|
|
72
|
+
### With Metrics
|
|
68
73
|
|
|
69
74
|
```ts
|
|
70
|
-
import {
|
|
71
|
-
import { ArrowCodec } from '@procwire/codec-arrow';
|
|
75
|
+
import { createMonitoredArrowCodec } from '@procwire/codec-arrow';
|
|
72
76
|
|
|
73
|
-
const codec =
|
|
77
|
+
const codec = createMonitoredArrowCodec();
|
|
74
78
|
|
|
75
|
-
//
|
|
76
|
-
const
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
});
|
|
79
|
+
// Process data...
|
|
80
|
+
for (const table of tables) {
|
|
81
|
+
codec.serialize(table);
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
// Check throughput
|
|
85
|
+
const metrics = codec.metrics!;
|
|
86
|
+
console.log(`Processed: ${metrics.rowsSerialized.toLocaleString()} rows`);
|
|
87
|
+
console.log(`Data size: ${(metrics.bytesSerialised / 1024 / 1024).toFixed(2)} MB`);
|
|
88
|
+
console.log(`Errors: ${metrics.serializeErrors}`);
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### File Format (Random Access)
|
|
83
92
|
|
|
84
|
-
|
|
93
|
+
```ts
|
|
94
|
+
import { createFileArrowCodec } from '@procwire/codec-arrow';
|
|
95
|
+
import { writeFileSync } from 'fs';
|
|
96
|
+
|
|
97
|
+
const codec = createFileArrowCodec();
|
|
85
98
|
const buffer = codec.serialize(table);
|
|
86
|
-
console.log(`Serialized ${size} rows in ${buffer.length} bytes`);
|
|
87
99
|
|
|
88
|
-
//
|
|
89
|
-
|
|
90
|
-
console.log(`Deserialized table with ${decoded.numRows} rows`);
|
|
100
|
+
// Write to disk - the Arrow file format supports random access
|
|
101
|
+
writeFileSync('data.arrow', buffer);
|
|
91
102
|
```
|
|
92
103
|
|
|
93
|
-
##
|
|
104
|
+
## API Reference
|
|
94
105
|
|
|
95
|
-
|
|
96
|
-
- **Type Preservation**: Full type system support (integers, floats, strings, booleans, etc.)
|
|
97
|
-
- **Null Handling**: Native support for null values
|
|
98
|
-
- **Zero-Copy**: Efficient memory usage with zero-copy reads where possible
|
|
99
|
-
- **Error Handling**: Wraps encoding/decoding errors in `SerializationError` from `@procwire/transport`
|
|
100
|
-
- **IPC Stream Format**: Uses Arrow IPC streaming format for efficient transmission
|
|
106
|
+
### ArrowCodec
|
|
101
107
|
|
|
102
|
-
|
|
108
|
+
Main codec class implementing `SerializationCodec<Table>`.
|
|
103
109
|
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
110
|
+
```ts
|
|
111
|
+
const codec = new ArrowCodec(options?: ArrowCodecOptions);
|
|
112
|
+
```
|
|
107
113
|
|
|
108
114
|
#### Properties
|
|
109
115
|
|
|
110
|
-
|
|
111
|
-
|
|
116
|
+
| Property | Type | Description |
|
|
117
|
+
|----------|------|-------------|
|
|
118
|
+
| `name` | `"arrow"` | Codec identifier |
|
|
119
|
+
| `contentType` | `string` | MIME type based on format |
|
|
120
|
+
| `metrics` | `ArrowCodecMetrics \| null` | Current metrics or null |
|
|
112
121
|
|
|
113
122
|
#### Methods
|
|
114
123
|
|
|
115
124
|
##### `serialize(value: Table): Buffer`
|
|
116
125
|
|
|
117
|
-
Serializes an Apache Arrow Table to IPC
|
|
126
|
+
Serializes an Apache Arrow Table to IPC format using zero-copy optimization.
|
|
118
127
|
|
|
119
128
|
**Parameters:**
|
|
120
129
|
- `value` - Arrow Table to serialize
|
|
121
130
|
|
|
122
|
-
**Returns:** `Buffer` containing Arrow IPC
|
|
131
|
+
**Returns:** `Buffer` containing Arrow IPC data
|
|
123
132
|
|
|
124
|
-
**Throws:** `SerializationError` if encoding fails
|
|
133
|
+
**Throws:** `SerializationError` if value is not a valid Table or encoding fails
|
|
125
134
|
|
|
126
135
|
##### `deserialize(buffer: Buffer): Table`
|
|
127
136
|
|
|
128
|
-
Deserializes Arrow IPC
|
|
137
|
+
Deserializes Arrow IPC data to an Apache Arrow Table.
|
|
129
138
|
|
|
130
139
|
**Parameters:**
|
|
131
|
-
- `buffer` - Buffer containing Arrow IPC
|
|
140
|
+
- `buffer` - Buffer containing Arrow IPC data
|
|
132
141
|
|
|
133
142
|
**Returns:** Deserialized Arrow Table
|
|
134
143
|
|
|
135
|
-
**Throws:** `SerializationError` if decoding fails
|
|
144
|
+
**Throws:** `SerializationError` if buffer is invalid or decoding fails
|
|
145
|
+
|
|
146
|
+
##### `resetMetrics(): void`
|
|
147
|
+
|
|
148
|
+
Resets all collected metrics to zero. No-op if metrics collection is disabled.
|
|
149
|
+
|
|
150
|
+
### ArrowCodecOptions
|
|
151
|
+
|
|
152
|
+
| Option | Type | Default | Description |
|
|
153
|
+
|--------|------|---------|-------------|
|
|
154
|
+
| `format` | `'stream' \| 'file'` | `'stream'` | IPC format to use |
|
|
155
|
+
| `validateInput` | `boolean` | `true` | Enable input type validation |
|
|
156
|
+
| `collectMetrics` | `boolean` | `false` | Enable metrics collection |
|
|
157
|
+
|
|
158
|
+
### ArrowCodecMetrics
|
|
159
|
+
|
|
160
|
+
Metrics collected when `collectMetrics: true`:
|
|
161
|
+
|
|
162
|
+
| Metric | Type | Description |
|
|
163
|
+
|--------|------|-------------|
|
|
164
|
+
| `serializeCount` | `number` | Successful serialize() calls |
|
|
165
|
+
| `deserializeCount` | `number` | Successful deserialize() calls |
|
|
166
|
+
| `bytesSerialised` | `number` | Total bytes serialized |
|
|
167
|
+
| `bytesDeserialized` | `number` | Total bytes deserialized |
|
|
168
|
+
| `rowsSerialized` | `number` | Total rows serialized |
|
|
169
|
+
| `rowsDeserialized` | `number` | Total rows deserialized |
|
|
170
|
+
| `serializeErrors` | `number` | Failed serialize() calls |
|
|
171
|
+
| `deserializeErrors` | `number` | Failed deserialize() calls |
|
|
172
|
+
|
|
173
|
+
### Helper Functions
|
|
174
|
+
|
|
175
|
+
#### `createFastArrowCodec(format?: ArrowIPCFormat): ArrowCodec`
|
|
176
|
+
|
|
177
|
+
Creates a codec optimized for maximum throughput, with input validation disabled.
|
|
178
|
+
|
|
179
|
+
**Warning:** Only use in trusted environments where input is guaranteed valid.
|
|
180
|
+
|
|
181
|
+
#### `createMonitoredArrowCodec(options?: Omit<ArrowCodecOptions, 'collectMetrics'>): ArrowCodec`
|
|
182
|
+
|
|
183
|
+
Creates a codec with metrics collection enabled.
|
|
184
|
+
|
|
185
|
+
#### `createFileArrowCodec(options?: Omit<ArrowCodecOptions, 'format'>): ArrowCodec`
|
|
186
|
+
|
|
187
|
+
Creates a codec configured for the Arrow IPC file format (supports random access).
|
|
188
|
+
|
|
189
|
+
## Performance Tuning
|
|
190
|
+
|
|
191
|
+
### Maximum Throughput
|
|
192
|
+
|
|
193
|
+
For maximum performance in trusted environments:
|
|
194
|
+
|
|
195
|
+
```ts
|
|
196
|
+
const codec = new ArrowCodec({
|
|
197
|
+
format: 'stream', // Smaller, no footer overhead
|
|
198
|
+
validateInput: false, // Skip type checks
|
|
199
|
+
collectMetrics: false // Skip metric collection
|
|
200
|
+
});
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
Or use the helper:
|
|
204
|
+
|
|
205
|
+
```ts
|
|
206
|
+
const codec = createFastArrowCodec('stream');
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
### Memory Optimization
|
|
210
|
+
|
|
211
|
+
The codec uses zero-copy serialization by wrapping the underlying ArrayBuffer:
|
|
212
|
+
|
|
213
|
+
```ts
|
|
214
|
+
// Internally uses:
|
|
215
|
+
Buffer.from(uint8array.buffer, uint8array.byteOffset, uint8array.byteLength)
|
|
216
|
+
// Instead of:
|
|
217
|
+
Buffer.from(uint8array) // This copies data!
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
This reduces memory allocation by ~50% during serialization.
|
|
221
|
+
|
|
222
|
+
### Format Selection
|
|
223
|
+
|
|
224
|
+
| Use Case | Recommended Format |
|
|
225
|
+
|----------|-------------------|
|
|
226
|
+
| IPC streaming | `'stream'` (default) |
|
|
227
|
+
| Network transfer | `'stream'` |
|
|
228
|
+
| File storage | `'file'` |
|
|
229
|
+
| Random access needed | `'file'` |
|
|
230
|
+
| Smallest size | `'stream'` |
|
|
231
|
+
|
|
232
|
+
## Integration with @procwire/transport
|
|
233
|
+
|
|
234
|
+
```ts
|
|
235
|
+
import { ChannelBuilder } from '@procwire/transport';
|
|
236
|
+
import { ArrowCodec } from '@procwire/codec-arrow';
|
|
237
|
+
|
|
238
|
+
const channel = new ChannelBuilder()
|
|
239
|
+
.withTransport(transport)
|
|
240
|
+
.withFraming(new LengthPrefixedFraming())
|
|
241
|
+
.withSerialization(new ArrowCodec({ validateInput: false }))
|
|
242
|
+
.withProtocol(new JsonRpcProtocol())
|
|
243
|
+
.build();
|
|
244
|
+
|
|
245
|
+
// Send Arrow tables over the channel
|
|
246
|
+
await channel.request('processAnalytics', analyticsTable);
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
## Type System Support
|
|
250
|
+
|
|
251
|
+
The codec provides full TypeScript support:
|
|
252
|
+
|
|
253
|
+
```ts
|
|
254
|
+
import type { Table, Schema, Field, RecordBatch } from '@procwire/codec-arrow';
|
|
255
|
+
import { ArrowCodec, type ArrowCodecOptions, type ArrowCodecMetrics } from '@procwire/codec-arrow';
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
## Error Handling
|
|
259
|
+
|
|
260
|
+
All errors are wrapped in `SerializationError` from `@procwire/transport`:
|
|
261
|
+
|
|
262
|
+
```ts
|
|
263
|
+
import { SerializationError } from '@procwire/transport';
|
|
264
|
+
|
|
265
|
+
try {
|
|
266
|
+
codec.serialize(invalidTable);
|
|
267
|
+
} catch (error) {
|
|
268
|
+
if (error instanceof SerializationError) {
|
|
269
|
+
console.error('Serialization failed:', error.message);
|
|
270
|
+
console.error('Cause:', error.cause);
|
|
271
|
+
}
|
|
272
|
+
}
|
|
273
|
+
```
|
|
136
274
|
|
|
137
275
|
## Advanced Usage
|
|
138
276
|
|
|
@@ -190,30 +328,17 @@ for (let i = 0; i < table.numRows; i++) {
|
|
|
190
328
|
}
|
|
191
329
|
```
|
|
192
330
|
|
|
193
|
-
##
|
|
331
|
+
## Cross-Language Compatibility
|
|
194
332
|
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
- **Columnar Storage**: Data stored in columns, not rows - ideal for analytical queries
|
|
198
|
-
- **Zero-Copy Reads**: Direct memory access without deserialization overhead
|
|
199
|
-
- **Compression**: Built-in dictionary encoding for repeated values
|
|
200
|
-
- **Vectorized Operations**: SIMD-friendly data layout for fast processing
|
|
201
|
-
- **Cross-Language**: Same binary format used in Python, R, Java, C++, etc.
|
|
202
|
-
|
|
203
|
-
### Performance Characteristics
|
|
333
|
+
Arrow IPC format is cross-platform and cross-language:
|
|
204
334
|
|
|
205
|
-
|
|
206
|
-
- **
|
|
207
|
-
- **
|
|
208
|
-
- **
|
|
335
|
+
- **Python**: PyArrow
|
|
336
|
+
- **R**: arrow R package
|
|
337
|
+
- **Java**: Arrow Java
|
|
338
|
+
- **C++**: Arrow C++
|
|
339
|
+
- **Rust**: arrow-rs
|
|
209
340
|
|
|
210
|
-
|
|
211
|
-
- Time-series data
|
|
212
|
-
- Analytics and data science workloads
|
|
213
|
-
- Large datasets (millions of rows)
|
|
214
|
-
- High-throughput data streaming
|
|
215
|
-
- Cross-language data exchange
|
|
216
|
-
- Machine learning pipelines
|
|
341
|
+
Tables serialized in one language can be deserialized in another seamlessly.
|
|
217
342
|
|
|
218
343
|
## Use Cases
|
|
219
344
|
|
|
@@ -221,9 +346,9 @@ Ideal for:
|
|
|
221
346
|
|
|
222
347
|
```ts
|
|
223
348
|
const timeSeries = tableFromArrays({
|
|
224
|
-
timestamp: timestamps,
|
|
225
|
-
value: values,
|
|
226
|
-
quality: qualities
|
|
349
|
+
timestamp: timestamps,
|
|
350
|
+
value: values,
|
|
351
|
+
quality: qualities
|
|
227
352
|
});
|
|
228
353
|
```
|
|
229
354
|
|
|
@@ -244,22 +369,10 @@ const analyticsData = tableFromArrays({
|
|
|
244
369
|
const features = tableFromArrays({
|
|
245
370
|
feature1: feature1Data,
|
|
246
371
|
feature2: feature2Data,
|
|
247
|
-
// ... many features
|
|
248
372
|
label: labels
|
|
249
373
|
});
|
|
250
374
|
```
|
|
251
375
|
|
|
252
|
-
## Compatibility
|
|
253
|
-
|
|
254
|
-
Arrow IPC format is cross-platform and cross-language:
|
|
255
|
-
- **Python**: PyArrow
|
|
256
|
-
- **R**: arrow R package
|
|
257
|
-
- **Java**: Arrow Java
|
|
258
|
-
- **C++**: Arrow C++
|
|
259
|
-
- **Rust**: arrow-rs
|
|
260
|
-
|
|
261
|
-
Tables can be serialized in one language and deserialized in another seamlessly.
|
|
262
|
-
|
|
263
376
|
## License
|
|
264
377
|
|
|
265
378
|
MIT
|