databonk 0.0.2 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +116 -111
- package/build/release.d.ts +719 -0
- package/build/release.js +774 -0
- package/build/release.wasm +0 -0
- package/build/release.wasm.map +1 -0
- package/build/release.wat +22633 -0
- package/dist/dataframe.d.ts +82 -0
- package/dist/dataframe.d.ts.map +1 -0
- package/dist/dataframe.js +318 -0
- package/dist/dataframe.js.map +1 -0
- package/dist/index.d.ts +42 -19
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +37 -6166
- package/dist/index.js.map +1 -1
- package/dist/loader.d.ts +86 -0
- package/dist/loader.d.ts.map +1 -0
- package/dist/loader.js +147 -0
- package/dist/loader.js.map +1 -0
- package/dist/shared-memory.d.ts +64 -0
- package/dist/shared-memory.d.ts.map +1 -0
- package/dist/shared-memory.js +113 -0
- package/dist/shared-memory.js.map +1 -0
- package/package.json +30 -56
- package/dist/core/column.d.ts +0 -55
- package/dist/core/column.d.ts.map +0 -1
- package/dist/core/dataframe.d.ts +0 -70
- package/dist/core/dataframe.d.ts.map +0 -1
- package/dist/core/index-cache.d.ts +0 -44
- package/dist/core/index-cache.d.ts.map +0 -1
- package/dist/index.esm.js +0 -6153
- package/dist/index.esm.js.map +0 -1
- package/dist/io/csv.d.ts +0 -23
- package/dist/io/csv.d.ts.map +0 -1
- package/dist/operations/aggregation.d.ts +0 -23
- package/dist/operations/aggregation.d.ts.map +0 -1
- package/dist/operations/derive.d.ts +0 -38
- package/dist/operations/derive.d.ts.map +0 -1
- package/dist/operations/groupby.d.ts +0 -36
- package/dist/operations/groupby.d.ts.map +0 -1
- package/dist/operations/join.d.ts +0 -22
- package/dist/operations/join.d.ts.map +0 -1
- package/dist/operations/reshape.d.ts +0 -17
- package/dist/operations/reshape.d.ts.map +0 -1
- package/dist/utils/aggregation-engine.d.ts +0 -84
- package/dist/utils/aggregation-engine.d.ts.map +0 -1
- package/dist/utils/bitset.d.ts +0 -30
- package/dist/utils/bitset.d.ts.map +0 -1
- package/dist/utils/hash.d.ts +0 -79
- package/dist/utils/hash.d.ts.map +0 -1
- package/dist/utils/performance.d.ts +0 -44
- package/dist/utils/performance.d.ts.map +0 -1
- package/dist/utils/types.d.ts +0 -7
- package/dist/utils/types.d.ts.map +0 -1
- package/dist/validation/schema.d.ts +0 -73
- package/dist/validation/schema.d.ts.map +0 -1
package/README.md
CHANGED
|
@@ -1,161 +1,166 @@
|
|
|
1
|
-
# Databonk
|
|
1
|
+
# Databonk
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
**WASM-powered DataFrame library with SIMD acceleration**
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
Databonk is a high-performance columnar DataFrame library built with AssemblyScript and WebAssembly, featuring SIMD-optimized operations and optional SharedArrayBuffer support for zero-copy data access.
|
|
6
6
|
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
- **
|
|
10
|
-
- **
|
|
11
|
-
- **
|
|
12
|
-
- **
|
|
7
|
+
## Key Features
|
|
8
|
+
|
|
9
|
+
- **12–16x faster** than JavaScript for aggregations (sum, mean, min, max)
|
|
10
|
+
- **SIMD acceleration** with 4-way parallel computation
|
|
11
|
+
- **Zero-copy access** to column data via SharedArrayBuffer
|
|
12
|
+
- **Full TypeScript support** with comprehensive type definitions
|
|
13
|
+
- **Memory efficient** columnar storage design
|
|
14
|
+
- **Fluent API** for method chaining
|
|
13
15
|
|
|
14
16
|
## Installation
|
|
15
17
|
|
|
16
18
|
```bash
|
|
17
|
-
npm install databonk
|
|
19
|
+
npm install databonk
|
|
18
20
|
```
|
|
19
21
|
|
|
20
22
|
## Quick Start
|
|
21
23
|
|
|
22
|
-
```
|
|
23
|
-
import {
|
|
24
|
-
|
|
25
|
-
// Create a DataFrame
|
|
26
|
-
const df = DataFrame.from({
|
|
27
|
-
name: ['Alice', 'Bob', 'Charlie'],
|
|
28
|
-
age: [25, 30, 35],
|
|
29
|
-
city: ['NYC', 'LA', 'Chicago']
|
|
30
|
-
});
|
|
24
|
+
```typescript
|
|
25
|
+
import { loadDatabonk, DatabonkDataFrame } from 'databonk';
|
|
31
26
|
|
|
32
|
-
//
|
|
33
|
-
const
|
|
34
|
-
const avgAge = df.column('age').mean();
|
|
35
|
-
const grouped = df.groupBy(['city']).agg({ avgAge: 'mean' });
|
|
27
|
+
// Load the WASM module
|
|
28
|
+
const module = await loadDatabonk();
|
|
36
29
|
|
|
37
|
-
//
|
|
38
|
-
const
|
|
39
|
-
|
|
40
|
-
|
|
30
|
+
// Create a DataFrame from typed arrays
|
|
31
|
+
const df = await DatabonkDataFrame.fromTypedArrays(module, [
|
|
32
|
+
{ name: 'id', data: new Int32Array([1, 2, 3, 4, 5]) },
|
|
33
|
+
{ name: 'value', data: new Float32Array([10.5, 20.5, 30.5, 40.5, 50.5]) },
|
|
34
|
+
]);
|
|
41
35
|
|
|
42
|
-
|
|
36
|
+
// Aggregations
|
|
37
|
+
console.log('Sum:', df.sum('value')); // 152.5
|
|
38
|
+
console.log('Mean:', df.mean('value')); // 30.5
|
|
39
|
+
console.log('Min:', df.min('value')); // 10.5
|
|
40
|
+
console.log('Max:', df.max('value')); // 50.5
|
|
41
|
+
console.log('Rows:', df.rowCount); // 5
|
|
43
42
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
- **Aggregations**: Sum, mean, count, min, max, std, variance with group-by support
|
|
48
|
-
- **Reshaping**: Pivot, melt, transpose operations for data transformation
|
|
49
|
-
- **Sorting**: Multi-column sorting with custom comparators
|
|
43
|
+
// Clean up when done
|
|
44
|
+
df.free();
|
|
45
|
+
```
|
|
50
46
|
|
|
51
|
-
|
|
52
|
-
- **Built-in Schemas**: Common patterns for users, products, transactions, coordinates
|
|
53
|
-
- **Custom Validation**: Define your own schemas with Zod
|
|
54
|
-
- **Data Cleaning**: Filter valid/invalid rows, transform data types
|
|
55
|
-
- **Error Reporting**: Detailed validation errors with row/column information
|
|
47
|
+
## Performance
|
|
56
48
|
|
|
57
|
-
|
|
58
|
-
- **CSV**: Read/write CSV files with automatic type inference
|
|
59
|
-
- **Apache Arrow**: Optional integration for columnar data exchange
|
|
60
|
-
- **Streaming**: Memory-efficient processing of large datasets
|
|
49
|
+
Benchmarks on 1 million rows (Float32):
|
|
61
50
|
|
|
62
|
-
|
|
51
|
+
| Operation | WASM SIMD | JavaScript | Speedup |
|
|
52
|
+
|-----------|-----------|------------|---------|
|
|
53
|
+
| Sum | ~0.3ms | ~4.2ms | **14x** |
|
|
54
|
+
| Min | ~0.4ms | ~4.8ms | **12x** |
|
|
55
|
+
| Max | ~0.4ms | ~4.8ms | **12x** |
|
|
56
|
+
| Mean | ~0.3ms | ~5.0ms | **16x** |
|
|
63
57
|
|
|
64
|
-
|
|
58
|
+
## API Overview
|
|
65
59
|
|
|
66
|
-
|
|
67
|
-
import { DataFrame, SchemaValidator } from 'databonk';
|
|
68
|
-
import { z } from 'zod';
|
|
60
|
+
### Module Loading
|
|
69
61
|
|
|
70
|
-
|
|
71
|
-
const
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
62
|
+
```typescript
|
|
63
|
+
const module = await loadDatabonk({
|
|
64
|
+
wasmPath: './build/release.wasm', // Optional: custom WASM path
|
|
65
|
+
sharedMemory: true, // Optional: enable SharedArrayBuffer
|
|
66
|
+
initialMemory: 256, // Optional: initial memory in 64 KiB pages (default: 256 pages = 16 MB)
|
|
67
|
+
maximumMemory: 16384, // Optional: maximum memory in 64 KiB pages (default: 16384 pages = 1 GB)
|
|
76
68
|
});
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### DataFrame Creation
|
|
77
72
|
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
{ name: '',
|
|
81
|
-
]
|
|
73
|
+
```typescript
|
|
74
|
+
const df = await DatabonkDataFrame.fromTypedArrays(module, [
|
|
75
|
+
{ name: 'int_col', data: new Int32Array([1, 2, 3]) },
|
|
76
|
+
{ name: 'float_col', data: new Float32Array([1.5, 2.5, 3.5]) },
|
|
77
|
+
{ name: 'double_col', data: new Float64Array([1.1, 2.2, 3.3]) },
|
|
78
|
+
]);
|
|
79
|
+
```
|
|
82
80
|
|
|
83
|
-
|
|
81
|
+
### Aggregations
|
|
84
82
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
83
|
+
```typescript
|
|
84
|
+
df.sum('column'); // Sum of values
|
|
85
|
+
df.mean('column'); // Average
|
|
86
|
+
df.min('column'); // Minimum
|
|
87
|
+
df.max('column'); // Maximum
|
|
88
|
+
df.count('column'); // Count of values
|
|
89
|
+
```
|
|
88
90
|
|
|
89
|
-
|
|
90
|
-
const validUsers = df.filterValid(userSchema);
|
|
91
|
+
### Column Arithmetic
|
|
91
92
|
|
|
92
|
-
|
|
93
|
-
|
|
93
|
+
```typescript
|
|
94
|
+
df.add('a', 'b', 'sum') // sum = a + b
|
|
95
|
+
.sub('a', 'b', 'diff') // diff = a - b
|
|
96
|
+
.scalarMul('a', 2.5, 'scaled'); // scaled = a * 2.5
|
|
94
97
|
```
|
|
95
98
|
|
|
96
|
-
###
|
|
99
|
+
### GroupBy
|
|
97
100
|
|
|
98
|
-
```
|
|
99
|
-
//
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
{ product_id: 2, quantity: 150, region: 'South' }
|
|
103
|
-
]);
|
|
101
|
+
```typescript
|
|
102
|
+
const grouped = df.groupBy('category', 256) // maxKey parameter
|
|
103
|
+
.sum('value'); // or .mean('value')
|
|
104
|
+
```
|
|
104
105
|
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
106
|
+
### Inner Join
|
|
107
|
+
|
|
108
|
+
```typescript
|
|
109
|
+
const result = left.innerJoin(right, 'left_key', 'right_key');
|
|
110
|
+
```
|
|
109
111
|
|
|
110
|
-
|
|
112
|
+
### Zero-Copy Column Access
|
|
111
113
|
|
|
112
|
-
|
|
113
|
-
const
|
|
114
|
-
|
|
115
|
-
.
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
114
|
+
```typescript
|
|
115
|
+
const view = df.getColumnView('value');
|
|
116
|
+
if (view) {
|
|
117
|
+
console.log(view.get(0)); // First value
|
|
118
|
+
console.log([...view]); // Iterate
|
|
119
|
+
console.log(view.toArray()); // Copy to regular array
|
|
120
|
+
}
|
|
121
|
+
```
|
|
119
122
|
|
|
120
|
-
|
|
121
|
-
const withRevenue = joined.withColumn('revenue',
|
|
122
|
-
row => row.quantity * row.price
|
|
123
|
-
);
|
|
123
|
+
### Memory Management
|
|
124
124
|
|
|
125
|
-
|
|
126
|
-
|
|
125
|
+
```typescript
|
|
126
|
+
df.free(); // Always free DataFrames when done
|
|
127
127
|
```
|
|
128
128
|
|
|
129
|
-
##
|
|
129
|
+
## Documentation
|
|
130
130
|
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
make docker-dev
|
|
131
|
+
- [API Reference](./docs/api.md) - Full API documentation
|
|
132
|
+
- [Examples](./docs/examples.md) - Detailed code examples
|
|
134
133
|
|
|
135
|
-
|
|
136
|
-
make docker-test
|
|
134
|
+
## Supported Column Types
|
|
137
135
|
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
136
|
+
| Type | TypedArray | Use Case |
|
|
137
|
+
|------|------------|----------|
|
|
138
|
+
| Int32 | `Int32Array` | Integer keys, IDs, counts |
|
|
139
|
+
| Float32 | `Float32Array` | Standard floating-point values |
|
|
140
|
+
| Float64 | `Float64Array` | High-precision values |
|
|
141
|
+
|
|
142
|
+
## Current Limitations
|
|
143
|
+
|
|
144
|
+
- GroupBy currently supports aggregating only one value column at a time
|
|
145
|
+
- Join keys must be Int32 values
|
|
146
|
+
- String columns are supported for storage but not for operations
|
|
141
147
|
|
|
142
148
|
## Development
|
|
143
149
|
|
|
144
150
|
```bash
|
|
145
|
-
#
|
|
151
|
+
# Install dependencies
|
|
146
152
|
npm install
|
|
147
|
-
|
|
153
|
+
|
|
154
|
+
# Build WASM module
|
|
155
|
+
npm run asbuild
|
|
156
|
+
|
|
157
|
+
# Run tests
|
|
148
158
|
npm test
|
|
149
159
|
|
|
150
|
-
#
|
|
151
|
-
|
|
152
|
-
make dev
|
|
160
|
+
# Run benchmarks
|
|
161
|
+
npm run benchmark
|
|
153
162
|
```
|
|
154
163
|
|
|
155
|
-
##
|
|
164
|
+
## License
|
|
156
165
|
|
|
157
|
-
|
|
158
|
-
- **Memory efficient**: Columnar storage with TypedArrays
|
|
159
|
-
- **Fast operations**: Optimized algorithms for joins, aggregations
|
|
160
|
-
- **Minimal overhead**: Zero-copy operations where possible
|
|
161
|
-
- **Tree-shakeable**: Only import what you use
|
|
166
|
+
MIT
|