databonk 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +161 -0
- package/dist/core/column.d.ts +25 -0
- package/dist/core/column.d.ts.map +1 -0
- package/dist/core/dataframe.d.ts +30 -0
- package/dist/core/dataframe.d.ts.map +1 -0
- package/dist/index.d.ts +20 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.esm.js +5553 -0
- package/dist/index.esm.js.map +1 -0
- package/dist/index.js +5570 -0
- package/dist/index.js.map +1 -0
- package/dist/io/csv.d.ts +23 -0
- package/dist/io/csv.d.ts.map +1 -0
- package/dist/operations/aggregation.d.ts +23 -0
- package/dist/operations/aggregation.d.ts.map +1 -0
- package/dist/operations/derive.d.ts +38 -0
- package/dist/operations/derive.d.ts.map +1 -0
- package/dist/operations/groupby.d.ts +29 -0
- package/dist/operations/groupby.d.ts.map +1 -0
- package/dist/operations/join.d.ts +19 -0
- package/dist/operations/join.d.ts.map +1 -0
- package/dist/operations/reshape.d.ts +17 -0
- package/dist/operations/reshape.d.ts.map +1 -0
- package/dist/utils/bitset.d.ts +12 -0
- package/dist/utils/bitset.d.ts.map +1 -0
- package/dist/utils/performance.d.ts +44 -0
- package/dist/utils/performance.d.ts.map +1 -0
- package/dist/utils/types.d.ts +7 -0
- package/dist/utils/types.d.ts.map +1 -0
- package/dist/validation/schema.d.ts +73 -0
- package/dist/validation/schema.d.ts.map +1 -0
- package/package.json +77 -0
package/README.md
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
# Databonk.js
|
|
2
|
+
|
|
3
|
+
A lightweight, fast data frame library for JavaScript and TypeScript with built-in schema validation.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Lightweight**: Minimal dependencies, tree-shakeable modules
|
|
8
|
+
- **Fast**: Columnar storage using TypedArrays for performance
|
|
9
|
+
- **Simple**: Clean API for common data operations
|
|
10
|
+
- **Flexible**: Works with regular arrays, TypedArrays, or Apache Arrow
|
|
11
|
+
- **Schema Validation**: Built-in Zod integration for data validation
|
|
12
|
+
- **Type Safe**: Full TypeScript support with inferred types
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
npm install databonk zod
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Quick Start
|
|
21
|
+
|
|
22
|
+
```javascript
|
|
23
|
+
import { DataFrame, SchemaValidator, CommonSchemas } from 'databonk';
|
|
24
|
+
|
|
25
|
+
// Create a DataFrame
|
|
26
|
+
const df = DataFrame.from({
|
|
27
|
+
name: ['Alice', 'Bob', 'Charlie'],
|
|
28
|
+
age: [25, 30, 35],
|
|
29
|
+
city: ['NYC', 'LA', 'Chicago']
|
|
30
|
+
});
|
|
31
|
+
|
|
32
|
+
// Basic operations
|
|
33
|
+
const adults = df.filter(row => row.age >= 30);
|
|
34
|
+
const avgAge = df.column('age').mean();
|
|
35
|
+
const grouped = df.groupBy(['city']).agg({ age: 'mean' });
|
|
36
|
+
|
|
37
|
+
// Schema validation
|
|
38
|
+
const result = df.validate(CommonSchemas.person);
|
|
39
|
+
console.log(`Valid rows: ${result.validRows}/${result.totalRows}`);
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Core Features
|
|
43
|
+
|
|
44
|
+
### Data Operations
|
|
45
|
+
- **Filtering & Selection**: Powerful row/column filtering with predicate functions
|
|
46
|
+
- **Joins**: Inner, left, right, and outer joins with multiple keys
|
|
47
|
+
- **Aggregations**: Sum, mean, count, min, max, std, variance with group-by support
|
|
48
|
+
- **Reshaping**: Pivot, melt, transpose operations for data transformation
|
|
49
|
+
- **Sorting**: Multi-column sorting with custom comparators
|
|
50
|
+
|
|
51
|
+
### Schema Validation
|
|
52
|
+
- **Built-in Schemas**: Common patterns for users, products, transactions, coordinates
|
|
53
|
+
- **Custom Validation**: Define your own schemas with Zod
|
|
54
|
+
- **Data Cleaning**: Filter valid/invalid rows, transform data types
|
|
55
|
+
- **Error Reporting**: Detailed validation errors with row/column information
|
|
56
|
+
|
|
57
|
+
### I/O Support
|
|
58
|
+
- **CSV**: Read/write CSV files with automatic type inference
|
|
59
|
+
- **Apache Arrow**: Optional integration for columnar data exchange
|
|
60
|
+
- **Streaming**: Memory-efficient processing of large datasets
|
|
61
|
+
|
|
62
|
+
## Examples
|
|
63
|
+
|
|
64
|
+
### Schema Validation
|
|
65
|
+
|
|
66
|
+
```javascript
|
|
67
|
+
import { DataFrame, SchemaValidator } from 'databonk';
|
|
68
|
+
import { z } from 'zod';
|
|
69
|
+
|
|
70
|
+
// Define a custom schema
|
|
71
|
+
const userSchema = SchemaValidator.define({
|
|
72
|
+
name: z.string().min(1),
|
|
73
|
+
age: z.number().int().min(0).max(150),
|
|
74
|
+
email: z.string().email(),
|
|
75
|
+
role: z.enum(['admin', 'user', 'guest'])
|
|
76
|
+
});
|
|
77
|
+
|
|
78
|
+
const userData = [
|
|
79
|
+
{ name: 'Alice', age: 25, email: 'alice@example.com', role: 'admin' },
|
|
80
|
+
{ name: '', age: -5, email: 'invalid', role: 'unknown' } // Invalid
|
|
81
|
+
];
|
|
82
|
+
|
|
83
|
+
const df = DataFrame.fromRows(userData);
|
|
84
|
+
|
|
85
|
+
// Validate data
|
|
86
|
+
const validation = df.validate(userSchema);
|
|
87
|
+
console.log(`Errors: ${validation.errors.length}`);
|
|
88
|
+
|
|
89
|
+
// Filter valid rows
|
|
90
|
+
const validUsers = df.filterValid(userSchema);
|
|
91
|
+
|
|
92
|
+
// Transform data with type coercion
|
|
93
|
+
const cleanData = df.validateAndTransform(userSchema);
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Advanced Data Operations
|
|
97
|
+
|
|
98
|
+
```javascript
|
|
99
|
+
// Join operations
|
|
100
|
+
const sales = DataFrame.fromRows([
|
|
101
|
+
{ product_id: 1, quantity: 100, region: 'North' },
|
|
102
|
+
{ product_id: 2, quantity: 150, region: 'South' }
|
|
103
|
+
]);
|
|
104
|
+
|
|
105
|
+
const products = DataFrame.fromRows([
|
|
106
|
+
{ product_id: 1, name: 'Widget', price: 10.99 },
|
|
107
|
+
{ product_id: 2, name: 'Gadget', price: 15.99 }
|
|
108
|
+
]);
|
|
109
|
+
|
|
110
|
+
const joined = sales.join(products, 'product_id', 'inner');
|
|
111
|
+
|
|
112
|
+
// Group by with multiple aggregations
|
|
113
|
+
const summary = joined
|
|
114
|
+
.groupBy(['region'])
|
|
115
|
+
.agg({
|
|
116
|
+
quantity: ['sum', 'mean'],
|
|
117
|
+
price: 'mean'
|
|
118
|
+
});
|
|
119
|
+
|
|
120
|
+
// Add calculated columns
|
|
121
|
+
const withRevenue = joined.withColumn('revenue',
|
|
122
|
+
row => row.quantity * row.price
|
|
123
|
+
);
|
|
124
|
+
|
|
125
|
+
// Pivot tables
|
|
126
|
+
const pivot = sales.pivot(['region'], 'product_id', 'quantity', 'sum');
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## Docker Development
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
# Build and start development environment
|
|
133
|
+
make docker-dev
|
|
134
|
+
|
|
135
|
+
# Run tests in Docker
|
|
136
|
+
make docker-test
|
|
137
|
+
|
|
138
|
+
# Open shell in container
|
|
139
|
+
make docker-shell
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## Development
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
# Local development
|
|
146
|
+
npm install
|
|
147
|
+
npm run build
|
|
148
|
+
npm test
|
|
149
|
+
|
|
150
|
+
# With Docker
|
|
151
|
+
make setup
|
|
152
|
+
make dev
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
## Performance
|
|
156
|
+
|
|
157
|
+
Databonk.js is designed for small to medium datasets (up to ~1M rows) with:
|
|
158
|
+
- **Memory efficient**: Columnar storage with TypedArrays
|
|
159
|
+
- **Fast operations**: Optimized algorithms for joins, aggregations
|
|
160
|
+
- **Minimal overhead**: Zero-copy operations where possible
|
|
161
|
+
- **Tree-shakeable**: Only import what you use
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import { DataType } from '../utils/types.js';
|
|
2
|
+
/**
 * Type declaration for a single named column holding values of type `T`.
 *
 * Element accessors are typed `T | null`, i.e. an entry may be null.
 * This is a declaration only; behavior beyond the signatures below is not
 * visible here.
 */
export declare class Column<T = any> {
|
|
3
|
+
/** Column name (read-only). */
readonly name: string;
|
|
4
|
+
/** Declared `DataType` of the column (read-only). */
readonly dataType: DataType;
|
|
5
|
+
// Private storage; its concrete type is erased in this declaration.
private data;
|
|
6
|
+
// NOTE(review): presumably records which entries are null (cf. isNull) — confirm in the implementation.
private nullBitmap;
|
|
7
|
+
/** Length of the column (read-only). */
readonly length: number;
|
|
8
|
+
/**
 * @param name - column name
 * @param values - values for the column
 * @param dataType - optional explicit `DataType`; NOTE(review): behavior when omitted is not visible here
 */
constructor(name: string, values: T[], dataType?: DataType);
|
|
9
|
+
// Private helper; signature erased in this declaration.
private createDataArray;
|
|
10
|
+
/** Returns the value at `index`, typed `T | null`. */
get(index: number): T | null;
|
|
11
|
+
/** Returns a boolean for the entry at `index`; by its name, whether that entry is null. */
isNull(index: number): boolean;
|
|
12
|
+
/** Returns a `Column<T>` for the optional `start`/`end` range. NOTE(review): bound semantics not visible here. */
slice(start?: number, end?: number): Column<T>;
|
|
13
|
+
/** Returns a `Column<T>`; `predicate` receives each value (possibly null) and its index. */
filter(predicate: (value: T | null, index: number) => boolean): Column<T>;
|
|
14
|
+
/**
 * Returns a `Column<U>` built from `fn`, which receives each value (possibly null) and its index.
 * `newDataType` optionally supplies the `DataType` of the result.
 */
map<U>(fn: (value: T | null, index: number) => U, newDataType?: DataType): Column<U>;
|
|
15
|
+
// Aggregates below each return a number.
// NOTE(review): handling of null entries and non-numeric columns is not visible in this declaration.
sum(): number;
|
|
16
|
+
mean(): number;
|
|
17
|
+
min(): number;
|
|
18
|
+
max(): number;
|
|
19
|
+
// NOTE(review): whether null entries are included in the count is not visible here.
count(): number;
|
|
20
|
+
/** Returns an array of `T`; by its name, the distinct values. */
unique(): T[];
|
|
21
|
+
/** Returns an iterator over the values, each typed `T | null`. */
values(): Iterator<T | null>;
|
|
22
|
+
/** Returns the values as a plain array of `T | null`. */
toArray(): (T | null)[];
|
|
23
|
+
/** Static factory taking the same parameters as the constructor. */
static from<T>(name: string, values: T[], dataType?: DataType): Column<T>;
|
|
24
|
+
}
|
|
25
|
+
//# sourceMappingURL=column.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"column.d.ts","sourceRoot":"","sources":["../../src/core/column.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAA6D,MAAM,mBAAmB,CAAC;AAGxG,qBAAa,MAAM,CAAC,CAAC,GAAG,GAAG;IACzB,SAAgB,IAAI,EAAE,MAAM,CAAC;IAC7B,SAAgB,QAAQ,EAAE,QAAQ,CAAC;IACnC,OAAO,CAAC,IAAI,CAA6B;IACzC,OAAO,CAAC,UAAU,CAAS;IAC3B,SAAgB,MAAM,EAAE,MAAM,CAAC;gBAEnB,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC,EAAE,EAAE,QAAQ,CAAC,EAAE,QAAQ;IAS1D,OAAO,CAAC,eAAe;IAsBvB,GAAG,CAAC,KAAK,EAAE,MAAM,GAAG,CAAC,GAAG,IAAI;IAY5B,MAAM,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO;IAI9B,KAAK,CAAC,KAAK,CAAC,EAAE,MAAM,EAAE,GAAG,CAAC,EAAE,MAAM,GAAG,MAAM,CAAC,CAAC,CAAC;IAY9C,MAAM,CAAC,SAAS,EAAE,CAAC,KAAK,EAAE,CAAC,GAAG,IAAI,EAAE,KAAK,EAAE,MAAM,KAAK,OAAO,GAAG,MAAM,CAAC,CAAC,CAAC;IAazE,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,KAAK,EAAE,CAAC,GAAG,IAAI,EAAE,KAAK,EAAE,MAAM,KAAK,CAAC,EAAE,WAAW,CAAC,EAAE,QAAQ,GAAG,MAAM,CAAC,CAAC,CAAC;IAWpF,GAAG,IAAI,MAAM;IAeb,IAAI,IAAI,MAAM;IASd,GAAG,IAAI,MAAM;IAqBb,GAAG,IAAI,MAAM;IAqBb,KAAK,IAAI,MAAM;IAIf,MAAM,IAAI,CAAC,EAAE;IAYZ,MAAM,IAAI,QAAQ,CAAC,CAAC,GAAG,IAAI,CAAC;IAM7B,OAAO,IAAI,CAAC,CAAC,GAAG,IAAI,CAAC,EAAE;IAQvB,MAAM,CAAC,IAAI,CAAC,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC,EAAE,EAAE,QAAQ,CAAC,EAAE,QAAQ,GAAG,MAAM,CAAC,CAAC,CAAC;CAG1E"}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import { Column } from './column';
|
|
2
|
+
export type RowObject = Record<string, any>;
|
|
3
|
+
/**
 * Type declaration for a table made of `Column` objects.
 *
 * Every method that is typed to return `DataFrame` yields a `DataFrame` value.
 * NOTE(review): whether that is a new instance or the receiver is not visible
 * in this declaration.
 * NOTE(review): methods used in the package README such as `groupBy`, `join`,
 * `withColumn`, `pivot` and `validate` are not declared on this class;
 * presumably they are added by the operation/validation modules — confirm.
 */
export declare class DataFrame {
|
|
4
|
+
// Private column storage; its concrete type is erased in this declaration.
private columns;
|
|
5
|
+
/** Length of the frame (read-only). NOTE(review): presumably the row count — confirm. */
readonly length: number;
|
|
6
|
+
/** Accepts either a name-to-`Column` record or an array of `Column` objects. */
constructor(data: Record<string, Column> | Column[]);
|
|
7
|
+
/** Column names as a string array. */
get columnNames(): string[];
|
|
8
|
+
/** Number of columns. */
get columnCount(): number;
|
|
9
|
+
/** Returns the `Column` for `name`. NOTE(review): behavior for an unknown name is not visible here. */
column(name: string): Column;
|
|
10
|
+
/** Returns a boolean for `name`; by its name, whether such a column exists. */
hasColumn(name: string): boolean;
|
|
11
|
+
/** Takes a `Column` and returns a `DataFrame`. */
addColumn(column: Column): DataFrame;
|
|
12
|
+
/** Takes a column name and returns a `DataFrame`. */
removeColumn(name: string): DataFrame;
|
|
13
|
+
/** Takes a list of column names and returns a `DataFrame`. */
select(columns: string[]): DataFrame;
|
|
14
|
+
/** Returns a `DataFrame`; `predicate` receives each row as a `RowObject` plus its index. */
filter(predicate: (row: RowObject, index: number) => boolean): DataFrame;
|
|
15
|
+
/** Returns a `DataFrame` for the optional `start`/`end` range. NOTE(review): bound semantics not visible here. */
slice(start?: number, end?: number): DataFrame;
|
|
16
|
+
/** Takes an array of numeric indices and returns a `DataFrame`. */
selectRows(indices: number[]): DataFrame;
|
|
17
|
+
/** Returns the row at `index` as a `RowObject`. */
getRow(index: number): RowObject;
|
|
18
|
+
/** Returns an iterable iterator of `RowObject` rows. */
rows(): IterableIterator<RowObject>;
|
|
19
|
+
// NOTE(review): `n` is optional for head/tail; its default is not visible in this declaration.
head(n?: number): DataFrame;
|
|
20
|
+
tail(n?: number): DataFrame;
|
|
21
|
+
/** Sorts by a single column name. NOTE(review): the default for `ascending` is not visible here. */
sort(columnName: string, ascending?: boolean): DataFrame;
|
|
22
|
+
/** Takes a list of column names and returns a `DataFrame`. */
drop(columns: string[]): DataFrame;
|
|
23
|
+
/** Takes a string-to-string mapping. NOTE(review): presumably old name to new name — confirm. */
rename(columnMapping: Record<string, string>): DataFrame;
|
|
24
|
+
/** Returns the data as an array of `RowObject`. */
toArray(): RowObject[];
|
|
25
|
+
/** Returns the data as a record of string keys to value arrays. */
toColumns(): Record<string, any[]>;
|
|
26
|
+
/** Static factory accepting either an array of row objects or a record of value arrays. */
static from(data: RowObject[] | Record<string, any[]>): DataFrame;
|
|
27
|
+
/** Static factory accepting an array of row objects. */
static fromRows(rows: RowObject[]): DataFrame;
|
|
28
|
+
/** Static factory accepting a record of string keys to value arrays. */
static fromColumns(data: Record<string, any[]>): DataFrame;
|
|
29
|
+
}
|
|
30
|
+
//# sourceMappingURL=dataframe.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"dataframe.d.ts","sourceRoot":"","sources":["../../src/core/dataframe.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,MAAM,UAAU,CAAC;AAGlC,MAAM,MAAM,SAAS,GAAG,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;AAE5C,qBAAa,SAAS;IACpB,OAAO,CAAC,OAAO,CAAkC;IACjD,SAAgB,MAAM,EAAE,MAAM,CAAC;gBAEnB,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GAAG,MAAM,EAAE;IAiBnD,IAAI,WAAW,IAAI,MAAM,EAAE,CAE1B;IAED,IAAI,WAAW,IAAI,MAAM,CAExB;IAED,MAAM,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM;IAQ5B,SAAS,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO;IAIhC,SAAS,CAAC,MAAM,EAAE,MAAM,GAAG,SAAS;IAUpC,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,SAAS;IAUrC,MAAM,CAAC,OAAO,EAAE,MAAM,EAAE,GAAG,SAAS;IAapC,MAAM,CAAC,SAAS,EAAE,CAAC,GAAG,EAAE,SAAS,EAAE,KAAK,EAAE,MAAM,KAAK,OAAO,GAAG,SAAS;IAaxE,KAAK,CAAC,KAAK,CAAC,EAAE,MAAM,EAAE,GAAG,CAAC,EAAE,MAAM,GAAG,SAAS;IAa9C,UAAU,CAAC,OAAO,EAAE,MAAM,EAAE,GAAG,SAAS;IAWxC,MAAM,CAAC,KAAK,EAAE,MAAM,GAAG,SAAS;IAa/B,IAAI,IAAI,gBAAgB,CAAC,SAAS,CAAC;IAMpC,IAAI,CAAC,CAAC,GAAE,MAAU,GAAG,SAAS;IAI9B,IAAI,CAAC,CAAC,GAAE,MAAU,GAAG,SAAS;IAI9B,IAAI,CAAC,UAAU,EAAE,MAAM,EAAE,SAAS,GAAE,OAAc,GAAG,SAAS;IA6B9D,IAAI,CAAC,OAAO,EAAE,MAAM,EAAE,GAAG,SAAS;IAYlC,MAAM,CAAC,aAAa,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GAAG,SAAS;IAWxD,OAAO,IAAI,SAAS,EAAE;IAQtB,SAAS,IAAI,MAAM,CAAC,MAAM,EAAE,GAAG,EAAE,CAAC;IAQlC,MAAM,CAAC,IAAI,CAAC,IAAI,EAAE,SAAS,EAAE,GAAG,MAAM,CAAC,MAAM,EAAE,GAAG,EAAE,CAAC,GAAG,SAAS;IAQjE,MAAM,CAAC,QAAQ,CAAC,IAAI,EAAE,SAAS,EAAE,GAAG,SAAS;IAgB7C,MAAM,CAAC,WAAW,CAAC,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,EAAE,CAAC,GAAG,SAAS;CAS3D"}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
export { DataFrame } from './core/dataframe.js';
|
|
2
|
+
export { Column } from './core/column.js';
|
|
3
|
+
export { GroupBy } from './operations/groupby.js';
|
|
4
|
+
export { Joiner, JoinType } from './operations/join.js';
|
|
5
|
+
export { Reshaper } from './operations/reshape.js';
|
|
6
|
+
export { ColumnDeriver, ColumnExpression } from './operations/derive.js';
|
|
7
|
+
export { Aggregator, AggregateFunction, AggregateSpec } from './operations/aggregation.js';
|
|
8
|
+
export { CsvReader, CsvWriter, CsvOptions } from './io/csv.js';
|
|
9
|
+
export { SchemaValidator, SchemaBuilders, CommonSchemas, ValidationResult, ValidationError, ValidationOptions, DataFrameSchema, SchemaDefinition } from './validation/schema.js';
|
|
10
|
+
export { DataType } from './utils/types.js';
|
|
11
|
+
export { BitSet } from './utils/bitset.js';
|
|
12
|
+
export { PerformanceTimer, DataGenerator, MemoryProfiler } from './utils/performance.js';
|
|
13
|
+
import './operations/aggregation.js';
|
|
14
|
+
import './operations/groupby.js';
|
|
15
|
+
import './operations/join.js';
|
|
16
|
+
import './operations/reshape.js';
|
|
17
|
+
import './operations/derive.js';
|
|
18
|
+
import './io/csv.js';
|
|
19
|
+
import './validation/schema.js';
|
|
20
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAC;AAChD,OAAO,EAAE,MAAM,EAAE,MAAM,kBAAkB,CAAC;AAC1C,OAAO,EAAE,OAAO,EAAE,MAAM,yBAAyB,CAAC;AAClD,OAAO,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,sBAAsB,CAAC;AACxD,OAAO,EAAE,QAAQ,EAAE,MAAM,yBAAyB,CAAC;AACnD,OAAO,EAAE,aAAa,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AACzE,OAAO,EAAE,UAAU,EAAE,iBAAiB,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAC3F,OAAO,EAAE,SAAS,EAAE,SAAS,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAC/D,OAAO,EACL,eAAe,EACf,cAAc,EACd,aAAa,EACb,gBAAgB,EAChB,eAAe,EACf,iBAAiB,EACjB,eAAe,EACf,gBAAgB,EACjB,MAAM,wBAAwB,CAAC;AAChC,OAAO,EAAE,QAAQ,EAAE,MAAM,kBAAkB,CAAC;AAC5C,OAAO,EAAE,MAAM,EAAE,MAAM,mBAAmB,CAAC;AAC3C,OAAO,EAAE,gBAAgB,EAAE,aAAa,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC;AAEzF,OAAO,6BAA6B,CAAC;AACrC,OAAO,yBAAyB,CAAC;AACjC,OAAO,sBAAsB,CAAC;AAC9B,OAAO,yBAAyB,CAAC;AACjC,OAAO,wBAAwB,CAAC;AAChC,OAAO,aAAa,CAAC;AACrB,OAAO,wBAAwB,CAAC"}
|