databonk 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,161 @@
1
+ # Databonk.js
2
+
3
+ A lightweight, fast data frame library for JavaScript and TypeScript with built-in schema validation.
4
+
5
+ ## Features
6
+
7
+ - **Lightweight**: Minimal dependencies, tree-shakeable modules
8
+ - **Fast**: Columnar storage using TypedArrays for performance
9
+ - **Simple**: Clean API for common data operations
10
+ - **Flexible**: Works with regular arrays, TypedArrays, or Apache Arrow
11
+ - **Schema Validation**: Built-in Zod integration for data validation
12
+ - **Type Safe**: Full TypeScript support with inferred types
13
+
14
+ ## Installation
15
+
16
+ ```bash
17
+ npm install databonk zod
18
+ ```
19
+
20
+ ## Quick Start
21
+
22
+ ```javascript
23
+ import { DataFrame, SchemaValidator, CommonSchemas } from 'databonk';
24
+
25
+ // Create a DataFrame
26
+ const df = DataFrame.from({
27
+ name: ['Alice', 'Bob', 'Charlie'],
28
+ age: [25, 30, 35],
29
+ city: ['NYC', 'LA', 'Chicago']
30
+ });
31
+
32
+ // Basic operations
33
+ const adults = df.filter(row => row.age >= 30);
34
+ const avgAge = df.column('age').mean();
35
+ const grouped = df.groupBy(['city']).agg({ age: 'mean' });
36
+
37
+ // Schema validation
38
+ const result = df.validate(CommonSchemas.person);
39
+ console.log(`Valid rows: ${result.validRows}/${result.totalRows}`);
40
+ ```
41
+
42
+ ## Core Features
43
+
44
+ ### Data Operations
45
+ - **Filtering & Selection**: Powerful row/column filtering with predicate functions
46
+ - **Joins**: Inner, left, right, and outer joins with multiple keys
47
+ - **Aggregations**: Sum, mean, count, min, max, std, variance with group-by support
48
+ - **Reshaping**: Pivot, melt, transpose operations for data transformation
49
+ - **Sorting**: Multi-column sorting with custom comparators
50
+
51
+ ### Schema Validation
52
+ - **Built-in Schemas**: Common patterns for users, products, transactions, coordinates
53
+ - **Custom Validation**: Define your own schemas with Zod
54
+ - **Data Cleaning**: Filter valid/invalid rows, transform data types
55
+ - **Error Reporting**: Detailed validation errors with row/column information
56
+
57
+ ### I/O Support
58
+ - **CSV**: Read/write CSV files with automatic type inference
59
+ - **Apache Arrow**: Optional integration for columnar data exchange
60
+ - **Streaming**: Memory-efficient processing of large datasets
61
+
62
+ ## Examples
63
+
64
+ ### Schema Validation
65
+
66
+ ```javascript
67
+ import { DataFrame, SchemaValidator } from 'databonk';
68
+ import { z } from 'zod';
69
+
70
+ // Define a custom schema
71
+ const userSchema = SchemaValidator.define({
72
+ name: z.string().min(1),
73
+ age: z.number().int().min(0).max(150),
74
+ email: z.string().email(),
75
+ role: z.enum(['admin', 'user', 'guest'])
76
+ });
77
+
78
+ const userData = [
79
+ { name: 'Alice', age: 25, email: 'alice@example.com', role: 'admin' },
80
+ { name: '', age: -5, email: 'invalid', role: 'unknown' } // Invalid: empty name, negative age, malformed email, role not in the enum
81
+ ];
82
+
83
+ const df = DataFrame.fromRows(userData);
84
+
85
+ // Validate data
86
+ const validation = df.validate(userSchema);
87
+ console.log(`Errors: ${validation.errors.length}`);
88
+
89
+ // Filter valid rows
90
+ const validUsers = df.filterValid(userSchema);
91
+
92
+ // Transform data with type coercion
93
+ const cleanData = df.validateAndTransform(userSchema);
94
+ ```
95
+
96
+ ### Advanced Data Operations
97
+
98
+ ```javascript
99
+ // Join operations
100
+ const sales = DataFrame.fromRows([
101
+ { product_id: 1, quantity: 100, region: 'North' },
102
+ { product_id: 2, quantity: 150, region: 'South' }
103
+ ]);
104
+
105
+ const products = DataFrame.fromRows([
106
+ { product_id: 1, name: 'Widget', price: 10.99 },
107
+ { product_id: 2, name: 'Gadget', price: 15.99 }
108
+ ]);
109
+
110
+ const joined = sales.join(products, 'product_id', 'inner');
111
+
112
+ // Group by with multiple aggregations
113
+ const summary = joined
114
+ .groupBy(['region'])
115
+ .agg({
116
+ quantity: ['sum', 'mean'],
117
+ price: 'mean'
118
+ });
119
+
120
+ // Add calculated columns
121
+ const withRevenue = joined.withColumn('revenue',
122
+ row => row.quantity * row.price
123
+ );
124
+
125
+ // Pivot tables
126
+ const pivot = sales.pivot(['region'], 'product_id', 'quantity', 'sum');
127
+ ```
128
+
129
+ ## Docker Development
130
+
131
+ ```bash
132
+ # Build and start development environment
133
+ make docker-dev
134
+
135
+ # Run tests in Docker
136
+ make docker-test
137
+
138
+ # Open shell in container
139
+ make docker-shell
140
+ ```
141
+
142
+ ## Development
143
+
144
+ ```bash
145
+ # Local development
146
+ npm install
147
+ npm run build
148
+ npm test
149
+
150
+ # With Docker
151
+ make setup
152
+ make dev
153
+ ```
154
+
155
+ ## Performance
156
+
157
+ Databonk.js is designed for small to medium datasets (up to ~1M rows). Its design priorities are:
158
+ - **Memory efficient**: Columnar storage with TypedArrays
159
+ - **Fast operations**: Optimized algorithms for joins, aggregations
160
+ - **Minimal overhead**: Zero-copy operations where possible
161
+ - **Tree-shakeable**: Only import what you use
@@ -0,0 +1,25 @@
1
+ import { DataType } from '../utils/types.js';
2
+ export declare class Column<T = any> { // Ambient declaration of a named, generic column; T defaults to `any` when not supplied.
3
+ readonly name: string; // Column name; cannot be reassigned after construction (readonly).
4
+ readonly dataType: DataType; // DataType tag (type imported from ../utils/types.js); readonly.
5
+ private data; // Private backing storage; its type is not exposed in this declaration.
6
+ private nullBitmap; // Private; presumably records which slots are null — confirm against src/core/column.ts.
7
+ readonly length: number; // Readonly numeric length; presumably the number of values held — confirm.
8
+ constructor(name: string, values: T[], dataType?: DataType); // dataType is optional; how it is chosen when omitted is not visible here.
9
+ private createDataArray; // Private helper; signature is not exposed in this declaration.
10
+ get(index: number): T | null; // Value at `index`, typed as T or null.
11
+ isNull(index: number): boolean; // Boolean null check for the slot at `index`.
12
+ slice(start?: number, end?: number): Column<T>; // Both bounds optional; returns a Column of the same element type.
13
+ filter(predicate: (value: T | null, index: number) => boolean): Column<T>; // Predicate receives each value (possibly null) and its index.
14
+ map<U>(fn: (value: T | null, index: number) => U, newDataType?: DataType): Column<U>; // Produces a Column<U>; an optional DataType can be passed for the result.
15
+ sum(): number; // Numeric aggregate; treatment of nulls and non-numeric T is not visible here.
16
+ mean(): number; // Numeric aggregate; treatment of nulls and empty columns is not visible here.
17
+ min(): number; // Numeric aggregate; treatment of nulls and empty columns is not visible here.
18
+ max(): number; // Numeric aggregate; treatment of nulls and empty columns is not visible here.
19
+ count(): number; // Returns a number; whether nulls are counted is not visible here — confirm.
20
+ unique(): T[]; // Returns T[]; note the element type excludes null, unlike toArray().
21
+ values(): Iterator<T | null>; // NOTE(review): typed as Iterator (not IterableIterator like DataFrame.rows()), so `for...of` over the result will not type-check — confirm intent.
22
+ toArray(): (T | null)[]; // Plain array copy of the values, with null allowed per element.
23
+ static from<T>(name: string, values: T[], dataType?: DataType): Column<T>; // Static factory taking the same parameters as the constructor.
24
+ }
25
+ //# sourceMappingURL=column.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"column.d.ts","sourceRoot":"","sources":["../../src/core/column.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAA6D,MAAM,mBAAmB,CAAC;AAGxG,qBAAa,MAAM,CAAC,CAAC,GAAG,GAAG;IACzB,SAAgB,IAAI,EAAE,MAAM,CAAC;IAC7B,SAAgB,QAAQ,EAAE,QAAQ,CAAC;IACnC,OAAO,CAAC,IAAI,CAA6B;IACzC,OAAO,CAAC,UAAU,CAAS;IAC3B,SAAgB,MAAM,EAAE,MAAM,CAAC;gBAEnB,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC,EAAE,EAAE,QAAQ,CAAC,EAAE,QAAQ;IAS1D,OAAO,CAAC,eAAe;IAsBvB,GAAG,CAAC,KAAK,EAAE,MAAM,GAAG,CAAC,GAAG,IAAI;IAY5B,MAAM,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO;IAI9B,KAAK,CAAC,KAAK,CAAC,EAAE,MAAM,EAAE,GAAG,CAAC,EAAE,MAAM,GAAG,MAAM,CAAC,CAAC,CAAC;IAY9C,MAAM,CAAC,SAAS,EAAE,CAAC,KAAK,EAAE,CAAC,GAAG,IAAI,EAAE,KAAK,EAAE,MAAM,KAAK,OAAO,GAAG,MAAM,CAAC,CAAC,CAAC;IAazE,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,KAAK,EAAE,CAAC,GAAG,IAAI,EAAE,KAAK,EAAE,MAAM,KAAK,CAAC,EAAE,WAAW,CAAC,EAAE,QAAQ,GAAG,MAAM,CAAC,CAAC,CAAC;IAWpF,GAAG,IAAI,MAAM;IAeb,IAAI,IAAI,MAAM;IASd,GAAG,IAAI,MAAM;IAqBb,GAAG,IAAI,MAAM;IAqBb,KAAK,IAAI,MAAM;IAIf,MAAM,IAAI,CAAC,EAAE;IAYZ,MAAM,IAAI,QAAQ,CAAC,CAAC,GAAG,IAAI,CAAC;IAM7B,OAAO,IAAI,CAAC,CAAC,GAAG,IAAI,CAAC,EAAE;IAQvB,MAAM,CAAC,IAAI,CAAC,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC,EAAE,EAAE,QAAQ,CAAC,EAAE,QAAQ,GAAG,MAAM,CAAC,CAAC,CAAC;CAG1E"}
@@ -0,0 +1,30 @@
1
+ import { Column } from './column'; // NOTE(review): no ".js" extension, unlike the other relative imports in these declaration files (e.g. '../utils/types.js') — may not resolve under Node16/NodeNext module resolution; confirm.
2
+ export type RowObject = Record<string, any>; // One row as a plain object with string keys and untyped values; used by filter(), getRow(), rows() and toArray() below.
3
+ export declare class DataFrame { // Ambient declaration of the core DataFrame. NOTE(review): the README also calls groupBy/join/validate/filterValid/validateAndTransform/withColumn/pivot, none of which are declared here — presumably added by the operations/validation modules; confirm.
4
+ private columns; // Private column storage; its type is not exposed in this declaration.
5
+ readonly length: number; // Readonly numeric length; presumably the row count — confirm.
6
+ constructor(data: Record<string, Column> | Column[]); // Accepts either a name-to-Column record or an array of Columns.
7
+ get columnNames(): string[]; // Getter returning the column names as strings.
8
+ get columnCount(): number; // Getter returning a number; presumably the number of columns.
9
+ column(name: string): Column; // Looks up a Column by name; behavior for an unknown name is not visible here.
10
+ hasColumn(name: string): boolean; // Boolean check by column name.
11
+ addColumn(column: Column): DataFrame; // Returns a DataFrame; whether the receiver is mutated is not visible here.
12
+ removeColumn(name: string): DataFrame; // Returns a DataFrame; whether the receiver is mutated is not visible here.
13
+ select(columns: string[]): DataFrame; // Takes column names, returns a DataFrame.
14
+ filter(predicate: (row: RowObject, index: number) => boolean): DataFrame; // Predicate receives each row as a RowObject plus its index.
15
+ slice(start?: number, end?: number): DataFrame; // Both bounds optional.
16
+ selectRows(indices: number[]): DataFrame; // Takes numeric row indices, returns a DataFrame.
17
+ getRow(index: number): RowObject; // Returns the row at `index` as a RowObject.
18
+ rows(): IterableIterator<RowObject>; // Iterable iterator of rows, usable directly in `for...of`.
19
+ head(n?: number): DataFrame; // `n` is optional; its default is not visible in this declaration.
20
+ tail(n?: number): DataFrame; // `n` is optional; its default is not visible in this declaration.
21
+ sort(columnName: string, ascending?: boolean): DataFrame; // Sorts by a single named column; default direction is not visible here.
22
+ drop(columns: string[]): DataFrame; // Takes column names, returns a DataFrame.
23
+ rename(columnMapping: Record<string, string>): DataFrame; // Mapping of string to string; presumably old name -> new name — confirm.
24
+ toArray(): RowObject[]; // Row-oriented export as an array of RowObject.
25
+ toColumns(): Record<string, any[]>; // Column-oriented export: string keys mapped to value arrays.
26
+ static from(data: RowObject[] | Record<string, any[]>): DataFrame; // Accepts either of the shapes taken by fromRows() and fromColumns().
27
+ static fromRows(rows: RowObject[]): DataFrame; // Builds a DataFrame from an array of row objects.
28
+ static fromColumns(data: Record<string, any[]>): DataFrame; // Builds a DataFrame from a record of value arrays.
29
+ }
30
+ //# sourceMappingURL=dataframe.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"dataframe.d.ts","sourceRoot":"","sources":["../../src/core/dataframe.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,MAAM,UAAU,CAAC;AAGlC,MAAM,MAAM,SAAS,GAAG,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;AAE5C,qBAAa,SAAS;IACpB,OAAO,CAAC,OAAO,CAAkC;IACjD,SAAgB,MAAM,EAAE,MAAM,CAAC;gBAEnB,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GAAG,MAAM,EAAE;IAiBnD,IAAI,WAAW,IAAI,MAAM,EAAE,CAE1B;IAED,IAAI,WAAW,IAAI,MAAM,CAExB;IAED,MAAM,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM;IAQ5B,SAAS,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO;IAIhC,SAAS,CAAC,MAAM,EAAE,MAAM,GAAG,SAAS;IAUpC,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,SAAS;IAUrC,MAAM,CAAC,OAAO,EAAE,MAAM,EAAE,GAAG,SAAS;IAapC,MAAM,CAAC,SAAS,EAAE,CAAC,GAAG,EAAE,SAAS,EAAE,KAAK,EAAE,MAAM,KAAK,OAAO,GAAG,SAAS;IAaxE,KAAK,CAAC,KAAK,CAAC,EAAE,MAAM,EAAE,GAAG,CAAC,EAAE,MAAM,GAAG,SAAS;IAa9C,UAAU,CAAC,OAAO,EAAE,MAAM,EAAE,GAAG,SAAS;IAWxC,MAAM,CAAC,KAAK,EAAE,MAAM,GAAG,SAAS;IAa/B,IAAI,IAAI,gBAAgB,CAAC,SAAS,CAAC;IAMpC,IAAI,CAAC,CAAC,GAAE,MAAU,GAAG,SAAS;IAI9B,IAAI,CAAC,CAAC,GAAE,MAAU,GAAG,SAAS;IAI9B,IAAI,CAAC,UAAU,EAAE,MAAM,EAAE,SAAS,GAAE,OAAc,GAAG,SAAS;IA6B9D,IAAI,CAAC,OAAO,EAAE,MAAM,EAAE,GAAG,SAAS;IAYlC,MAAM,CAAC,aAAa,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GAAG,SAAS;IAWxD,OAAO,IAAI,SAAS,EAAE;IAQtB,SAAS,IAAI,MAAM,CAAC,MAAM,EAAE,GAAG,EAAE,CAAC;IAQlC,MAAM,CAAC,IAAI,CAAC,IAAI,EAAE,SAAS,EAAE,GAAG,MAAM,CAAC,MAAM,EAAE,GAAG,EAAE,CAAC,GAAG,SAAS;IAQjE,MAAM,CAAC,QAAQ,CAAC,IAAI,EAAE,SAAS,EAAE,GAAG,SAAS;IAgB7C,MAAM,CAAC,WAAW,CAAC,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,EAAE,CAAC,GAAG,SAAS;CAS3D"}
@@ -0,0 +1,20 @@
1
+ export { DataFrame } from './core/dataframe.js';
2
+ export { Column } from './core/column.js';
3
+ export { GroupBy } from './operations/groupby.js';
4
+ export { Joiner, JoinType } from './operations/join.js';
5
+ export { Reshaper } from './operations/reshape.js';
6
+ export { ColumnDeriver, ColumnExpression } from './operations/derive.js';
7
+ export { Aggregator, AggregateFunction, AggregateSpec } from './operations/aggregation.js';
8
+ export { CsvReader, CsvWriter, CsvOptions } from './io/csv.js';
9
+ export { SchemaValidator, SchemaBuilders, CommonSchemas, ValidationResult, ValidationError, ValidationOptions, DataFrameSchema, SchemaDefinition } from './validation/schema.js';
10
+ export { DataType } from './utils/types.js';
11
+ export { BitSet } from './utils/bitset.js';
12
+ export { PerformanceTimer, DataGenerator, MemoryProfiler } from './utils/performance.js';
13
+ import './operations/aggregation.js';
14
+ import './operations/groupby.js';
15
+ import './operations/join.js';
16
+ import './operations/reshape.js';
17
+ import './operations/derive.js';
18
+ import './io/csv.js';
19
+ import './validation/schema.js';
20
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAC;AAChD,OAAO,EAAE,MAAM,EAAE,MAAM,kBAAkB,CAAC;AAC1C,OAAO,EAAE,OAAO,EAAE,MAAM,yBAAyB,CAAC;AAClD,OAAO,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,sBAAsB,CAAC;AACxD,OAAO,EAAE,QAAQ,EAAE,MAAM,yBAAyB,CAAC;AACnD,OAAO,EAAE,aAAa,EAAE,gBAAgB,EAAE,MAAM,wBAAwB,CAAC;AACzE,OAAO,EAAE,UAAU,EAAE,iBAAiB,EAAE,aAAa,EAAE,MAAM,6BAA6B,CAAC;AAC3F,OAAO,EAAE,SAAS,EAAE,SAAS,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AAC/D,OAAO,EACL,eAAe,EACf,cAAc,EACd,aAAa,EACb,gBAAgB,EAChB,eAAe,EACf,iBAAiB,EACjB,eAAe,EACf,gBAAgB,EACjB,MAAM,wBAAwB,CAAC;AAChC,OAAO,EAAE,QAAQ,EAAE,MAAM,kBAAkB,CAAC;AAC5C,OAAO,EAAE,MAAM,EAAE,MAAM,mBAAmB,CAAC;AAC3C,OAAO,EAAE,gBAAgB,EAAE,aAAa,EAAE,cAAc,EAAE,MAAM,wBAAwB,CAAC;AAEzF,OAAO,6BAA6B,CAAC;AACrC,OAAO,yBAAyB,CAAC;AACjC,OAAO,sBAAsB,CAAC;AAC9B,OAAO,yBAAyB,CAAC;AACjC,OAAO,wBAAwB,CAAC;AAChC,OAAO,aAAa,CAAC;AACrB,OAAO,wBAAwB,CAAC"}