@uwdata/mosaic-duckdb 0.1.0 → 0.3.0
- package/bin/run-server.js +3 -0
- package/bin/{csv2arrow.js → to-arrow.js} +3 -8
- package/bin/to-csv.js +10 -0
- package/bin/to-parquet.js +10 -0
- package/package.json +10 -4
- package/src/Cache.js +113 -0
- package/src/DuckDB.js +3 -39
- package/src/data-server.js +62 -48
- package/src/index.js +4 -0
- package/src/load/arrow.js +14 -0
- package/src/load/bundle.js +75 -0
- package/src/load/create-table.js +6 -0
- package/src/load/csv.js +9 -0
- package/src/load/json.js +9 -0
- package/src/load/parameters.js +11 -0
- package/src/load/parquet.js +7 -0
- package/src/merge-buffers.js +5 -3
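In brief: this release moves the csv/parquet/ipc loader methods out of DuckDB.js into standalone helpers under src/load/, adds a query cache (src/Cache.js) and precomputed "bundle" support to the data server, replaces streamed Arrow responses with buffered ones, and exposes the to-arrow/to-csv/to-parquet scripts as executables via a new package.json "bin" entry.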
package/bin/{csv2arrow.js → to-arrow.js}
RENAMED
@@ -5,10 +5,10 @@ import { createWriteStream } from 'fs';
 const db = new DuckDB();

 // load CSV into duckdb
-await db.
+await db.exec(`CREATE TABLE data AS SELECT * FROM '${process.argv[2]}'`);

 // get output stream of arrow bytes
-const
+const buf = await db.arrowBuffer('SELECT * FROM data');

 // determine the output stream
 const output = process.argv[3]
@@ -21,9 +21,4 @@ output.on('error', (error) => {
 });

 // write arrow bytes to output
-
-  output.write(chunk);
-}
-
-// finish
-output.end(new Uint8Array(4));
+output.end(buf);
package/bin/to-csv.js
ADDED
@@ -0,0 +1,10 @@
+#! /usr/bin/env node
+import path from 'node:path';
+import { DuckDB } from '../src/index.js';
+
+const db = new DuckDB();
+const input = process.argv[2];
+const output = process.argv[3] ||
+  (path.basename(input, path.extname(input)) + '.csv');
+
+await db.exec(`COPY (SELECT * FROM '${input}') TO '${output}' (FORMAT CSV, HEADER)`);
package/bin/to-parquet.js
ADDED
@@ -0,0 +1,10 @@
+#! /usr/bin/env node
+import path from 'node:path';
+import { DuckDB } from '../src/index.js';
+
+const db = new DuckDB();
+const input = process.argv[2];
+const output = process.argv[3] ||
+  (path.basename(input, path.extname(input)) + '.parquet');
+
+await db.exec(`COPY (SELECT * FROM '${input}') TO '${output}' (FORMAT PARQUET)`);
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@uwdata/mosaic-duckdb",
-  "version": "0.1.0",
+  "version": "0.3.0",
   "description": "A Promise-based DuckDB API and Node.js data server.",
   "keywords": [
     "duckdb",
@@ -18,14 +18,20 @@
     "type": "git",
     "url": "https://github.com/uwdata/mosaic.git"
   },
+  "bin": {
+    "to-arrow": "./bin/to-arrow.js",
+    "to-csv": "./bin/to-csv.js",
+    "to-parquet": "./bin/to-parquet.js"
+  },
   "scripts": {
     "lint": "eslint src test --ext .js",
+    "server": "node bin/run-server.js",
     "test": "mocha 'test/**/*-test.js'",
     "prepublishOnly": "npm run test && npm run lint"
   },
   "dependencies": {
-    "duckdb": "~0.
-    "ws": "^8.
+    "duckdb": "~0.8.1",
+    "ws": "^8.13.0"
   },
-  "gitHead": "
+  "gitHead": "a8dd23fed4c7a24c0a2ee5261d1aabe4239ce574"
 }
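With the new "bin" entry, npm installs the conversion scripts as executables, so something like `to-parquet data.csv` should write data.parquet; per the scripts above, the output name defaults to the input basename with the new extension. The added "server" script (`npm run server`) starts the data server via bin/run-server.js.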
package/src/Cache.js
ADDED
@@ -0,0 +1,113 @@
+import { createHash } from 'node:crypto';
+import fs from 'node:fs/promises';
+import path from 'node:path';
+
+const DEFAULT_CACHE_DIR = '.cache';
+const DEFAULT_TTL = 1000 * 60 * 60 * 24 * 7; // 7 days
+
+export function cacheKey(hashable, type) {
+  return createHash('sha256').update(hashable).digest('hex') + '.' + type;
+}
+
+class CacheEntry {
+  constructor(data, ttl = DEFAULT_TTL) {
+    this.data = data;
+    this.touch(ttl);
+  }
+  touch(ttl = DEFAULT_TTL) {
+    this.last = Math.round(Math.max(this.last, performance.now() + ttl));
+    return this;
+  }
+}
+
+export class Cache {
+  constructor({
+    max = 10000, // max entries
+    dir = DEFAULT_CACHE_DIR,
+    ttl = DEFAULT_TTL
+  }) {
+    this.cache = new Map;
+    this.max = max;
+    this.dir = dir;
+    this.ttl = ttl;
+    readEntries(dir, this.cache);
+  }
+
+  has(key) {
+    return this.cache.has(key);
+  }
+
+  delete(key) {
+    const deleted = this.cache.delete(key);
+    if (deleted) {
+      fs.rm(path.resolve(this.dir, key), { force: true });
+    }
+    return deleted;
+  }
+
+  get(key) {
+    return this.cache.get(key)?.touch(this.ttl).data;
+  }
+
+  set(key, data, { persist = false, ttl = this.ttl } = {}) {
+    const entry = new CacheEntry(data, persist ? Infinity : ttl);
+    this.cache.set(key, entry);
+    if (persist) writeEntry(this.dir, key, entry);
+    if (this.shouldEvict()) setTimeout(() => this.evict());
+    return this;
+  }
+
+  shouldEvict() {
+    return this.cache.size > this.max;
+  }
+
+  evict() {
+    const expire = performance.now();
+    let lruKey = null;
+    let lruLast = Infinity;
+
+    for (const [key, entry] of this.cache) {
+      const { last } = entry;
+      if (last === Infinity) continue;
+
+      // least recently used entry seen so far
+      if (last < lruLast) {
+        lruKey = key;
+        lruLast = last;
+      }
+
+      // remove if time since last access exceeds ttl
+      if (expire > last) {
+        this.cache.delete(key);
+      }
+    }
+
+    // remove lru entry
+    if (this.cache.size > this.max && lruKey) {
+      this.cache.delete(lruKey);
+    }
+  }
+}
+
+async function readEntries(dir, cache) {
+  let files;
+  try {
+    files = await fs.readdir(dir);
+  } catch (err) {
+    return; // dir does not exist, nothing to do
+  }
+  await Promise.allSettled(files.map(async file => {
+    const m = file.match(/.*\.(arrow|json)/);
+    const key = m?.[1] || null;
+    if (key) {
+      const data = await fs.readFile(path.resolve(dir, file));
+      cache.set(key, new CacheEntry(data, Infinity));
+    }
+  }));
+}
+
+function writeEntry(dir, key, entry) {
+  return fs.mkdir(dir, { recursive: true }).then(
+    () => fs.writeFile(path.resolve(dir, key), entry.data)
+  );
+}
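To make the new cache concrete, here is a minimal usage sketch based on the class above. The relative import path and the sample key/value are illustrative; Cache is not re-exported from the package index.

    import { Cache, cacheKey } from './src/Cache.js';

    // construction also scans the cache directory and reloads persisted entries
    const cache = new Cache({ dir: '.cache' });

    // keys are a SHA-256 hash of the SQL text plus the result type
    const key = cacheKey('SELECT 42 AS answer', 'json');

    // persist: true writes the entry through to disk and exempts it from TTL eviction
    cache.set(key, '[{"answer":42}]', { persist: true });

    // get() returns the data and refreshes the entry's expiration time
    console.log(cache.get(key));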
package/src/DuckDB.js
CHANGED
@@ -1,9 +1,10 @@
 import duckdb from 'duckdb';
-import { readFile } from 'node:fs/promises';
 import { mergeBuffers } from './merge-buffers.js';

+const TEMP_DIR = '.duckdb';
+
 const CONFIG = [
-  `PRAGMA temp_directory='
+  `PRAGMA temp_directory='${TEMP_DIR}'`,
   `INSTALL arrow`,
   `INSTALL httpfs`,
   `LOAD arrow`,
@@ -29,35 +30,6 @@ export class DuckDB {
     });
   }

-  async csv(tableName, fileName, options = {}) {
-    const opt = Object.entries({ sample_size: -1, ...options })
-      .map(([key, value]) => {
-        const t = typeof value;
-        const v = t === 'boolean' ? String(value).toUpperCase()
-          : t === 'string' ? `'${value}'`
-          : value;
-        return `${key.toUpperCase()}=${v}`;
-      })
-      .join(', ');
-    return this.exec(`CREATE TABLE ${tableName} AS SELECT *
-      FROM read_csv_auto('${fileName}', ${opt});`);
-  }
-
-  async parquet(tableName, fileName) {
-    return this.exec(`CREATE TABLE ${tableName} AS SELECT *
-      FROM read_parquet('${fileName}');`);
-  }
-
-  async ipc(tableName, buffer) {
-    const bufName = `__ipc__${tableName}`;
-    const arrowData = ArrayBuffer.isView(buffer) ? buffer : await readFile(buffer);
-    this.con.register_buffer(bufName, [arrowData], true, err => {
-      if (err) console.error(err);
-    });
-    await this.exec(`CREATE TABLE ${tableName} AS SELECT * FROM ${bufName}`);
-    this.con.unregister_buffer(bufName);
-  }
-
   prepare(sql) {
     return new DuckDBStatement(this.con.prepare(sql));
   }
@@ -97,10 +69,6 @@ export class DuckDB {
       });
     });
   }
-
-  arrowStream(sql) {
-    return this.con.arrowIPCStream(sql);
-  }
 }

 export class DuckDBStatement {
@@ -151,8 +119,4 @@
       });
     });
   }
-
-  arrowStream(params) {
-    return this.statement.arrowIPCStream(...params);
-  }
 }
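The removed csv, parquet, and ipc methods are superseded by the standalone loaders added under src/load/ and re-exported from the package index (both diffed below). A sketch of the migration; the import specifier, table names, and file names are assumptions for illustration:

    import { DuckDB, loadCSV, loadParquet, loadArrow } from '@uwdata/mosaic-duckdb';

    const db = new DuckDB();
    await loadCSV(db, 'flights', 'flights.csv');      // was: db.csv('flights', 'flights.csv')
    await loadParquet(db, 'trips', 'trips.parquet');  // was: db.parquet('trips', 'trips.parquet')
    await loadArrow(db, 'cars', 'cars.arrow');        // was: db.ipc('cars', 'cars.arrow'); accepts a file path or an Arrow IPC buffer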
package/src/data-server.js
CHANGED
@@ -1,13 +1,21 @@
 import http from 'node:http';
+import path from 'node:path';
 import url from 'node:url';
 import { WebSocketServer } from 'ws';
+import { Cache, cacheKey } from './Cache.js';
+import { createBundle, loadBundle } from './load/bundle.js';
+
+const CACHE_DIR = '.mosaic/cache';
+const BUNDLE_DIR = '.mosaic/bundle';

 export function dataServer(db, {
+  cache = true,
   rest = true,
   socket = true,
   port = 3000
 } = {}) {
-  const
+  const queryCache = cache ? new Cache({ dir: CACHE_DIR }) : null;
+  const handleQuery = queryHandler(db, queryCache);
   const app = createHTTPServer(handleQuery, rest);
   if (socket) createSocketServer(app, handleQuery);

@@ -25,7 +33,6 @@ function createHTTPServer(handleQuery, rest) {
       return;
     }

-    resp.setHeader('Content-Type', 'application/json');
     resp.setHeader('Access-Control-Allow-Origin', '*');
     resp.setHeader('Access-Control-Request-Method', '*');
     resp.setHeader('Access-Control-Allow-Methods', 'OPTIONS, POST, GET');
@@ -61,7 +68,27 @@ function createSocketServer(server, handleQuery) {
   });
 }

-function queryHandler(db) {
+function queryHandler(db, queryCache) {
+
+  // retrieve query result
+  async function retrieve(query, get) {
+    const { sql, type, persist } = query;
+    const key = cacheKey(sql, type);
+    let result = queryCache?.get(key);
+
+    if (result) {
+      console.log('CACHE HIT');
+    } else {
+      result = await get(sql);
+      if (persist) {
+        queryCache?.set(key, result, { persist });
+      }
+    }
+
+    return result;
+  }
+
+  // query request handler
   return async (res, data) => {
     const t0 = performance.now();

@@ -75,64 +102,56 @@ function queryHandler(db) {
     }

     try {
-      const { sql, type } = query;
-      console.log('
-
-      // request the lock to serialize requests
-      // we do this to avoid DuckDB + Arrow errors
-      await res.lock?.();
+      const { sql, type = 'json' } = query;
+      console.log(`> ${type.toUpperCase()}${sql ? ' ' + sql : ''}`);

       // process query and return result
       switch (type) {
-        case 'arrow':
-          // Apache Arrow response format
-          await res.stream(await db.arrowStream(sql));
-          break;
         case 'exec':
           // Execute query with no return value
           await db.exec(sql);
           res.done();
           break;
-
+        case 'arrow':
+          // Apache Arrow response format
+          res.arrow(await retrieve(query, sql => db.arrowBuffer(sql)));
+          break;
+        case 'json':
           // JSON response format
-          res.json(await db.query(sql));
+          res.json(await retrieve(query, sql => db.query(sql)));
+          break;
+        case 'create-bundle':
+          // Create a named bundle of precomputed resources
+          await createBundle(
+            db, queryCache, query.queries,
+            path.resolve(BUNDLE_DIR, query.name)
+          );
+          res.done();
+          break;
+        case 'load-bundle':
+          // Load a named bundle of precomputed resources
+          await loadBundle(db, queryCache, path.resolve(BUNDLE_DIR, query.name));
+          res.done();
+          break;
+        default:
+          res.error(`Unrecognized command: ${type}`, 400);
       }
     } catch (err) {
       res.error(err, 500);
-    } finally {
-      res.unlock?.();
     }

-    console.log('REQUEST',
+    console.log('REQUEST', (performance.now() - t0).toFixed(1));
   };
 }

-let locked = false;
-const queue = [];
-
 function httpResponse(res) {
   return {
-
-
-
-      return locked
-        ? new Promise(resolve => queue.push(resolve))
-        : (locked = true);
-    },
-    unlock() {
-      locked = queue.length > 0;
-      if (locked) {
-        // resolve the next promise in the queue
-        queue.shift()();
-      }
-    },
-    async stream(iter) {
-      for await (const chunk of iter) {
-        res.write(chunk);
-      }
-      res.end();
+    arrow(data) {
+      res.setHeader('Content-Type', 'application/vnd.apache.arrow.stream');
+      res.end(data);
     },
     json(data) {
+      res.setHeader('Content-Type', 'application/json');
       res.end(JSON.stringify(data));
     },
     done() {
@@ -149,16 +168,11 @@ function httpResponse(res) {

 function socketResponse(ws) {
   const STRING = { binary: false, fin: true };
-  const
-  const DONE = { binary: true, fin: true };
-  const NULL = new Uint8Array(0);
+  const BINARY = { binary: true, fin: true };

   return {
-
-
-      ws.send(chunk, FRAGMENT);
-    }
-    ws.send(NULL, DONE);
+    arrow(data) {
+      ws.send(data, BINARY);
     },
     json(data) {
       ws.send(JSON.stringify(data), STRING);
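For reference, the shapes of the command objects the revised handler accepts can be read off the switch statement above; these literals are illustrative, not an official protocol spec:

    const execCmd = { type: 'exec', sql: 'CREATE TABLE t AS SELECT 1 AS x' };
    const jsonQuery = { type: 'json', sql: 'SELECT * FROM t' }; // 'json' is the default type
    const arrowQuery = { type: 'arrow', sql: 'SELECT * FROM t', persist: true }; // persist caches to disk
    const makeBundle = { type: 'create-bundle', name: 'demo', queries: ['SELECT * FROM t'] };
    const getBundle = { type: 'load-bundle', name: 'demo' };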
package/src/index.js
CHANGED
@@ -1,2 +1,6 @@
 export { DuckDB } from './DuckDB.js';
 export { dataServer } from './data-server.js';
+export { loadArrow } from './load/arrow.js';
+export { loadCSV } from './load/csv.js';
+export { loadJSON } from './load/json.js';
+export { loadParquet } from './load/parquet.js';
package/src/load/arrow.js
ADDED
@@ -0,0 +1,14 @@
+import { readFile } from 'node:fs/promises';
+
+export async function loadArrow(db, tableName, buffer) {
+  const arrowData = ArrayBuffer.isView(buffer) ? buffer : await readFile(buffer);
+  return new Promise((resolve, reject) => {
+    db.con.register_buffer(tableName, [arrowData], true, err => {
+      if (err) {
+        console.error(err);
+        reject(err);
+      }
+      resolve();
+    });
+  });
+}
package/src/load/bundle.js
ADDED
@@ -0,0 +1,75 @@
+import fs from 'node:fs/promises';
+import path from 'node:path';
+import { cacheKey } from '../Cache.js';
+
+async function retrieve(db, cache, sql, type) {
+  const key = cacheKey(sql, type);
+  const cached = cache.get(key);
+  if (cached) return cached;
+  switch (type) {
+    case 'arrow':
+      return db.arrowBuffer(sql);
+    case 'json':
+      return JSON.stringify(await db.query(sql));
+    default:
+      throw new Error(`Unsupported query type: ${type}`);
+  }
+}
+
+export async function createBundle(db, cache, queries, dir) {
+  const describe_re = /^DESCRIBE /;
+  const pragma_re = /^PRAGMA /;
+  const view_re = /^CREATE( TEMP| TEMPORARY)? VIEW/;
+  const table_re = /^CREATE( TEMP| TEMPORARY)? TABLE( IF NOT EXISTS)? ([^\s]+)/;
+
+  const manifest = { tables: [], queries: [] };
+
+  await fs.mkdir(dir, { recursive: true });
+
+  const querySet = new Set(queries);
+  for (const query of querySet) {
+    const sql = typeof query === 'string' ? query : query.sql;
+    if (query.alias) {
+      const table = query.alias;
+      const file = path.resolve(dir, `${table}.parquet`);
+      await db.exec(`COPY (${sql}) TO '${file}' (FORMAT PARQUET)`);
+      manifest.tables.push(table);
+    } else if (sql.startsWith('CREATE ')) {
+      // table or view
+      if (view_re.test(sql)) continue; // ignore views
+      const table = sql.match(table_re)?.[3];
+      const file = path.resolve(dir, `${table}.parquet`);
+      await db.exec(`${sql}`);
+      await db.exec(`COPY ${table} TO '${file}' (FORMAT PARQUET)`);
+      manifest.tables.push(table);
+    } else if (!pragma_re.test(sql)) {
+      // select query
+      const type = describe_re.test(sql) ? 'json' : 'arrow';
+      const key = cacheKey(sql, type);
+      const result = await retrieve(db, cache, sql, type);
+      await fs.writeFile(path.resolve(dir, key), result);
+      manifest.queries.push(key);
+    }
+  }
+
+  await fs.writeFile(path.resolve(dir, 'bundle.json'), JSON.stringify(manifest, 0, 2));
+  return manifest;
+}
+
+export async function loadBundle(db, cache, dir) {
+  const manifest = JSON.parse(await fs.readFile(path.resolve(dir, 'bundle.json')));
+
+  // load precomputed query results into the cache
+  for (const key of manifest.queries) {
+    const file = path.resolve(dir, key);
+    const json = path.extname(file) === '.json';
+    const data = await fs.readFile(file);
+    cache.set(key, json ? JSON.parse(data) : data);
+  }
+
+  // load precomputed temp tables into the database
+  for (const table of manifest.tables) {
+    const file = path.resolve(dir, `${table}.parquet`);
+    await db.exec(`CREATE TEMP TABLE IF NOT EXISTS ${table} AS SELECT * FROM '${file}'`);
+  }
+}
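Taken together: create-bundle materializes CREATE TABLE statements and aliased queries as Parquet files, and writes remaining query results to disk under their cache keys; a later load-bundle then warms both the database (as temp tables) and the query cache without re-running any queries.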
package/src/load/create-table.js
ADDED
@@ -0,0 +1,6 @@
+export function createTable(db, name, as, options = {}) {
+  const { temp, replace } = options;
+  const create = `CREATE${replace ? ' OR REPLACE' : ''}`;
+  const type = `${temp ? 'TEMP ' : ''}TABLE${replace ? '' : ' IF NOT EXISTS'}`;
+  return db.exec(`${create} ${type} ${name} AS ${as}`);
+}
package/src/load/csv.js
ADDED
@@ -0,0 +1,9 @@
+import { createTable } from './create-table.js';
+import { parameters } from './parameters.js';
+
+export function loadCSV(db, tableName, fileName, options = {}) {
+  const { select = ['*'], temp, replace, ...csvOptions } = options;
+  const params = parameters({ auto_detect: true, sample_size: -1, ...csvOptions });
+  const query = `SELECT ${select.join(', ')} FROM read_csv('${fileName}', ${params})`;
+  return createTable(db, tableName, query, { temp, replace });
+}
package/src/load/json.js
ADDED
@@ -0,0 +1,9 @@
+import { createTable } from './create-table.js';
+import { parameters } from './parameters.js';
+
+export function loadJSON(db, tableName, fileName, options = {}) {
+  const { select = ['*'], temp, replace, ...jsonOptions } = options;
+  const params = parameters({ auto_detect: true, json_format: 'auto', ...jsonOptions });
+  const query = `SELECT ${select.join(', ')} FROM read_json('${fileName}', ${params})`;
+  return createTable(db, tableName, query, { temp, replace });
+}
package/src/load/parameters.js
ADDED
@@ -0,0 +1,11 @@
+export function parameters(options) {
+  return Object.entries(options)
+    .map(([key, value]) => {
+      const t = typeof value;
+      const v = t === 'boolean' ? String(value)
+        : t === 'string' ? `'${value}'`
+        : value;
+      return `${key}=${v}`;
+    })
+    .join(', ');
+}
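As a worked example of the parameters helper, with an illustrative options object:

    import { parameters } from './src/load/parameters.js';

    // booleans, numbers, and strings are serialized as DuckDB named parameters
    console.log(parameters({ auto_detect: true, sample_size: -1, delim: ',' }));
    // => "auto_detect=true, sample_size=-1, delim=','"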
package/src/load/parquet.js
ADDED
@@ -0,0 +1,7 @@
+import { createTable } from './create-table.js';
+
+export function loadParquet(db, tableName, fileName, options = {}) {
+  const { select = ['*'], ...tableOptions } = options;
+  const query = `SELECT ${select.join(', ')} FROM read_parquet('${fileName}')`;
+  return createTable(db, tableName, query, tableOptions);
+}
package/src/merge-buffers.js
CHANGED
@@ -1,10 +1,12 @@
 export function mergeBuffers(buffers) {
-  const len = buffers.reduce((a, b) => a + b.length, 0);
+  const len = buffers.reduce((a, b) => a + (b ? b.length : 0), 0);
   const buf = new Uint8Array(len);

   for (let i = 0, offset = 0; i < buffers.length; ++i) {
-
-
+    if (buffers[i]) {
+      buf.set(buffers[i], offset);
+      offset += buffers[i].length;
+    }
   }

   return buf;