@uwdata/mosaic-duckdb 0.1.0 → 0.2.0

package/bin/csv2arrow.js CHANGED
@@ -8,7 +8,7 @@ const db = new DuckDB();
 await db.csv('data', process.argv[2]);
 
 // get output stream of arrow bytes
-const stream = await db.arrowStream('SELECT * FROM data');
+const stream = await db.arrowBuffer('SELECT * FROM data');
 
 // determine the output stream
 const output = process.argv[3]
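For context, a minimal sketch of the new call (the table and file names are illustrative): arrowBuffer resolves to a single merged buffer of Arrow IPC bytes, which can be written out directly instead of consuming the chunked async iterator that arrowStream returned.

import { writeFile } from 'node:fs/promises';
import { DuckDB } from '@uwdata/mosaic-duckdb';

const db = new DuckDB();
await db.exec('CREATE TABLE data AS SELECT * FROM range(10)');
// a single Uint8Array of Arrow IPC bytes, merged from the DuckDB result chunks
const buffer = await db.arrowBuffer('SELECT * FROM data');
await writeFile('data.arrow', buffer);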
package/bin/run-server.js ADDED
@@ -0,0 +1,3 @@
+#! /usr/bin/env node
+import { DuckDB, dataServer } from '../src/index.js';
+dataServer(new DuckDB(), { rest: true, socket: true });
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@uwdata/mosaic-duckdb",
-  "version": "0.1.0",
+  "version": "0.2.0",
   "description": "A Promise-based DuckDB API and Node.js data server.",
   "keywords": [
     "duckdb",
@@ -20,12 +20,13 @@
   },
   "scripts": {
     "lint": "eslint src test --ext .js",
+    "server": "node bin/run-server.js",
     "test": "mocha 'test/**/*-test.js'",
     "prepublishOnly": "npm run test && npm run lint"
   },
   "dependencies": {
-    "duckdb": "~0.6.1",
-    "ws": "^8.12.0"
+    "duckdb": "~0.7.1",
+    "ws": "^8.13.0"
   },
-  "gitHead": "a7967c35349bdf7f00abb113ce1dd9abb233cd62"
+  "gitHead": "e53cd914c807f99aabe78dcbe618dd9543e2f438"
 }
package/src/Cache.js ADDED
@@ -0,0 +1,113 @@
+import { createHash } from 'node:crypto';
+import fs from 'node:fs/promises';
+import path from 'node:path';
+
+const DEFAULT_CACHE_DIR = '.cache';
+const DEFAULT_TTL = 1000 * 60 * 60 * 24 * 7; // 7 days
+
+export function cacheKey(hashable, type) {
+  return createHash('sha256').update(hashable).digest('hex') + '.' + type;
+}
+
+class CacheEntry {
+  constructor(data, ttl = DEFAULT_TTL) {
+    this.data = data;
+    this.touch(ttl);
+  }
+  touch(ttl = DEFAULT_TTL) {
+    this.last = Math.round(Math.max(this.last, performance.now() + ttl));
+    return this;
+  }
+}
+
+export class Cache {
+  constructor({
+    max = 10000, // max entries
+    dir = DEFAULT_CACHE_DIR,
+    ttl = DEFAULT_TTL
+  }) {
+    this.cache = new Map;
+    this.max = max;
+    this.dir = dir;
+    this.ttl = ttl;
+    readEntries(dir, this.cache);
+  }
+
+  has(key) {
+    return this.cache.has(key);
+  }
+
+  delete(key) {
+    const deleted = this.cache.delete(key);
+    if (deleted) {
+      fs.rm(path.resolve(this.dir, key), { force: true });
+    }
+    return deleted;
+  }
+
+  get(key) {
+    return this.cache.get(key)?.touch(this.ttl).data;
+  }
+
+  set(key, data, { persist = false, ttl = this.ttl } = {}) {
+    const entry = new CacheEntry(data, persist ? Infinity : ttl);
+    this.cache.set(key, entry);
+    if (persist) writeEntry(this.dir, key, entry);
+    if (this.shouldEvict()) setTimeout(() => this.evict());
+    return this;
+  }
+
+  shouldEvict() {
+    return this.cache.size > this.max;
+  }
+
+  evict() {
+    const expire = performance.now();
+    let lruKey = null;
+    let lruLast = Infinity;
+
+    for (const [key, entry] of this.cache) {
+      const { last } = entry;
+      if (last === Infinity) continue;
+
+      // least recently used entry seen so far
+      if (last < lruLast) {
+        lruKey = key;
+        lruLast = last;
+      }
+
+      // remove if time since last access exceeds ttl
+      if (expire > last) {
+        this.cache.delete(key);
+      }
+    }
+
+    // remove lru entry
+    if (this.cache.size > this.max && lruKey) {
+      this.cache.delete(lruKey);
+    }
+  }
+}
+
+async function readEntries(dir, cache) {
+  let files;
+  try {
+    files = await fs.readdir(dir);
+  } catch (err) {
+    return; // dir does not exist, nothing to do
+  }
+  await Promise.allSettled(files.map(async file => {
+    const m = file.match(/.*\.(arrow|json)/);
+    const key = m?.[1] || null;
+    if (key) {
+      const data = await fs.readFile(path.resolve(dir, file));
+      cache.set(key, new CacheEntry(data, Infinity));
+    }
+  }));
+}
+
+function writeEntry(dir, key, entry) {
+  return fs.mkdir(dir, { recursive: true }).then(
+    () => fs.writeFile(path.resolve(dir, key), entry.data)
+  );
+}
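A brief usage sketch of the new cache (directory and values are illustrative): keys combine a SHA-256 hash of the query text with the result type, and persisted entries are also written to the cache directory so they can be re-read on startup.

import { Cache, cacheKey } from './Cache.js';

const cache = new Cache({ dir: '.mosaic/cache', max: 1000 });
const key = cacheKey('SELECT 42 AS answer', 'json');
cache.set(key, '[{"answer":42}]', { persist: true }); // also written to .mosaic/cache/<key>
const hit = cache.get(key); // refreshes the entry's TTL and returns the cached data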
package/src/DuckDB.js CHANGED
@@ -1,9 +1,10 @@
 import duckdb from 'duckdb';
-import { readFile } from 'node:fs/promises';
 import { mergeBuffers } from './merge-buffers.js';
 
+const TEMP_DIR = '.duckdb';
+
 const CONFIG = [
-  `PRAGMA temp_directory='./duckdb.tmp'`,
+  `PRAGMA temp_directory='${TEMP_DIR}'`,
   `INSTALL arrow`,
   `INSTALL httpfs`,
   `LOAD arrow`,
@@ -29,35 +30,6 @@ export class DuckDB {
     });
   }
 
-  async csv(tableName, fileName, options = {}) {
-    const opt = Object.entries({ sample_size: -1, ...options })
-      .map(([key, value]) => {
-        const t = typeof value;
-        const v = t === 'boolean' ? String(value).toUpperCase()
-          : t === 'string' ? `'${value}'`
-          : value;
-        return `${key.toUpperCase()}=${v}`;
-      })
-      .join(', ');
-    return this.exec(`CREATE TABLE ${tableName} AS SELECT *
-      FROM read_csv_auto('${fileName}', ${opt});`);
-  }
-
-  async parquet(tableName, fileName) {
-    return this.exec(`CREATE TABLE ${tableName} AS SELECT *
-      FROM read_parquet('${fileName}');`);
-  }
-
-  async ipc(tableName, buffer) {
-    const bufName = `__ipc__${tableName}`;
-    const arrowData = ArrayBuffer.isView(buffer) ? buffer : await readFile(buffer);
-    this.con.register_buffer(bufName, [arrowData], true, err => {
-      if (err) console.error(err);
-    });
-    await this.exec(`CREATE TABLE ${tableName} AS SELECT * FROM ${bufName}`);
-    this.con.unregister_buffer(bufName);
-  }
-
   prepare(sql) {
     return new DuckDBStatement(this.con.prepare(sql));
   }
@@ -97,10 +69,6 @@ export class DuckDB {
       });
     });
   }
-
-  arrowStream(sql) {
-    return this.con.arrowIPCStream(sql);
-  }
 }
 
 export class DuckDBStatement {
@@ -151,8 +119,4 @@ export class DuckDBStatement {
       });
     });
   }
-
-  arrowStream(params) {
-    return this.statement.arrowIPCStream(...params);
-  }
 }
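The csv, parquet, ipc, and arrowStream convenience methods are gone from the DuckDB class; data loading now goes through the standalone helpers under src/load, re-exported from index.js below. A migration sketch with illustrative table and file names:

import { DuckDB, loadCSV, loadParquet, loadArrow } from '@uwdata/mosaic-duckdb';

const db = new DuckDB();
// was: db.csv('flights', 'flights.csv')
await loadCSV(db, 'flights', 'flights.csv', { temp: true });
// was: db.parquet('trips', 'trips.parquet')
await loadParquet(db, 'trips', 'trips.parquet');
// was: db.ipc('cars', 'cars.arrow'); loadArrow accepts a typed array or a file path
await loadArrow(db, 'cars', 'cars.arrow');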
package/src/data-server.js CHANGED
@@ -1,13 +1,21 @@
 import http from 'node:http';
+import path from 'node:path';
 import url from 'node:url';
 import { WebSocketServer } from 'ws';
+import { Cache, cacheKey } from './Cache.js';
+import { createBundle, loadBundle } from './load/bundle.js';
+
+const CACHE_DIR = '.mosaic/cache';
+const BUNDLE_DIR = '.mosaic/bundle';
 
 export function dataServer(db, {
+  cache = true,
   rest = true,
   socket = true,
   port = 3000
 } = {}) {
-  const handleQuery = queryHandler(db);
+  const queryCache = cache ? new Cache({ dir: CACHE_DIR }) : null;
+  const handleQuery = queryHandler(db, queryCache);
   const app = createHTTPServer(handleQuery, rest);
   if (socket) createSocketServer(app, handleQuery);
 
@@ -25,7 +33,6 @@ function createHTTPServer(handleQuery, rest) {
      return;
    }
 
-    resp.setHeader('Content-Type', 'application/json');
    resp.setHeader('Access-Control-Allow-Origin', '*');
    resp.setHeader('Access-Control-Request-Method', '*');
    resp.setHeader('Access-Control-Allow-Methods', 'OPTIONS, POST, GET');
@@ -61,7 +68,27 @@ function createSocketServer(server, handleQuery) {
  });
 }
 
-function queryHandler(db) {
+function queryHandler(db, queryCache) {
+
+  // retrieve query result
+  async function retrieve(query, get) {
+    const { sql, type, persist } = query;
+    const key = cacheKey(sql, type);
+    let result = queryCache?.get(key);
+
+    if (result) {
+      console.log('CACHE HIT');
+    } else {
+      result = await get(sql);
+      if (persist) {
+        queryCache?.set(key, result, { persist });
+      }
+    }
+
+    return result;
+  }
+
+  // query request handler
   return async (res, data) => {
     const t0 = performance.now();
 
@@ -75,64 +102,56 @@ function queryHandler(db) {
    }
 
    try {
-      const { sql, type } = query;
-      console.log('QUERY', sql);
-
-      // request the lock to serialize requests
-      // we do this to avoid DuckDB + Arrow errors
-      await res.lock?.();
+      const { sql, type = 'json' } = query;
+      console.log(`> ${type.toUpperCase()}${sql ? ' ' + sql : ''}`);
 
      // process query and return result
      switch (type) {
-        case 'arrow':
-          // Apache Arrow response format
-          await res.stream(await db.arrowStream(sql));
-          break;
        case 'exec':
          // Execute query with no return value
          await db.exec(sql);
          res.done();
          break;
-        default:
+        case 'arrow':
+          // Apache Arrow response format
+          res.arrow(await retrieve(query, sql => db.arrowBuffer(sql)));
+          break;
+        case 'json':
          // JSON response format
-          res.json(await db.query(sql));
+          res.json(await retrieve(query, sql => db.query(sql)));
+          break;
+        case 'create-bundle':
+          // Create a named bundle of precomputed resources
+          await createBundle(
+            db, queryCache, query.queries,
+            path.resolve(BUNDLE_DIR, query.name)
+          );
+          res.done();
+          break;
+        case 'load-bundle':
+          // Load a named bundle of precomputed resources
+          await loadBundle(db, queryCache, path.resolve(BUNDLE_DIR, query.name));
+          res.done();
+          break;
+        default:
+          res.error(`Unrecognized command: ${type}`, 400);
      }
    } catch (err) {
      res.error(err, 500);
-    } finally {
-      res.unlock?.();
    }
 
-    console.log('REQUEST', Math.round(performance.now() - t0));
+    console.log('REQUEST', (performance.now() - t0).toFixed(1));
  };
 }
 
-let locked = false;
-const queue = [];
-
 function httpResponse(res) {
  return {
-    lock() {
-      // if locked, add a promise to the queue
-      // otherwise, grab the lock and proceed
-      return locked
-        ? new Promise(resolve => queue.push(resolve))
-        : (locked = true);
-    },
-    unlock() {
-      locked = queue.length > 0;
-      if (locked) {
-        // resolve the next promise in the queue
-        queue.shift()();
-      }
-    },
-    async stream(iter) {
-      for await (const chunk of iter) {
-        res.write(chunk);
-      }
-      res.end();
+    arrow(data) {
+      res.setHeader('Content-Type', 'application/vnd.apache.arrow.stream');
+      res.end(data);
    },
    json(data) {
+      res.setHeader('Content-Type', 'application/json');
      res.end(JSON.stringify(data));
    },
    done() {
@@ -149,16 +168,11 @@ function httpResponse(res) {
 
 function socketResponse(ws) {
  const STRING = { binary: false, fin: true };
-  const FRAGMENT = { binary: true, fin: false };
-  const DONE = { binary: true, fin: true };
-  const NULL = new Uint8Array(0);
+  const BINARY = { binary: true, fin: true };
 
  return {
-    async stream(iter) {
-      for await (const chunk of iter) {
-        ws.send(chunk, FRAGMENT);
-      }
-      ws.send(NULL, DONE);
+    arrow(data) {
+      ws.send(data, BINARY);
    },
    json(data) {
      ws.send(JSON.stringify(data), STRING);
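For orientation, a client-side sketch of the command shapes the handler above accepts. The endpoint URL, POST-body transport, and table names are assumptions; only the command fields themselves (type, sql, persist, name, queries) come from the handler.

// JSON result
const rows = await fetch('http://localhost:3000/', {
  method: 'POST',
  body: JSON.stringify({ type: 'json', sql: 'SELECT 42 AS answer' })
}).then(resp => resp.json());

// Arrow IPC result, persisted in the server-side cache because persist is set
const bytes = await fetch('http://localhost:3000/', {
  method: 'POST',
  body: JSON.stringify({ type: 'arrow', sql: 'SELECT * FROM flights', persist: true })
}).then(resp => resp.arrayBuffer());

// Materialize (and later restore) a named bundle of precomputed resources
await fetch('http://localhost:3000/', {
  method: 'POST',
  body: JSON.stringify({ type: 'create-bundle', name: 'demo', queries: ['SELECT 42'] })
});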
package/src/index.js CHANGED
@@ -1,2 +1,6 @@
 export { DuckDB } from './DuckDB.js';
 export { dataServer } from './data-server.js';
+export { loadArrow } from './load/arrow.js';
+export { loadCSV } from './load/csv.js';
+export { loadJSON } from './load/json.js';
+export { loadParquet } from './load/parquet.js';
package/src/load/arrow.js ADDED
@@ -0,0 +1,14 @@
+import { readFile } from 'node:fs/promises';
+
+export async function loadArrow(db, tableName, buffer) {
+  const arrowData = ArrayBuffer.isView(buffer) ? buffer : await readFile(buffer);
+  return new Promise((resolve, reject) => {
+    db.con.register_buffer(tableName, [arrowData], true, err => {
+      if (err) {
+        console.error(err);
+        reject(err);
+      }
+      resolve();
+    });
+  });
+}
package/src/load/bundle.js ADDED
@@ -0,0 +1,75 @@
+import fs from 'node:fs/promises';
+import path from 'node:path';
+import { cacheKey } from '../Cache.js';
+
+async function retrieve(db, cache, sql, type) {
+  const key = cacheKey(sql, type);
+  const cached = cache.get(key);
+  if (cached) return cached;
+  switch (type) {
+    case 'arrow':
+      return db.arrowBuffer(sql);
+    case 'json':
+      return JSON.stringify(await db.query(sql));
+    default:
+      throw new Error(`Unsupported query type: ${type}`);
+  }
+}
+
+export async function createBundle(db, cache, queries, dir) {
+  const describe_re = /^DESCRIBE /;
+  const pragma_re = /^PRAGMA /;
+  const view_re = /^CREATE( TEMP| TEMPORARY)? VIEW/;
+  const table_re = /^CREATE( TEMP| TEMPORARY)? TABLE( IF NOT EXISTS)? ([^\s]+)/;
+
+  const manifest = { tables: [], queries: [] };
+
+  await fs.mkdir(dir, { recursive: true });
+
+  const querySet = new Set(queries);
+  for (const query of querySet) {
+    const sql = typeof query === 'string' ? query : query.sql;
+    if (query.alias) {
+      const table = query.alias;
+      const file = path.resolve(dir, `${table}.parquet`);
+      await db.exec(`COPY (${sql}) TO '${file}' (FORMAT PARQUET)`);
+      manifest.tables.push(table);
+    } else if (sql.startsWith('CREATE ')) {
+      // table or view
+      if (view_re.test(sql)) continue; // ignore views
+      const table = sql.match(table_re)?.[3];
+      const file = path.resolve(dir, `${table}.parquet`);
+      await db.exec(`${sql}`);
+      await db.exec(`COPY ${table} TO '${file}' (FORMAT PARQUET)`);
+      manifest.tables.push(table);
+    } else if (!pragma_re.test(sql)) {
+      // select query
+      const type = describe_re.test(sql) ? 'json' : 'arrow';
+      const key = cacheKey(sql, type);
+      const result = await retrieve(db, cache, sql, type);
+      await fs.writeFile(path.resolve(dir, key), result);
+      manifest.queries.push(key);
+    }
+  }
+
+  await fs.writeFile(path.resolve(dir, 'bundle.json'), JSON.stringify(manifest, 0, 2));
+  return manifest;
+}
+
+export async function loadBundle(db, cache, dir) {
+  const manifest = JSON.parse(await fs.readFile(path.resolve(dir, 'bundle.json')));
+
+  // load precomputed query results into the cache
+  for (const key of manifest.queries) {
+    const file = path.resolve(dir, key);
+    const json = path.extname(file) === '.json';
+    const data = await fs.readFile(file);
+    cache.set(key, json ? JSON.parse(data) : data);
+  }
+
+  // load precomputed temp tables into the database
+  for (const table of manifest.tables) {
+    const file = path.resolve(dir, `${table}.parquet`);
+    await db.exec(`CREATE TEMP TABLE IF NOT EXISTS ${table} AS SELECT * FROM '${file}'`);
+  }
+}
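A server-side sketch of the bundle round trip. These helpers are not re-exported from index.js, so the import paths assume code living in the package root; the query list, bundle name, and file names are illustrative.

import { DuckDB } from '@uwdata/mosaic-duckdb';
import { Cache } from './src/Cache.js';
import { createBundle, loadBundle } from './src/load/bundle.js';

const db = new DuckDB();
const cache = new Cache({ dir: '.mosaic/cache' });

// materialize a table as Parquet and a query result as Arrow under the bundle dir
await createBundle(db, cache, [
  `CREATE TEMP TABLE flights AS SELECT * FROM 'flights.parquet'`,
  'SELECT count(*) AS n FROM flights'
], '.mosaic/bundle/demo');

// later (e.g. after a restart): warm the cache and recreate the temp tables
await loadBundle(db, cache, '.mosaic/bundle/demo');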
package/src/load/create-table.js ADDED
@@ -0,0 +1,6 @@
+export function createTable(db, name, as, options = {}) {
+  const { temp, replace } = options;
+  const create = `CREATE${replace ? ' OR REPLACE' : ''}`;
+  const type = `${temp ? 'TEMP ' : ''}TABLE${replace ? '' : ' IF NOT EXISTS'}`;
+  return db.exec(`${create} ${type} ${name} AS ${as}`);
+}
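For reference, the SQL that createTable hands to db.exec for the option combinations used by the loaders below (table and file names are illustrative):

// default: CREATE TABLE IF NOT EXISTS flights AS SELECT ...
createTable(db, 'flights', `SELECT * FROM read_parquet('flights.parquet')`);
// { temp: true }: CREATE TEMP TABLE IF NOT EXISTS flights AS SELECT ...
createTable(db, 'flights', `SELECT * FROM read_parquet('flights.parquet')`, { temp: true });
// { replace: true }: CREATE OR REPLACE TABLE flights AS SELECT ...
createTable(db, 'flights', `SELECT * FROM read_parquet('flights.parquet')`, { replace: true });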
package/src/load/csv.js ADDED
@@ -0,0 +1,9 @@
+import { createTable } from './create-table.js';
+import { parameters } from './parameters.js';
+
+export function loadCSV(db, tableName, fileName, options = {}) {
+  const { select = ['*'], temp, replace, ...csvOptions } = options;
+  const params = parameters({ auto_detect: true, sample_size: -1, ...csvOptions });
+  const query = `SELECT ${select.join(', ')} FROM read_csv('${fileName}', ${params})`;
+  return createTable(db, tableName, query, { temp, replace });
+}
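An illustrative call and the SQL it produces (table, file, column, and option names are made up); options not consumed by createTable are forwarded to read_csv via parameters():

loadCSV(db, 'gaia', 'gaia.csv', { temp: true, select: ['ra', 'dec'], delim: '|' });
// exec: CREATE TEMP TABLE IF NOT EXISTS gaia AS
//       SELECT ra, dec FROM read_csv('gaia.csv', auto_detect=true, sample_size=-1, delim='|')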
package/src/load/json.js ADDED
@@ -0,0 +1,9 @@
+import { createTable } from './create-table.js';
+import { parameters } from './parameters.js';
+
+export function loadJSON(db, tableName, fileName, options = {}) {
+  const { select = ['*'], temp, replace, ...jsonOptions } = options;
+  const params = parameters({ auto_detect: true, json_format: 'auto', ...jsonOptions });
+  const query = `SELECT ${select.join(', ')} FROM read_json('${fileName}', ${params})`;
+  return createTable(db, tableName, query, { temp, replace });
+}
package/src/load/parameters.js ADDED
@@ -0,0 +1,11 @@
+export function parameters(options) {
+  return Object.entries(options)
+    .map(([key, value]) => {
+      const t = typeof value;
+      const v = t === 'boolean' ? String(value)
+        : t === 'string' ? `'${value}'`
+        : value;
+      return `${key}=${v}`;
+    })
+    .join(', ');
+}
package/src/load/parquet.js ADDED
@@ -0,0 +1,7 @@
+import { createTable } from './create-table.js';
+
+export function loadParquet(db, tableName, fileName, options = {}) {
+  const { select = ['*'], ...tableOptions } = options;
+  const query = `SELECT ${select.join(', ')} FROM read_parquet('${fileName}')`;
+  return createTable(db, tableName, query, tableOptions);
+}
package/src/merge-buffers.js CHANGED
@@ -1,10 +1,12 @@
 export function mergeBuffers(buffers) {
-  const len = buffers.reduce((a, b) => a + b.length, 0);
+  const len = buffers.reduce((a, b) => a + (b ? b.length : 0), 0);
   const buf = new Uint8Array(len);
 
   for (let i = 0, offset = 0; i < buffers.length; ++i) {
-    buf.set(buffers[i], offset);
-    offset += buffers[i].length;
+    if (buffers[i]) {
+      buf.set(buffers[i], offset);
+      offset += buffers[i].length;
+    }
   }
 
   return buf;
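The change makes mergeBuffers tolerate null or undefined entries in the chunk list, e.g.:

mergeBuffers([Uint8Array.of(1, 2), null, Uint8Array.of(3)]); // Uint8Array [ 1, 2, 3 ]
// previously this threw, since .length was read from the missing chunk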