eigen-db 4.1.0 → 4.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +8 -0
- package/README.md +79 -27
- package/dist/eigen-db.js +317 -195
- package/dist/eigen-db.js.map +1 -1
- package/dist/eigen-db.umd.cjs +1 -1
- package/dist/eigen-db.umd.cjs.map +1 -1
- package/package.json +1 -1
- package/src/lib/__tests__/result-set.test.ts +19 -19
- package/src/lib/__tests__/vector-db.test.ts +429 -16
- package/src/lib/memory-manager.ts +8 -0
- package/src/lib/result-set.ts +16 -15
- package/src/lib/simd-binary.ts +1 -1
- package/src/lib/simd-optimized.wat +362 -0
- package/src/lib/simd.wat +42 -248
- package/src/lib/types.ts +4 -6
- package/src/lib/vector-db.ts +241 -9
package/src/lib/vector-db.ts
CHANGED
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
* - set/get/setMany/getMany for key-value CRUD
|
|
9
9
|
* - query for similarity search (dot product on normalized vectors)
|
|
10
10
|
* - flush to persist, close to flush+release, clear to wipe
|
|
11
|
+
* - Streaming export/import using Web Streams API
|
|
11
12
|
* - Last-write-wins semantics for duplicate keys (append-only storage)
|
|
12
13
|
*/
|
|
13
14
|
|
|
@@ -19,13 +20,21 @@ import type { ResultItem } from "./result-set";
|
|
|
19
20
|
import { iterableResults, topKResults } from "./result-set";
|
|
20
21
|
import { getSimdWasmBinary } from "./simd-binary";
|
|
21
22
|
import type { StorageProvider } from "./storage";
|
|
22
|
-
import {
|
|
23
|
+
import { InMemoryStorageProvider } from "./storage";
|
|
23
24
|
import type { OpenOptions, OpenOptionsInternal, QueryOptions, SetOptions, VectorInput } from "./types";
|
|
24
25
|
import { instantiateWasm, type WasmExports } from "./wasm-compute";
|
|
25
26
|
|
|
26
27
|
const VECTORS_FILE = "vectors.bin";
|
|
27
28
|
const KEYS_FILE = "keys.bin";
|
|
28
29
|
|
|
30
|
+
/** Magic number that opens every binary export; its little-endian bytes spell "EGDB". */
const EXPORT_MAGIC = 0x42444745;
/** Version number written into (and required from) the export header. */
const EXPORT_VERSION = 1;
/** Size of the export header: six little-endian uint32 fields. */
const EXPORT_HEADER_BYTES = 6 * 4;

/** Chunk size for streaming export/import (64KB, matches WASM page size) */
const STREAM_CHUNK_SIZE = 64 * 1024;
|
|
37
|
+
|
|
29
38
|
export class VectorDB {
|
|
30
39
|
private readonly memoryManager: MemoryManager;
|
|
31
40
|
private readonly storage: StorageProvider;
|
|
@@ -67,8 +76,7 @@ export class VectorDB {
|
|
|
67
76
|
static async open(options: OpenOptions): Promise<VectorDB>;
|
|
68
77
|
static async open(options: OpenOptionsInternal): Promise<VectorDB>;
|
|
69
78
|
static async open(options: OpenOptionsInternal): Promise<VectorDB> {
|
|
70
|
-
const
|
|
71
|
-
const storage = options.storage ?? new OPFSStorageProvider(name);
|
|
79
|
+
const storage = options.storage ?? new InMemoryStorageProvider();
|
|
72
80
|
const shouldNormalize = options.normalize !== false;
|
|
73
81
|
|
|
74
82
|
// Load existing data from storage
|
|
@@ -185,12 +193,13 @@ export class VectorDB {
|
|
|
185
193
|
/**
|
|
186
194
|
* Search for the most similar vectors to the given query vector.
|
|
187
195
|
*
|
|
188
|
-
* Default: returns a plain ResultItem[] sorted by
|
|
196
|
+
* Default: returns a plain ResultItem[] sorted by descending similarity.
|
|
189
197
|
* With `{ iterable: true }`: returns a lazy Iterable<ResultItem> where keys
|
|
190
198
|
* are resolved only as each item is consumed.
|
|
191
199
|
*
|
|
192
|
-
*
|
|
193
|
-
* this equals cosine
|
|
200
|
+
* Similarity is the dot product of query and stored vectors. With
|
|
201
|
+
* normalization (default), this equals cosine similarity: 1 = identical,
|
|
202
|
+
* -1 = opposite.
|
|
194
203
|
*/
|
|
195
204
|
query(value: VectorInput, options: QueryOptions & { iterable: true }): Iterable<ResultItem>;
|
|
196
205
|
query(value: VectorInput, options?: QueryOptions): ResultItem[];
|
|
@@ -198,7 +207,7 @@ export class VectorDB {
|
|
|
198
207
|
this.assertOpen();
|
|
199
208
|
|
|
200
209
|
const k = options?.topK ?? Infinity;
|
|
201
|
-
const
|
|
210
|
+
const minSimilarity = options?.minSimilarity;
|
|
202
211
|
const iterable = options && "iterable" in options && options.iterable;
|
|
203
212
|
|
|
204
213
|
if (this.size === 0) {
|
|
@@ -260,9 +269,9 @@ export class VectorDB {
|
|
|
260
269
|
};
|
|
261
270
|
|
|
262
271
|
if (iterable) {
|
|
263
|
-
return iterableResults(scores, resolveKey, k,
|
|
272
|
+
return iterableResults(scores, resolveKey, k, minSimilarity);
|
|
264
273
|
}
|
|
265
|
-
return topKResults(scores, resolveKey, k,
|
|
274
|
+
return topKResults(scores, resolveKey, k, minSimilarity);
|
|
266
275
|
}
|
|
267
276
|
|
|
268
277
|
/**
|
|
@@ -313,6 +322,140 @@ export class VectorDB {
|
|
|
313
322
|
await this.storage.destroy();
|
|
314
323
|
}
|
|
315
324
|
|
|
325
|
+
/**
 * Produce a ReadableStream that emits the whole database in binary form.
 *
 * Layout: [24-byte header][vector data][keys data]
 * Header fields (little-endian uint32 each): magic, version, dimensions,
 * vectorCount, vectorDataLen, keysDataLen.
 *
 * The header and the encoded keys are prepared up front; vector bytes are
 * copied out of WASM memory lazily, at most 64KB per pull, so no large
 * intermediate buffer is allocated.
 *
 * @returns A stream yielding the header, then vector chunks, then the keys.
 */
async export(): Promise<ReadableStream<Uint8Array>> {
  this.assertOpen();

  type Stage = "header" | "vectors" | "keys" | "done";

  const vectorCount = this.memoryManager.vectorCount;
  const vectorBytes = vectorCount * this.dimensions * 4;

  // Keys are encoded eagerly (typically much smaller than the vectors).
  const encodedKeys = encodeLexicon(this.slotToKey);
  const keyBytes = encodedKeys.byteLength;

  // Header: six consecutive little-endian uint32 fields.
  const headerBuffer = new ArrayBuffer(EXPORT_HEADER_BYTES);
  const headerWriter = new DataView(headerBuffer);
  const headerFields = [EXPORT_MAGIC, EXPORT_VERSION, this.dimensions, vectorCount, vectorBytes, keyBytes];
  headerFields.forEach((field, index) => {
    headerWriter.setUint32(index * 4, field, true);
  });

  const manager = this.memoryManager;
  let stage: Stage = "header";
  let sentVectorBytes = 0;

  // Stage that follows the vector section (keys are skipped when empty).
  const stageAfterVectors = (): Stage => (keyBytes > 0 ? "keys" : "done");

  // Switch stage, closing the stream once nothing is left to emit.
  const enter = (next: Stage, controller: ReadableStreamDefaultController<Uint8Array>): void => {
    stage = next;
    if (next === "done") {
      controller.close();
    }
  };

  return new ReadableStream<Uint8Array>({
    pull(controller) {
      if (stage === "header") {
        controller.enqueue(new Uint8Array(headerBuffer));
        enter(vectorBytes > 0 ? "vectors" : stageAfterVectors(), controller);
        return;
      }

      if (stage === "vectors") {
        const length = Math.min(vectorBytes - sentVectorBytes, STREAM_CHUNK_SIZE);
        // The memory buffer is looked up on every pull; the chunk is a copy of the live view.
        const live = new Uint8Array(manager.memory.buffer, manager.dbOffset + sentVectorBytes, length);
        controller.enqueue(live.slice());
        sentVectorBytes += length;
        if (sentVectorBytes >= vectorBytes) {
          enter(stageAfterVectors(), controller);
        }
        return;
      }

      if (stage === "keys") {
        controller.enqueue(encodedKeys);
        enter("done", controller);
      }
    },
  });
}
|
|
390
|
+
|
|
391
|
+
/**
 * Import data from a ReadableStream, replacing all existing data.
 *
 * The header is fully validated (magic, version, dimensions, and that the
 * declared vector byte length equals vectorCount * dimensions * 4) before
 * any existing data is touched, so a rejected header leaves the database
 * unchanged.
 *
 * Vectors are streamed directly into WASM memory in chunks to avoid
 * large heap allocations.
 *
 * If reading fails after the existing data has been cleared (for example
 * the stream ends early or the key count does not match the vector count),
 * the in-memory state is cleared again so the database is never left with
 * vectors that have no keys. The reader lock on `stream` is released on
 * both success and failure.
 *
 * @param stream Binary data in the format produced by `export()`.
 * @throws Error if the header is invalid or inconsistent, the dimensions do
 *   not match this database, or the stream ends before all data is read.
 */
async import(stream: ReadableStream<Uint8Array>): Promise<void> {
  this.assertOpen();

  const sr = new StreamReader(stream);
  try {
    // Read and validate header
    const headerBytes = await sr.readExact(EXPORT_HEADER_BYTES);
    const headerView = new DataView(headerBytes.buffer, headerBytes.byteOffset, headerBytes.byteLength);

    const magic = headerView.getUint32(0, true);
    if (magic !== EXPORT_MAGIC) {
      throw new Error("Invalid import data: unrecognized format");
    }

    const version = headerView.getUint32(4, true);
    if (version !== EXPORT_VERSION) {
      throw new Error(`Unsupported import version: expected ${EXPORT_VERSION}, got ${version}`);
    }

    const dimensions = headerView.getUint32(8, true);
    if (dimensions !== this.dimensions) {
      throw new Error(`Import dimension mismatch: expected ${this.dimensions}, got ${dimensions}`);
    }

    const vectorCount = headerView.getUint32(12, true);
    const vectorDataLen = headerView.getUint32(16, true);
    const keysDataLen = headerView.getUint32(20, true);

    // The vector section must be exactly vectorCount float32 vectors; otherwise the
    // copy below would overrun the ensured capacity or the key read would be misaligned.
    const expectedVectorDataLen = vectorCount * dimensions * 4;
    if (vectorDataLen !== expectedVectorDataLen) {
      throw new Error(
        `Invalid import data: header declares ${vectorDataLen} bytes of vector data, expected ${expectedVectorDataLen}`,
      );
    }

    const clearState = (): void => {
      this.keyToSlot.clear();
      this.slotToKey.length = 0;
      this.memoryManager.reset();
    };

    // Clear existing state (point of no return for the previous contents)
    clearState();

    try {
      // Stream vectors directly into WASM memory in chunks
      if (vectorCount > 0) {
        this.memoryManager.ensureCapacity(vectorCount);

        let offset = 0;
        await sr.readChunked(vectorDataLen, (chunk) => {
          // The memory buffer is looked up per chunk rather than cached across awaits.
          new Uint8Array(this.memoryManager.memory.buffer, this.memoryManager.dbOffset + offset, chunk.byteLength).set(
            chunk,
          );
          offset += chunk.byteLength;
        });

        this.memoryManager.setVectorCount(vectorCount);
      }

      // Read and decode keys (typically much smaller than vectors)
      const keys = keysDataLen > 0 ? decodeLexicon(await sr.readExact(keysDataLen)) : undefined;
      const keyCount = keys ? keys.length : 0;
      if (keyCount !== vectorCount) {
        throw new Error(`Invalid import data: expected ${vectorCount} keys, got ${keyCount}`);
      }

      if (keys) {
        for (let i = 0; i < keys.length; i++) {
          this.keyToSlot.set(keys[i], i);
          this.slotToKey[i] = keys[i];
        }
      }
    } catch (error) {
      // Never leave a half-imported database behind.
      clearState();
      throw error;
    }
  } finally {
    // Always unlock the caller's stream, including on validation errors.
    sr.release();
  }
}
|
|
458
|
+
|
|
316
459
|
/**
|
|
317
460
|
* Normalize a vector using WASM (if available) or JS fallback.
|
|
318
461
|
*/
|
|
@@ -334,3 +477,92 @@ export class VectorDB {
|
|
|
334
477
|
}
|
|
335
478
|
}
|
|
336
479
|
}
|
|
480
|
+
|
|
481
|
+
/**
 * Helper for reading exact byte counts from a ReadableStream.
 * Handles chunk boundaries transparently: bytes pulled from the stream but
 * not yet requested are kept in an internal buffer and served first on the
 * next read.
 *
 * Constructing a StreamReader locks the stream; call `release()` when done.
 */
class StreamReader {
  private readonly reader: ReadableStreamDefaultReader<Uint8Array>;
  /** Bytes already pulled from the stream but not yet handed to a caller. */
  private buffer: Uint8Array = new Uint8Array(0);

  constructor(stream: ReadableStream<Uint8Array>) {
    this.reader = stream.getReader();
  }

  /**
   * Read exactly `n` bytes, accumulating from stream chunks as needed.
   *
   * The returned array may be a view onto a larger internal buffer.
   *
   * @param n Number of bytes to read; must be a non-negative integer.
   * @returns Exactly `n` bytes.
   * @throws RangeError if `n` is negative or not an integer.
   * @throws Error if the stream ends before `n` bytes are available.
   */
  async readExact(n: number): Promise<Uint8Array> {
    StreamReader.assertByteCount(n, "n");
    if (n === 0) return new Uint8Array(0);

    // Fast path: buffer already has enough data
    if (this.buffer.byteLength >= n) {
      const result = this.buffer.subarray(0, n);
      this.buffer = this.buffer.subarray(n);
      return result;
    }

    // Collect chunks until we have enough bytes
    const chunks: Uint8Array[] = [];
    let total = this.buffer.byteLength;
    if (total > 0) {
      chunks.push(this.buffer);
      this.buffer = new Uint8Array(0);
    }

    while (total < n) {
      const chunk = await this.nextChunk();
      chunks.push(chunk);
      total += chunk.byteLength;
    }

    // Combine into a single buffer
    const combined = new Uint8Array(total);
    let offset = 0;
    for (const chunk of chunks) {
      combined.set(chunk, offset);
      offset += chunk.byteLength;
    }

    if (total > n) {
      // Keep the surplus for the next read.
      this.buffer = combined.subarray(n);
      return combined.subarray(0, n);
    }
    return combined;
  }

  /**
   * Read `totalBytes` from the stream in chunks, calling `onChunk` for each.
   * Avoids allocating a single large buffer for the full data.
   *
   * `onChunk` is only ever called with non-empty chunks, and the chunk sizes
   * sum to exactly `totalBytes`. Chunks may be views onto stream-owned
   * buffers, so callers should copy out what they need.
   *
   * @param totalBytes Number of bytes to consume; must be a non-negative integer.
   * @param onChunk Callback invoked synchronously for each piece of data, in order.
   * @throws RangeError if `totalBytes` is negative or not an integer.
   * @throws Error if the stream ends before `totalBytes` bytes are available.
   */
  async readChunked(totalBytes: number, onChunk: (chunk: Uint8Array) => void): Promise<void> {
    StreamReader.assertByteCount(totalBytes, "totalBytes");
    let remaining = totalBytes;

    // Drain buffered leftover first
    if (remaining > 0 && this.buffer.byteLength > 0) {
      const take = Math.min(this.buffer.byteLength, remaining);
      onChunk(this.buffer.subarray(0, take));
      remaining -= take;
      this.buffer = this.buffer.subarray(take);
    }

    while (remaining > 0) {
      const value = await this.nextChunk();

      if (value.byteLength <= remaining) {
        onChunk(value);
        remaining -= value.byteLength;
      } else {
        onChunk(value.subarray(0, remaining));
        // slice() copies the surplus into its own buffer for the next read.
        this.buffer = value.slice(remaining);
        remaining = 0;
      }
    }
  }

  /** Release the stream reader lock. */
  release(): void {
    this.reader.releaseLock();
  }

  /** Pull the next non-empty chunk from the stream, failing if the stream has ended. */
  private async nextChunk(): Promise<Uint8Array> {
    for (;;) {
      const { done, value } = await this.reader.read();
      if (done) throw new Error("Invalid import data: unexpected end of stream");
      // Zero-length chunks carry no data; skip them instead of surfacing them to callers.
      if (value.byteLength > 0) return value;
    }
  }

  /** Reject byte counts that would make subarray arithmetic silently misbehave. */
  private static assertByteCount(count: number, label: string): void {
    if (!Number.isInteger(count) || count < 0) {
      throw new RangeError(`Invalid byte count for ${label}: ${count}`);
    }
  }
}
|