eigen-db 4.1.0 → 4.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,6 +8,7 @@
8
8
  * - set/get/setMany/getMany for key-value CRUD
9
9
  * - query for similarity search (dot product on normalized vectors)
10
10
  * - flush to persist, close to flush+release, clear to wipe
11
+ * - Streaming export/import using Web Streams API
11
12
  * - Last-write-wins semantics for duplicate keys (append-only storage)
12
13
  */
13
14
 
@@ -19,13 +20,21 @@ import type { ResultItem } from "./result-set";
19
20
  import { iterableResults, topKResults } from "./result-set";
20
21
  import { getSimdWasmBinary } from "./simd-binary";
21
22
  import type { StorageProvider } from "./storage";
22
- import { OPFSStorageProvider } from "./storage";
23
+ import { InMemoryStorageProvider } from "./storage";
23
24
  import type { OpenOptions, OpenOptionsInternal, QueryOptions, SetOptions, VectorInput } from "./types";
24
25
  import { instantiateWasm, type WasmExports } from "./wasm-compute";
25
26
 
26
27
  const VECTORS_FILE = "vectors.bin";
27
28
  const KEYS_FILE = "keys.bin";
28
29
 
30
+ /** Binary export format magic bytes: "EGDB" */
31
+ const EXPORT_MAGIC = 0x42444745; // "EGDB" in little-endian
32
+ const EXPORT_VERSION = 1;
33
+ const EXPORT_HEADER_BYTES = 24;
34
+
35
+ /** Chunk size for streaming export/import (64KB, matches WASM page size) */
36
+ const STREAM_CHUNK_SIZE = 65536;
37
+
29
38
  export class VectorDB {
30
39
  private readonly memoryManager: MemoryManager;
31
40
  private readonly storage: StorageProvider;
@@ -67,8 +76,7 @@ export class VectorDB {
67
76
  static async open(options: OpenOptions): Promise<VectorDB>;
68
77
  static async open(options: OpenOptionsInternal): Promise<VectorDB>;
69
78
  static async open(options: OpenOptionsInternal): Promise<VectorDB> {
70
- const name = options.name ?? "default";
71
- const storage = options.storage ?? new OPFSStorageProvider(name);
79
+ const storage = options.storage ?? new InMemoryStorageProvider();
72
80
  const shouldNormalize = options.normalize !== false;
73
81
 
74
82
  // Load existing data from storage
@@ -185,12 +193,13 @@ export class VectorDB {
185
193
  /**
186
194
  * Search for the most similar vectors to the given query vector.
187
195
  *
188
- * Default: returns a plain ResultItem[] sorted by ascending distance.
196
+ * Default: returns a plain ResultItem[] sorted by descending similarity.
189
197
  * With `{ iterable: true }`: returns a lazy Iterable<ResultItem> where keys
190
198
  * are resolved only as each item is consumed.
191
199
  *
192
- * Distance is defined as `1 - dotProduct`. With normalization (default),
193
- * this equals cosine distance: 0 = identical, 2 = opposite.
200
+ * Similarity is the dot product of query and stored vectors. With
201
+ * normalization (default), this equals cosine similarity: 1 = identical,
202
+ * -1 = opposite.
194
203
  */
195
204
  query(value: VectorInput, options: QueryOptions & { iterable: true }): Iterable<ResultItem>;
196
205
  query(value: VectorInput, options?: QueryOptions): ResultItem[];
@@ -198,7 +207,7 @@ export class VectorDB {
198
207
  this.assertOpen();
199
208
 
200
209
  const k = options?.topK ?? Infinity;
201
- const maxDistance = options?.maxDistance;
210
+ const minSimilarity = options?.minSimilarity;
202
211
  const iterable = options && "iterable" in options && options.iterable;
203
212
 
204
213
  if (this.size === 0) {
@@ -260,9 +269,9 @@ export class VectorDB {
260
269
  };
261
270
 
262
271
  if (iterable) {
263
- return iterableResults(scores, resolveKey, k, maxDistance);
272
+ return iterableResults(scores, resolveKey, k, minSimilarity);
264
273
  }
265
- return topKResults(scores, resolveKey, k, maxDistance);
274
+ return topKResults(scores, resolveKey, k, minSimilarity);
266
275
  }
267
276
 
268
277
  /**
@@ -313,6 +322,140 @@ export class VectorDB {
313
322
  await this.storage.destroy();
314
323
  }
315
324
 
325
/**
 * Serialize the whole database into a ReadableStream of binary chunks.
 *
 * Layout: [24-byte header][vector data][keys data]. The header holds six
 * little-endian uint32 fields, in order: magic, version, dimensions,
 * vectorCount, vectorDataLen, keysDataLen.
 *
 * Every pull emits exactly one chunk: first the header, then the vector
 * bytes copied out of WASM memory in pieces of at most STREAM_CHUNK_SIZE
 * bytes, and finally the encoded keys. The stream is closed as soon as the
 * last non-empty section has been emitted.
 *
 * @returns A stream yielding the header, the vector bytes and the key bytes.
 */
async export(): Promise<ReadableStream<Uint8Array>> {
  this.assertOpen();

  type Phase = "header" | "vectors" | "keys" | "done";

  const manager = this.memoryManager;
  const vectorCount = manager.vectorCount;
  const vectorBytes = vectorCount * this.dimensions * 4;

  // Keys are encoded up front; vector bytes are read lazily on each pull.
  const keysBytes = encodeLexicon(this.slotToKey);
  const keysLen = keysBytes.byteLength;

  // Write the six header fields as consecutive little-endian uint32 values.
  const headerBuffer = new ArrayBuffer(EXPORT_HEADER_BYTES);
  const headerFields = [EXPORT_MAGIC, EXPORT_VERSION, this.dimensions, vectorCount, vectorBytes, keysLen];
  const view = new DataView(headerBuffer);
  headerFields.forEach((field, index) => view.setUint32(index * 4, field, true));

  // Empty sections are skipped, so work out the successor phases once.
  const afterVectors: Phase = keysLen > 0 ? "keys" : "done";
  const afterHeader: Phase = vectorBytes > 0 ? "vectors" : afterVectors;

  let phase: Phase = "header";
  let cursor = 0;

  return new ReadableStream<Uint8Array>({
    pull(controller) {
      // Move to the next phase, closing the stream once nothing is left.
      const advance = (next: Phase): void => {
        phase = next;
        if (next === "done") controller.close();
      };

      if (phase === "header") {
        controller.enqueue(new Uint8Array(headerBuffer));
        advance(afterHeader);
        return;
      }

      if (phase === "vectors") {
        const length = Math.min(vectorBytes - cursor, STREAM_CHUNK_SIZE);
        // slice() copies the bytes, so the enqueued chunk does not alias
        // the memory buffer it was read from.
        const chunk = new Uint8Array(manager.memory.buffer, manager.dbOffset + cursor, length).slice();
        controller.enqueue(chunk);
        cursor += length;
        if (cursor >= vectorBytes) advance(afterVectors);
        return;
      }

      if (phase === "keys") {
        controller.enqueue(keysBytes);
        advance("done");
      }
    },
  });
}
390
+
391
/**
 * Import data from a ReadableStream, replacing all existing data.
 *
 * Expects the layout written by `export()`: a 24-byte little-endian header
 * (magic, version, dimensions, vectorCount, vectorDataLen, keysDataLen)
 * followed by the vector bytes and then the encoded keys.
 *
 * The header is validated in full before any existing data is touched.
 * Vector bytes are then copied chunk by chunk into the memory managed by
 * `memoryManager`, so no single buffer for the whole payload is allocated.
 * The reader lock on `stream` is released on both success and failure.
 *
 * NOTE: existing data is cleared before the body of the stream is read, so
 * an import that fails part-way does not restore the previous contents.
 *
 * @param stream - Binary chunks in the export format.
 * @throws Error if the magic or version is not recognized, the dimensions
 *   differ from this database, the header lengths are inconsistent, or the
 *   stream ends before all announced bytes were read.
 */
async import(stream: ReadableStream<Uint8Array>): Promise<void> {
  this.assertOpen();

  const sr = new StreamReader(stream);
  try {
    // Read and validate header
    const headerBytes = await sr.readExact(EXPORT_HEADER_BYTES);
    const headerView = new DataView(headerBytes.buffer, headerBytes.byteOffset, headerBytes.byteLength);

    const magic = headerView.getUint32(0, true);
    if (magic !== EXPORT_MAGIC) {
      throw new Error("Invalid import data: unrecognized format");
    }

    const version = headerView.getUint32(4, true);
    if (version !== EXPORT_VERSION) {
      throw new Error(`Unsupported import version: expected ${EXPORT_VERSION}, got ${version}`);
    }

    const dimensions = headerView.getUint32(8, true);
    if (dimensions !== this.dimensions) {
      throw new Error(`Import dimension mismatch: expected ${this.dimensions}, got ${dimensions}`);
    }

    const vectorCount = headerView.getUint32(12, true);
    const vectorDataLen = headerView.getUint32(16, true);
    const keysDataLen = headerView.getUint32(20, true);

    // Capacity below is reserved from vectorCount, but the number of bytes
    // copied comes from vectorDataLen. Reject headers where the two disagree
    // so a corrupt file can neither write past the reserved region nor leave
    // vector bytes unread (which would then be misparsed as keys).
    const expectedVectorBytes = vectorCount * dimensions * 4;
    if (vectorDataLen !== expectedVectorBytes) {
      throw new Error(
        `Invalid import data: vector data length ${vectorDataLen} does not match ` +
          `${vectorCount} vectors of ${dimensions} dimensions`,
      );
    }

    // Clear existing state (only after the header has been accepted)
    this.keyToSlot.clear();
    this.slotToKey.length = 0;
    this.memoryManager.reset();

    // Stream vectors directly into memory in chunks
    if (vectorCount > 0) {
      this.memoryManager.ensureCapacity(vectorCount);

      let offset = 0;
      await sr.readChunked(vectorDataLen, (chunk) => {
        // memory.buffer is looked up for every chunk rather than cached.
        new Uint8Array(this.memoryManager.memory.buffer, this.memoryManager.dbOffset + offset, chunk.byteLength).set(
          chunk,
        );
        offset += chunk.byteLength;
      });
    }

    // Read and decode keys (typically much smaller than vectors)
    if (keysDataLen > 0) {
      const keysBytes = await sr.readExact(keysDataLen);
      const keys = decodeLexicon(keysBytes);
      for (let i = 0; i < keys.length; i++) {
        this.keyToSlot.set(keys[i], i);
        this.slotToKey[i] = keys[i];
      }
    }

    // Commit the vector count last: if reading or decoding the keys fails,
    // the database is not left exposing vectors that have no keys.
    // NOTE(review): assumes memoryManager.reset() leaves the count at zero — confirm.
    if (vectorCount > 0) {
      this.memoryManager.setVectorCount(vectorCount);
    }
  } finally {
    // Always hand the stream back unlocked, including on validation or read errors.
    sr.release();
  }
}
458
+
316
459
  /**
317
460
  * Normalize a vector using WASM (if available) or JS fallback.
318
461
  */
@@ -334,3 +477,92 @@ export class VectorDB {
334
477
  }
335
478
  }
336
479
  }
480
+
481
/**
 * Helper for reading exact byte counts from a ReadableStream.
 *
 * Stream chunks rarely line up with the sizes a parser wants, so bytes that
 * were pulled but not yet requested are kept in an internal buffer and served
 * first on the next call. Chunk boundaries are therefore invisible to callers.
 *
 * The reader lock taken in the constructor is released automatically when
 * the stream ends prematurely or a read fails; otherwise call `release()`.
 */
class StreamReader {
  private readonly reader: ReadableStreamDefaultReader<Uint8Array>;
  /** Bytes already pulled from the stream but not yet handed to a caller. */
  private buffer: Uint8Array = new Uint8Array(0);

  /**
   * @param stream - Stream to read from; it stays locked until `release()`.
   */
  constructor(stream: ReadableStream<Uint8Array>) {
    this.reader = stream.getReader();
  }

  /**
   * Read exactly `n` bytes, accumulating from stream chunks as needed.
   *
   * @param n - Number of bytes to read; must be a non-negative integer.
   * @returns A view of exactly `n` bytes.
   * @throws RangeError if `n` is not a non-negative integer.
   * @throws Error if the stream ends before `n` bytes are available.
   */
  async readExact(n: number): Promise<Uint8Array> {
    StreamReader.assertByteCount(n);
    if (n === 0) return new Uint8Array(0);

    // Fast path: buffer already has enough data
    if (this.buffer.byteLength >= n) {
      const result = this.buffer.subarray(0, n);
      this.buffer = this.buffer.subarray(n);
      return result;
    }

    // Collect chunks (starting with any leftover) until we have enough bytes
    const chunks: Uint8Array[] = [];
    let total = this.buffer.byteLength;
    if (total > 0) {
      chunks.push(this.buffer);
      this.buffer = new Uint8Array(0);
    }

    while (total < n) {
      const chunk = await this.nextChunk();
      chunks.push(chunk);
      total += chunk.byteLength;
    }

    // Combine into a single buffer
    const combined = new Uint8Array(total);
    let offset = 0;
    for (const chunk of chunks) {
      combined.set(chunk, offset);
      offset += chunk.byteLength;
    }

    // Anything past the requested length becomes the new leftover.
    if (total > n) {
      this.buffer = combined.subarray(n);
      return combined.subarray(0, n);
    }
    return combined;
  }

  /**
   * Read `totalBytes` from the stream in chunks, calling `onChunk` for each.
   * Avoids allocating a single large buffer for the full data.
   *
   * @param totalBytes - Number of bytes to deliver; a non-negative integer.
   * @param onChunk - Called with each non-empty piece, in stream order. The
   *   pieces together contain exactly `totalBytes` bytes.
   * @throws RangeError if `totalBytes` is not a non-negative integer.
   * @throws Error if the stream ends before `totalBytes` bytes were delivered.
   */
  async readChunked(totalBytes: number, onChunk: (chunk: Uint8Array) => void): Promise<void> {
    StreamReader.assertByteCount(totalBytes);
    let remaining = totalBytes;

    // Drain buffered leftover first
    const take = Math.min(this.buffer.byteLength, remaining);
    if (take > 0) {
      onChunk(this.buffer.subarray(0, take));
      remaining -= take;
      this.buffer = this.buffer.subarray(take);
    }

    while (remaining > 0) {
      const value = await this.nextChunk();
      if (value.byteLength === 0) continue; // nothing to deliver

      if (value.byteLength <= remaining) {
        onChunk(value);
        remaining -= value.byteLength;
      } else {
        // The chunk straddles the end of the requested range: deliver the
        // head and keep a copy of the tail for the next read.
        onChunk(value.subarray(0, remaining));
        this.buffer = value.slice(remaining);
        remaining = 0;
      }
    }
  }

  /** Release the stream reader lock. Safe to call more than once. */
  release(): void {
    this.reader.releaseLock();
  }

  /**
   * Pull the next chunk from the underlying stream.
   *
   * Releases the reader lock before throwing, so a truncated or errored
   * stream is not left locked by a failed parse.
   */
  private async nextChunk(): Promise<Uint8Array> {
    let result: ReadableStreamReadResult<Uint8Array>;
    try {
      result = await this.reader.read();
    } catch (err) {
      this.release();
      throw err;
    }
    if (result.done) {
      this.release();
      throw new Error("Invalid import data: unexpected end of stream");
    }
    return result.value;
  }

  /** Reject byte counts that would turn into negative or fractional indices. */
  private static assertByteCount(count: number): void {
    if (!Number.isInteger(count) || count < 0) {
      throw new RangeError(`Invalid byte count: ${count}`);
    }
  }
}