npm - eigen-db - Versions diffs - 4.1.0 → 4.3.0 - Mend

eigen-db 4.1.0 → 4.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

package/CHANGELOG.md +8 -0
package/README.md +79 -27
package/dist/eigen-db.js +317 -195
package/dist/eigen-db.js.map +1 -1
package/dist/eigen-db.umd.cjs +1 -1
package/dist/eigen-db.umd.cjs.map +1 -1
package/package.json +1 -1
package/src/lib/__tests__/result-set.test.ts +19 -19
package/src/lib/__tests__/vector-db.test.ts +429 -16
package/src/lib/memory-manager.ts +8 -0
package/src/lib/result-set.ts +16 -15
package/src/lib/simd-binary.ts +1 -1
package/src/lib/simd-optimized.wat +362 -0
package/src/lib/simd.wat +42 -248
package/src/lib/types.ts +4 -6
package/src/lib/vector-db.ts +241 -9

package/src/lib/vector-db.ts CHANGED Viewed

@@ -8,6 +8,7 @@
  * - set/get/setMany/getMany for key-value CRUD
  * - query for similarity search (dot product on normalized vectors)
  * - flush to persist, close to flush+release, clear to wipe
+ * - Streaming export/import using Web Streams API
  * - Last-write-wins semantics for duplicate keys (append-only storage)
  */
@@ -19,13 +20,21 @@ import type { ResultItem } from "./result-set";
 import { iterableResults, topKResults } from "./result-set";
 import { getSimdWasmBinary } from "./simd-binary";
 import type { StorageProvider } from "./storage";
-import { OPFSStorageProvider } from "./storage";
+import { InMemoryStorageProvider } from "./storage";
 import type { OpenOptions, OpenOptionsInternal, QueryOptions, SetOptions, VectorInput } from "./types";
 import { instantiateWasm, type WasmExports } from "./wasm-compute";
 const VECTORS_FILE = "vectors.bin";
 const KEYS_FILE = "keys.bin";
+/**
+ * Binary export format magic: the ASCII bytes "EGDB" (0x45 0x47 0x44 0x42).
+ * It is written to and read from the header as a little-endian uint32, which
+ * is why the numeric literal below lists the bytes in reverse order.
+ */
+const EXPORT_MAGIC = 0x42444745; // "EGDB" in little-endian
+const EXPORT_VERSION = 1; // format version stored in the header; import() rejects any other value
+const EXPORT_HEADER_BYTES = 24; // six uint32 header fields, 4 bytes each
+/**
+ * Chunk size in bytes (64 KiB, the size of one WASM memory page) used by
+ * export() when copying vector data out of WASM memory.
+ */
+const STREAM_CHUNK_SIZE = 65536;
 export class VectorDB {
   private readonly memoryManager: MemoryManager;
   private readonly storage: StorageProvider;
@@ -67,8 +76,7 @@ export class VectorDB {
   static async open(options: OpenOptions): Promise<VectorDB>;
   static async open(options: OpenOptionsInternal): Promise<VectorDB>;
   static async open(options: OpenOptionsInternal): Promise<VectorDB> {
-    const name = options.name ?? "default";
-    const storage = options.storage ?? new OPFSStorageProvider(name);
+    const storage = options.storage ?? new InMemoryStorageProvider();
     const shouldNormalize = options.normalize !== false;
     // Load existing data from storage
@@ -185,12 +193,13 @@ export class VectorDB {
   /**
    * Search for the most similar vectors to the given query vector.
    *
-   * Default: returns a plain ResultItem[] sorted by ascending distance.
+   * Default: returns a plain ResultItem[] sorted by descending similarity.
    * With `{ iterable: true }`: returns a lazy Iterable<ResultItem> where keys
    * are resolved only as each item is consumed.
    *
-   * Distance is defined as `1 - dotProduct`. With normalization (default),
-   * this equals cosine distance: 0 = identical, 2 = opposite.
+   * Similarity is the dot product of query and stored vectors. With
+   * normalization (default), this equals cosine similarity: 1 = identical,
+   * -1 = opposite.
    */
   query(value: VectorInput, options: QueryOptions & { iterable: true }): Iterable<ResultItem>;
   query(value: VectorInput, options?: QueryOptions): ResultItem[];
@@ -198,7 +207,7 @@ export class VectorDB {
     this.assertOpen();
     const k = options?.topK ?? Infinity;
-    const maxDistance = options?.maxDistance;
+    const minSimilarity = options?.minSimilarity;
     const iterable = options && "iterable" in options && options.iterable;
     if (this.size === 0) {
@@ -260,9 +269,9 @@ export class VectorDB {
     };
     if (iterable) {
-      return iterableResults(scores, resolveKey, k, maxDistance);
+      return iterableResults(scores, resolveKey, k, minSimilarity);
     }
-    return topKResults(scores, resolveKey, k, maxDistance);
+    return topKResults(scores, resolveKey, k, minSimilarity);
   }
   /**
@@ -313,6 +322,140 @@ export class VectorDB {
     await this.storage.destroy();
   }
+  /**
+   * Export the entire database as a ReadableStream of binary chunks.
+   *
+   * Format: [Header 24 bytes][Vector data][Keys data]
+   * Header: magic(4) + version(4) + dimensions(4) + vectorCount(4) + vectorDataLen(4) + keysDataLen(4)
+   * Every header field is written as a little-endian uint32.
+   *
+   * Vectors are streamed in 64KB chunks from WASM memory to avoid large
+   * heap allocations. Each chunk is copied out of WASM memory, so enqueued
+   * chunks do not alias the live memory buffer.
+   *
+   * The stream yields, in order: one 24-byte header chunk, zero or more
+   * vector chunks of at most STREAM_CHUNK_SIZE bytes, and one keys chunk
+   * (omitted when there are no key bytes); it then closes.
+   *
+   * NOTE(review): the header and the encoded keys are captured when export()
+   * is called, but vector bytes are read from WASM memory lazily on each
+   * pull. Presumably the database must not be mutated until the stream has
+   * been fully consumed - confirm against callers.
+   *
+   * @returns A ReadableStream producing the serialized database.
+   */
+  async export(): Promise<ReadableStream<Uint8Array>> {
+    this.assertOpen();
+    const totalVectors = this.memoryManager.vectorCount;
+    const vectorDataLen = totalVectors * this.dimensions * 4; // 4 bytes per component (presumably float32 - confirm)
+    // Encode keys up front (typically much smaller than vectors)
+    const keysBytes = encodeLexicon(this.slotToKey);
+    const keysDataLen = keysBytes.byteLength;
+    // Build the fixed-size header; all fields are little-endian uint32
+    const header = new ArrayBuffer(EXPORT_HEADER_BYTES);
+    const headerView = new DataView(header);
+    headerView.setUint32(0, EXPORT_MAGIC, true);
+    headerView.setUint32(4, EXPORT_VERSION, true);
+    headerView.setUint32(8, this.dimensions, true);
+    headerView.setUint32(12, totalVectors, true);
+    headerView.setUint32(16, vectorDataLen, true);
+    headerView.setUint32(20, keysDataLen, true);
+    const mm = this.memoryManager; // captured because pull() below is not bound to this instance
+    let phase: "header" | "vectors" | "keys" | "done" = "header"; // which section the next pull emits
+    let vectorOffset = 0; // bytes of vector data already enqueued
+    return new ReadableStream<Uint8Array>({
+      pull(controller) {
+        switch (phase) {
+          case "header":
+            controller.enqueue(new Uint8Array(header));
+            phase = vectorDataLen > 0 ? "vectors" : keysDataLen > 0 ? "keys" : "done"; // skip empty sections
+            if (phase === "done") controller.close();
+            break;
+          case "vectors": {
+            const remaining = vectorDataLen - vectorOffset;
+            const chunkSize = Math.min(remaining, STREAM_CHUNK_SIZE);
+            const chunk = new Uint8Array(chunkSize);
+            chunk.set(new Uint8Array(mm.memory.buffer, mm.dbOffset + vectorOffset, chunkSize)); // mm.memory.buffer is re-read on every pull; copy into a fresh array
+            controller.enqueue(chunk);
+            vectorOffset += chunkSize;
+            if (vectorOffset >= vectorDataLen) {
+              phase = keysDataLen > 0 ? "keys" : "done";
+              if (phase === "done") controller.close();
+            }
+            break;
+          }
+          case "keys":
+            controller.enqueue(keysBytes); // keys are emitted as a single chunk
+            phase = "done";
+            controller.close();
+            break;
+        }
+      },
+    });
+  }
+  /**
+   * Import data from a ReadableStream, replacing all existing data.
+   *
+   * The stream must use the layout produced by export():
+   * [Header 24 bytes][Vector data][Keys data], with little-endian uint32
+   * header fields. The header is validated in full (magic, version,
+   * dimensions, and vectorDataLen === vectorCount * dimensions * 4) before
+   * any existing data is touched, so a rejected header leaves the database
+   * unchanged.
+   *
+   * Vectors are streamed directly into WASM memory in chunks to avoid
+   * large heap allocations.
+   *
+   * If reading fails after the header was accepted (for example the stream
+   * ends early), the database is reset to empty instead of being left
+   * half-populated. The reader lock on `stream` is released on every exit
+   * path, including errors.
+   *
+   * @param stream - Binary stream in the export() format.
+   * @throws Error if the header is invalid or inconsistent, the dimensions
+   *   do not match this database, or the stream ends before the declared
+   *   number of bytes has been read.
+   */
+  async import(stream: ReadableStream<Uint8Array>): Promise<void> {
+    this.assertOpen();
+    const sr = new StreamReader(stream);
+    try {
+      // Read and validate header
+      const headerBytes = await sr.readExact(EXPORT_HEADER_BYTES);
+      const headerView = new DataView(headerBytes.buffer, headerBytes.byteOffset, headerBytes.byteLength);
+      const magic = headerView.getUint32(0, true);
+      if (magic !== EXPORT_MAGIC) {
+        throw new Error("Invalid import data: unrecognized format");
+      }
+      const version = headerView.getUint32(4, true);
+      if (version !== EXPORT_VERSION) {
+        throw new Error(`Unsupported import version: expected ${EXPORT_VERSION}, got ${version}`);
+      }
+      const dimensions = headerView.getUint32(8, true);
+      if (dimensions !== this.dimensions) {
+        throw new Error(`Import dimension mismatch: expected ${this.dimensions}, got ${dimensions}`);
+      }
+      const vectorCount = headerView.getUint32(12, true);
+      const vectorDataLen = headerView.getUint32(16, true);
+      const keysDataLen = headerView.getUint32(20, true);
+      // The declared byte length must match the declared vector count. Otherwise
+      // the copy below could run past the capacity reserved for vectorCount
+      // vectors, or vector bytes could be misread as key data.
+      const expectedVectorDataLen = vectorCount * this.dimensions * 4;
+      if (vectorDataLen !== expectedVectorDataLen) {
+        throw new Error(
+          `Invalid import data: vector data length ${vectorDataLen} does not match expected ${expectedVectorDataLen}`,
+        );
+      }
+      // Clear existing state (only after the header has been fully accepted)
+      this.keyToSlot.clear();
+      this.slotToKey.length = 0;
+      this.memoryManager.reset();
+      try {
+        // Stream vectors directly into WASM memory in chunks
+        if (vectorCount > 0) {
+          this.memoryManager.ensureCapacity(vectorCount);
+          let offset = 0;
+          await sr.readChunked(vectorDataLen, (chunk) => {
+            // memory.buffer is re-read per chunk rather than cached across awaits
+            new Uint8Array(
+              this.memoryManager.memory.buffer,
+              this.memoryManager.dbOffset + offset,
+              chunk.byteLength,
+            ).set(chunk);
+            offset += chunk.byteLength;
+          });
+          this.memoryManager.setVectorCount(vectorCount);
+        }
+        // Read and decode keys (typically much smaller than vectors)
+        if (keysDataLen > 0) {
+          const keysBytes = await sr.readExact(keysDataLen);
+          const keys = decodeLexicon(keysBytes);
+          for (let i = 0; i < keys.length; i++) {
+            // Later slots overwrite earlier ones for duplicate keys (last-write-wins)
+            this.keyToSlot.set(keys[i], i);
+            this.slotToKey[i] = keys[i];
+          }
+        }
+      } catch (err) {
+        // A partially applied import would leave vectors without keys (or the
+        // reverse); fall back to an empty, consistent database and rethrow.
+        this.keyToSlot.clear();
+        this.slotToKey.length = 0;
+        this.memoryManager.reset();
+        throw err;
+      }
+    } finally {
+      // Always unlock the caller's stream, including on validation or read errors
+      sr.release();
+    }
+  }
   /**
    * Normalize a vector using WASM (if available) or JS fallback.
    */
@@ -334,3 +477,92 @@ export class VectorDB {
     }
   }
 }
+/**
+ * Helper for reading exact byte counts from a ReadableStream.
+ * Handles chunk boundaries transparently: bytes pulled from the stream but
+ * not yet consumed are kept in an internal buffer and served first by the
+ * next read.
+ *
+ * The constructor acquires the stream's default reader (locking the stream);
+ * call release() to give the lock back.
+ */
+class StreamReader {
+  private reader: ReadableStreamDefaultReader<Uint8Array>;
+  private buffer = new Uint8Array(0); // bytes already pulled from the stream but not yet consumed
+  constructor(stream: ReadableStream<Uint8Array>) {
+    this.reader = stream.getReader();
+  }
+  /**
+   * Read exactly `n` bytes, accumulating from stream chunks as needed.
+   *
+   * @param n - Number of bytes to read; 0 returns an empty array without touching the stream.
+   * @returns A Uint8Array of length `n`. It may be a view onto memory shared
+   *   with the internal leftover buffer rather than a private copy.
+   * @throws Error if the stream ends before `n` bytes are available.
+   */
+  async readExact(n: number): Promise<Uint8Array> {
+    if (n === 0) return new Uint8Array(0);
+    // Fast path: the leftover buffer already holds enough data, so no stream read is needed
+    if (this.buffer.byteLength >= n) {
+      const result = this.buffer.subarray(0, n);
+      this.buffer = this.buffer.subarray(n);
+      return result;
+    }
+    // Collect chunks (starting with any leftover bytes) until we have at least n bytes
+    const chunks: Uint8Array[] = [];
+    let total = this.buffer.byteLength;
+    if (total > 0) {
+      chunks.push(this.buffer);
+      this.buffer = new Uint8Array(0);
+    }
+    while (total < n) {
+      const { done, value } = await this.reader.read();
+      if (done) throw new Error("Invalid import data: unexpected end of stream");
+      chunks.push(value);
+      total += value.byteLength;
+    }
+    // Combine into a single contiguous buffer
+    const combined = new Uint8Array(total);
+    let offset = 0;
+    for (const chunk of chunks) {
+      combined.set(chunk, offset);
+      offset += chunk.byteLength;
+    }
+    if (total > n) {
+      this.buffer = combined.subarray(n); // keep the surplus for the next read
+      return combined.subarray(0, n);
+    }
+    return combined;
+  }
+  /**
+   * Read `totalBytes` from the stream in chunks, calling `onChunk` for each.
+   * Avoids allocating a single large buffer for the full data.
+   *
+   * Chunk sizes follow whatever the stream delivers; only the final chunk is
+   * trimmed so that exactly `totalBytes` are passed on in total. `onChunk` is
+   * called synchronously and its return value is ignored. If leftover bytes
+   * are buffered, `onChunk` is first called once with the buffered portion
+   * (which is empty when `totalBytes` is 0).
+   *
+   * @param totalBytes - Total number of bytes to deliver to `onChunk`.
+   * @param onChunk - Receives each piece of data in stream order.
+   * @throws Error if the stream ends before `totalBytes` have been delivered.
+   */
+  async readChunked(totalBytes: number, onChunk: (chunk: Uint8Array) => void): Promise<void> {
+    let remaining = totalBytes;
+    // Drain buffered leftover from an earlier read first
+    if (this.buffer.byteLength > 0) {
+      const take = Math.min(this.buffer.byteLength, remaining);
+      onChunk(this.buffer.subarray(0, take));
+      remaining -= take;
+      this.buffer = this.buffer.subarray(take);
+    }
+    while (remaining > 0) {
+      const { done, value } = await this.reader.read();
+      if (done) throw new Error("Invalid import data: unexpected end of stream");
+      if (value.byteLength <= remaining) {
+        onChunk(value);
+        remaining -= value.byteLength;
+      } else {
+        onChunk(value.subarray(0, remaining)); // deliver only the bytes still owed
+        this.buffer = value.slice(remaining); // slice() copies the surplus into the leftover buffer
+        remaining = 0;
+      }
+    }
+  }
+  /** Release the stream reader lock so the stream can be read or cancelled by others. */
+  release(): void {
+    this.reader.releaseLock();
+  }
+}