npm - @gmickel/gno - Versions diffs - 0.27.3 → 0.28.0 - Mend

@gmickel/gno 0.27.3 → 0.28.0

This diff shows the changes between two publicly released versions of this package, exactly as they appear in the public registry they were published to. It is provided for informational purposes only.

Files changed (2)

package/package.json +1 -1
package/src/llm/nodeLlamaCpp/embedding.ts +185 -70

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@gmickel/gno",
-  "version": "0.27.3",
+  "version": "0.28.0",
   "description": "Local semantic search for your documents. Index Markdown, PDF, and Office files with hybrid BM25 + vector search.",
   "keywords": [
     "embeddings",

package/src/llm/nodeLlamaCpp/embedding.ts CHANGED Viewed

@@ -24,23 +24,32 @@ type LlamaEmbeddingContext = Awaited<
   ReturnType<LlamaModel["createEmbeddingContext"]>
 >;
+type Llama = Awaited<ReturnType<typeof import("node-llama-cpp").getLlama>>;
+interface EmbeddingWorker {
+  context: LlamaEmbeddingContext;
+  pending: number;
+}
 // ─────────────────────────────────────────────────────────────────────────────
 // Constants
 // ─────────────────────────────────────────────────────────────────────────────
-// Max concurrent embedding operations per batch to avoid overwhelming the context.
-// node-llama-cpp contexts may not handle high concurrency well; this provides
-// a safe default while still allowing parallelism within chunks.
-const MAX_CONCURRENT_EMBEDDINGS = 16;
+// Aim for a small pool so CPU-only runs can exploit parallel contexts without
+// multiplying RAM usage too aggressively. Additional contexts fall back
+// gracefully if memory is tight.
+const MAX_EMBEDDING_CONTEXTS = 4;
+const TARGET_CORES_PER_EMBEDDING_CONTEXT = 4;
 // ─────────────────────────────────────────────────────────────────────────────
 // Implementation
 // ─────────────────────────────────────────────────────────────────────────────
 export class NodeLlamaCppEmbedding implements EmbeddingPort {
-  private context: LlamaEmbeddingContext | null = null;
-  private contextPromise: Promise<LlmResult<LlamaEmbeddingContext>> | null =
+  private workers: EmbeddingWorker[] = [];
+  private contextsPromise: Promise<LlmResult<LlamaEmbeddingContext[]>> | null =
     null;
+  private lifecycleVersion = 0;
   private dims: number | null = null;
   private readonly manager: ModelManager;
   readonly modelUri: string;
@@ -53,21 +62,23 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
   }
   async init(): Promise<LlmResult<void>> {
-    const ctx = await this.getContext();
-    if (!ctx.ok) {
-      return ctx;
+    const contexts = await this.getContexts();
+    if (!contexts.ok) {
+      return contexts;
     }
     return { ok: true, value: undefined };
   }
   async embed(text: string): Promise<LlmResult<number[]>> {
-    const ctx = await this.getContext();
-    if (!ctx.ok) {
-      return ctx;
+    const contexts = await this.getContexts();
+    if (!contexts.ok) {
+      return contexts;
     }
     try {
-      const embedding = await ctx.value.getEmbeddingFor(text);
+      const embedding = await this.runOnWorker((worker) =>
+        worker.context.getEmbeddingFor(text)
+      );
       const vector = Array.from(embedding.vector) as number[];
       // Cache dimensions on first call
@@ -82,9 +93,9 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
   }
   async embedBatch(texts: string[]): Promise<LlmResult<number[][]>> {
-    const ctx = await this.getContext();
-    if (!ctx.ok) {
-      return ctx;
+    const contexts = await this.getContexts();
+    if (!contexts.ok) {
+      return contexts;
     }
     if (texts.length === 0) {
@@ -92,39 +103,40 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
     }
     try {
-      // Process in chunks to avoid overwhelming the embedding context.
-      // node-llama-cpp v3.x only exposes getEmbeddingFor (single text), not a native
-      // batch method. We use allSettled within chunks to ensure all in-flight ops
-      // complete before returning (prevents orphaned operations on early failure).
-      const allResults: number[][] = [];
-      for (let i = 0; i < texts.length; i += MAX_CONCURRENT_EMBEDDINGS) {
-        const chunk = texts.slice(i, i + MAX_CONCURRENT_EMBEDDINGS);
-        const settled = await Promise.allSettled(
-          chunk.map((text) => ctx.value.getEmbeddingFor(text))
-        );
-        // Check for any failures in this chunk
-        const firstRejection = settled.find(
-          (r): r is PromiseRejectedResult => r.status === "rejected"
-        );
-        if (firstRejection) {
-          return {
-            ok: false,
-            error: inferenceFailedError(this.modelUri, firstRejection.reason),
-          };
-        }
-        // Extract results from this chunk (cast safe after rejection check)
-        const chunkResults = (
-          settled as Array<
-            PromiseFulfilledResult<
-              Awaited<ReturnType<typeof ctx.value.getEmbeddingFor>>
-            >
-          >
-        ).map((r) => Array.from(r.value.vector) as number[]);
-        allResults.push(...chunkResults);
+      const allResults = Array.from(
+        { length: texts.length },
+        () => [] as number[]
+      );
+      let nextIndex = 0;
+      const settled = await Promise.allSettled(
+        this.workers.map(async (worker) => {
+          while (true) {
+            const index = nextIndex;
+            nextIndex += 1;
+            if (index >= texts.length) {
+              return;
+            }
+            const embedding = await this.runOnSpecificWorker(
+              worker,
+              (current) =>
+                current.context.getEmbeddingFor(texts[index] as string)
+            );
+            allResults[index] = Array.from(embedding.vector) as number[];
+          }
+        })
+      );
+      const firstRejection = settled.find(
+        (result): result is PromiseRejectedResult =>
+          result.status === "rejected"
+      );
+      if (firstRejection) {
+        return {
+          ok: false,
+          error: inferenceFailedError(this.modelUri, firstRejection.reason),
+        };
       }
       // Cache dimensions from first result
@@ -147,15 +159,17 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
   }
   async dispose(): Promise<void> {
-    // Clear promise first to prevent reuse of disposed context
-    this.contextPromise = null;
-    if (this.context) {
+    this.lifecycleVersion += 1;
+    this.contextsPromise = null;
+    const workers = this.workers;
+    this.workers = [];
+    for (const worker of workers) {
       try {
-        await this.context.dispose();
+        await worker.context.dispose();
       } catch {
         // Ignore disposal errors
       }
-      this.context = null;
     }
   }
@@ -163,46 +177,147 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
   // Private
   // ───────────────────────────────────────────────────────────────────────────
-  private getContext(): Promise<LlmResult<LlamaEmbeddingContext>> {
-    // Return cached context
-    if (this.context) {
-      return Promise.resolve({ ok: true, value: this.context });
+  private async runOnWorker<T>(
+    task: (worker: EmbeddingWorker) => Promise<T>
+  ): Promise<T> {
+    const worker = this.getLeastBusyWorker();
+    return this.runOnSpecificWorker(worker, task);
+  }
+  private async runOnSpecificWorker<T>(
+    worker: EmbeddingWorker,
+    task: (worker: EmbeddingWorker) => Promise<T>
+  ): Promise<T> {
+    worker.pending += 1;
+    try {
+      return await task(worker);
+    } finally {
+      worker.pending -= 1;
+    }
+  }
+  private getLeastBusyWorker(): EmbeddingWorker {
+    const firstWorker = this.workers[0];
+    if (!firstWorker) {
+      throw new Error("Embedding context not initialized");
+    }
+    let bestWorker = firstWorker;
+    for (const worker of this.workers) {
+      if (worker.pending < bestWorker.pending) {
+        bestWorker = worker;
+      }
+    }
+    return bestWorker;
+  }
+  private getContexts(): Promise<LlmResult<LlamaEmbeddingContext[]>> {
+    if (this.workers.length > 0) {
+      return Promise.resolve({
+        ok: true,
+        value: this.workers.map((worker) => worker.context),
+      });
     }
-    // Reuse in-flight promise to prevent concurrent context creation
-    if (this.contextPromise) {
-      return this.contextPromise;
+    if (this.contextsPromise) {
+      return this.contextsPromise;
     }
-    this.contextPromise = this.createContext();
-    return this.contextPromise;
+    this.contextsPromise = this.createContexts();
+    return this.contextsPromise;
   }
-  private async createContext(): Promise<LlmResult<LlamaEmbeddingContext>> {
+  private resolveTargetPoolSize(llama: Llama): number {
+    if (llama.gpu !== false) {
+      return 1;
+    }
+    const cpuMathCores = Math.max(1, llama.cpuMathCores);
+    return Math.max(
+      1,
+      Math.min(
+        MAX_EMBEDDING_CONTEXTS,
+        Math.ceil(cpuMathCores / TARGET_CORES_PER_EMBEDDING_CONTEXT)
+      )
+    );
+  }
+  private resolveThreadsPerContext(llama: Llama, poolSize: number): number {
+    if (llama.gpu !== false) {
+      return 0;
+    }
+    return Math.max(1, Math.floor(Math.max(1, llama.cpuMathCores) / poolSize));
+  }
+  private async createContexts(): Promise<LlmResult<LlamaEmbeddingContext[]>> {
     const model = await this.manager.loadModel(
       this.modelPath,
       this.modelUri,
       "embed"
     );
     if (!model.ok) {
-      this.contextPromise = null; // Allow retry
+      this.contextsPromise = null;
       return model;
     }
     try {
-      // Cast to access createEmbeddingContext
       const llamaModel = model.value.model as LlamaModel;
-      this.context = await llamaModel.createEmbeddingContext();
+      const llama = await this.manager.getLlama();
+      const lifecycleVersion = this.lifecycleVersion;
+      const targetPoolSize = this.resolveTargetPoolSize(llama);
+      const threadsPerContext = this.resolveThreadsPerContext(
+        llama,
+        targetPoolSize
+      );
+      const contextOptions =
+        llama.gpu === false ? { threads: threadsPerContext } : undefined;
+      const contexts: LlamaEmbeddingContext[] = [];
+      for (let i = 0; i < targetPoolSize; i += 1) {
+        try {
+          const context =
+            await llamaModel.createEmbeddingContext(contextOptions);
+          contexts.push(context);
+        } catch (error) {
+          if (contexts.length === 0) {
+            this.contextsPromise = null;
+            return {
+              ok: false,
+              error: inferenceFailedError(this.modelUri, error),
+            };
+          }
+          break;
+        }
+      }
+      if (lifecycleVersion !== this.lifecycleVersion) {
+        for (const context of contexts) {
+          try {
+            await context.dispose();
+          } catch {
+            // Ignore disposal errors
+          }
+        }
+        return {
+          ok: false,
+          error: inferenceFailedError(
+            this.modelUri,
+            new Error("Embedding context disposed during initialization")
+          ),
+        };
+      }
+      this.workers = contexts.map((context) => ({ context, pending: 0 }));
-      // Cache dimensions from model (available without running embed)
       const size = llamaModel.embeddingVectorSize;
       if (this.dims === null && typeof size === "number" && size > 0) {
         this.dims = size;
       }
-      return { ok: true, value: this.context };
+      return { ok: true, value: contexts };
     } catch (e) {
-      this.contextPromise = null; // Allow retry
+      this.contextsPromise = null;
       return { ok: false, error: inferenceFailedError(this.modelUri, e) };
     }
   }