npm - @gmickel/gno - Versions diffs - 1.5.1 → 1.6.0 - Mend

@gmickel/gno 1.5.1 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/package.json +3 -1
package/src/cli/commands/doctor.ts +7 -5
package/src/llm/nodeLlamaCpp/embedding.ts +147 -29
package/src/llm/nodeLlamaCpp/lifecycle.ts +113 -11

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@gmickel/gno",
-  "version": "1.5.1",
+  "version": "1.6.0",
   "description": "Local semantic search for your documents. Index Markdown, PDF, and Office files with hybrid BM25 + vector search.",
   "keywords": [
     "embeddings",
@@ -74,6 +74,8 @@
     "bench:code-embeddings:write": "bun scripts/code-embedding-benchmark.ts --write",
     "bench:general-embeddings": "bun scripts/general-embedding-benchmark.ts",
     "bench:general-embeddings:write": "bun scripts/general-embedding-benchmark.ts --write",
+    "bench:cpu-embeddings": "bun scripts/cpu-embed-autoresearch.ts",
+    "bench:cpu-embeddings:native-batch-probe": "bun scripts/native-embedding-batch-probe.ts",
     "eval:retrieval-candidates": "bun scripts/retrieval-candidate-benchmark.ts",
     "eval:retrieval-candidates:write": "bun scripts/retrieval-candidate-benchmark.ts --write",
     "eval:watch": "bun --bun evalite watch",

package/src/cli/commands/doctor.ts CHANGED

@@ -16,6 +16,7 @@ import { getIndexDbPath, getModelsCachePath } from "../../app/constants";
 import { getConfigPaths, isInitialized, loadConfig } from "../../config";
 import { getCodeChunkingStatus } from "../../ingestion/chunker";
 import { ModelCache } from "../../llm/cache";
+import { LlmAdapter } from "../../llm/nodeLlamaCpp/adapter";
 import { getActivePreset } from "../../llm/registry";
 import { loadFts5Snowball } from "../../store/sqlite/fts5-snowball";
 import {
@@ -136,11 +137,10 @@ function checkCodeChunking(): DoctorCheck {
   };
 }
-async function checkNodeLlamaCpp(): Promise<DoctorCheck> {
+async function checkNodeLlamaCpp(config: Config): Promise<DoctorCheck> {
+  const llm = new LlmAdapter(config);
   try {
-    const { getLlama } = await import("node-llama-cpp");
-    // Just check that we can get the llama instance
-    await getLlama();
+    await llm.getManager().getLlama();
     return {
       name: "node-llama-cpp",
       status: "ok",
@@ -153,6 +153,8 @@ async function checkNodeLlamaCpp(): Promise<DoctorCheck> {
       status: "error",
       message: `node-llama-cpp failed: ${message}`,
     };
+  } finally {
+    await llm.dispose();
   }
 }
@@ -330,7 +332,7 @@ export async function doctor(
   checks.push(...modelChecks);
   // node-llama-cpp check
-  checks.push(await checkNodeLlamaCpp());
+  checks.push(await checkNodeLlamaCpp(config));
   // SQLite extension checks
   const sqliteChecks = await checkSqliteExtensions();

package/src/llm/nodeLlamaCpp/embedding.ts CHANGED

@@ -4,6 +4,8 @@
  * @module src/llm/nodeLlamaCpp/embedding
  */
+import { platform, totalmem } from "node:os";
 import type { EmbeddingPort, LlmResult } from "../types";
 import type { ModelManager } from "./lifecycle";
@@ -37,6 +39,8 @@ interface TokenizingModel {
   detokenize(tokens: readonly number[]): string;
 }
+type EmbeddingInput = Parameters<LlamaEmbeddingContext["getEmbeddingFor"]>[0];
 // ─────────────────────────────────────────────────────────────────────────────
 // Constants
 // ─────────────────────────────────────────────────────────────────────────────
@@ -44,8 +48,104 @@ interface TokenizingModel {
 // Aim for a small pool so CPU-only runs can exploit parallel contexts without
 // multiplying RAM usage too aggressively. Additional contexts fall back
 // gracefully if memory is tight.
-const MAX_EMBEDDING_CONTEXTS = 4;
+const MAX_DEFAULT_EMBEDDING_CONTEXTS = 2;
+const MAX_EMBEDDING_CONTEXTS_OVERRIDE = 4;
 const TARGET_CORES_PER_EMBEDDING_CONTEXT = 4;
+const CONSTRAINED_WINDOWS_THRESHOLD_BYTES = 16 * 1024 * 1024 * 1024;
+const MID_MEMORY_WINDOWS_THRESHOLD_BYTES = 24 * 1024 * 1024 * 1024;
+const LOW_MEMORY_WINDOWS_CONTEXTS = 1;
+const MID_MEMORY_WINDOWS_CONTEXTS = 2;
+const DEFAULT_EMBEDDING_CONTEXT_SIZE = 2_048;
+function embeddingVectorToArray(vector: readonly number[]): number[] {
+  return Array.isArray(vector) ? (vector as number[]) : Array.from(vector);
+}
+function resolveEmbeddingContextPoolOverride(
+  env: NodeJS.ProcessEnv = process.env
+): number | undefined {
+  const raw = env.GNO_EMBED_CONTEXTS;
+  if (!raw) {
+    return undefined;
+  }
+  const parsed = Number.parseInt(raw, 10);
+  if (!(Number.isFinite(parsed) && parsed > 0)) {
+    return undefined;
+  }
+  return Math.max(1, Math.min(MAX_EMBEDDING_CONTEXTS_OVERRIDE, parsed));
+}
+function resolveThreadsPerContextOverride(
+  env: NodeJS.ProcessEnv = process.env
+): number | undefined {
+  const raw = env.GNO_EMBED_THREADS;
+  if (!raw) {
+    return undefined;
+  }
+  const parsed = Number.parseInt(raw, 10);
+  if (!(Number.isFinite(parsed) && parsed > 0)) {
+    return undefined;
+  }
+  return Math.max(1, parsed);
+}
+function resolveEmbeddingContextSizeOverride(
+  env: NodeJS.ProcessEnv = process.env
+): number | undefined {
+  const raw = env.GNO_EMBED_CONTEXT_SIZE;
+  if (!raw) {
+    return undefined;
+  }
+  const parsed = Number.parseInt(raw, 10);
+  if (!(Number.isFinite(parsed) && parsed > 0)) {
+    return undefined;
+  }
+  return Math.max(128, parsed);
+}
+export function resolveEmbeddingContextPoolSize(options: {
+  gpu: Llama["gpu"];
+  cpuMathCores: number;
+  env?: NodeJS.ProcessEnv;
+  platformName?: NodeJS.Platform;
+  totalMemoryBytes?: number;
+}): number {
+  if (options.gpu !== false) {
+    return 1;
+  }
+  const override = resolveEmbeddingContextPoolOverride(options.env);
+  if (override !== undefined) {
+    return override;
+  }
+  const platformName = options.platformName ?? platform();
+  const totalMemoryBytes = options.totalMemoryBytes ?? totalmem();
+  if (
+    platformName === "win32" &&
+    totalMemoryBytes < CONSTRAINED_WINDOWS_THRESHOLD_BYTES
+  ) {
+    return LOW_MEMORY_WINDOWS_CONTEXTS;
+  }
+  const cpuMathCores = Math.max(1, options.cpuMathCores);
+  const adaptivePoolSize = Math.max(
+    1,
+    Math.min(
+      MAX_DEFAULT_EMBEDDING_CONTEXTS,
+      Math.ceil(cpuMathCores / TARGET_CORES_PER_EMBEDDING_CONTEXT)
+    )
+  );
+  if (
+    platformName === "win32" &&
+    totalMemoryBytes < MID_MEMORY_WINDOWS_THRESHOLD_BYTES
+  ) {
+    return Math.min(MID_MEMORY_WINDOWS_CONTEXTS, adaptivePoolSize);
+  }
+  return adaptivePoolSize;
+}
 // ─────────────────────────────────────────────────────────────────────────────
 // Implementation
@@ -58,6 +158,7 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
   private lifecycleVersion = 0;
   private dims: number | null = null;
   private llamaModel: TokenizingModel | null = null;
+  private embeddingContextSize = DEFAULT_EMBEDDING_CONTEXT_SIZE;
   private warnedSingleTruncation = false;
   private warnedBatchTruncation = false;
   private readonly manager: ModelManager;
@@ -90,9 +191,9 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
         return { ok: false, error: prepared.error };
       }
       const embedding = await this.runOnWorker((worker) =>
-        worker.context.getEmbeddingFor(prepared.value.text)
+        worker.context.getEmbeddingFor(prepared.value.input)
       );
-      const vector = Array.from(embedding.vector) as number[];
+      const vector = embeddingVectorToArray(embedding.vector);
       // Cache dimensions on first call
       if (this.dims === null) {
@@ -116,13 +217,13 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
     }
     try {
-      const preparedTexts: string[] = [];
+      const preparedInputs: EmbeddingInput[] = [];
       for (const text of texts) {
         const prepared = this.truncateForEmbedding(text, "batch");
         if (!prepared.ok) {
           return { ok: false, error: prepared.error };
         }
-        preparedTexts.push(prepared.value.text);
+        preparedInputs.push(prepared.value.input);
       }
       const allResults = Array.from(
@@ -136,16 +237,19 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
           while (true) {
             const index = nextIndex;
             nextIndex += 1;
-            if (index >= preparedTexts.length) {
+            if (index >= preparedInputs.length) {
               return;
             }
+            const input = preparedInputs[index];
+            if (input === undefined) {
+              return;
+            }
             const embedding = await this.runOnSpecificWorker(
               worker,
-              (current) =>
-                current.context.getEmbeddingFor(preparedTexts[index] as string)
+              (current) => current.context.getEmbeddingFor(input)
             );
-            allResults[index] = Array.from(embedding.vector) as number[];
+            allResults[index] = embeddingVectorToArray(embedding.vector);
           }
         })
       );
@@ -250,18 +354,10 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
   }
   private resolveTargetPoolSize(llama: Llama): number {
-    if (llama.gpu !== false) {
-      return 1;
-    }
-    const cpuMathCores = Math.max(1, llama.cpuMathCores);
-    return Math.max(
-      1,
-      Math.min(
-        MAX_EMBEDDING_CONTEXTS,
-        Math.ceil(cpuMathCores / TARGET_CORES_PER_EMBEDDING_CONTEXT)
-      )
-    );
+    return resolveEmbeddingContextPoolSize({
+      gpu: llama.gpu,
+      cpuMathCores: llama.cpuMathCores,
+    });
   }
   private resolveThreadsPerContext(llama: Llama, poolSize: number): number {
@@ -269,6 +365,11 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
       return 0;
     }
+    const override = resolveThreadsPerContextOverride();
+    if (override !== undefined) {
+      return override;
+    }
     return Math.max(1, Math.floor(Math.max(1, llama.cpuMathCores) / poolSize));
   }
@@ -288,13 +389,20 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
       this.llamaModel = llamaModel as TokenizingModel;
       const llama = await this.manager.getLlama();
       const lifecycleVersion = this.lifecycleVersion;
+      this.embeddingContextSize =
+        resolveEmbeddingContextSizeOverride() ?? DEFAULT_EMBEDDING_CONTEXT_SIZE;
       const targetPoolSize = this.resolveTargetPoolSize(llama);
       const threadsPerContext = this.resolveThreadsPerContext(
         llama,
         targetPoolSize
       );
       const contextOptions =
-        llama.gpu === false ? { threads: threadsPerContext } : undefined;
+        llama.gpu === false
+          ? {
+              contextSize: this.embeddingContextSize,
+              threads: threadsPerContext,
+            }
+          : { contextSize: this.embeddingContextSize };
       const contexts: LlamaEmbeddingContext[] = [];
       for (let i = 0; i < targetPoolSize; i += 1) {
@@ -348,26 +456,33 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
   private truncateForEmbedding(
     text: string,
     mode: "single" | "batch"
-  ): LlmResult<{ text: string }> {
+  ): LlmResult<{ input: EmbeddingInput }> {
     const model = this.llamaModel;
-    const rawLimit =
+    const modelLimit =
       typeof model?.trainContextSize === "number" &&
       Number.isFinite(model.trainContextSize) &&
       model.trainContextSize > 0
         ? Math.floor(model.trainContextSize)
         : undefined;
-    if (!model || rawLimit === undefined) {
-      return { ok: true, value: { text } };
+    if (!model) {
+      return { ok: true, value: { input: text } };
     }
+    const rawLimit =
+      modelLimit === undefined
+        ? this.embeddingContextSize
+        : Math.min(modelLimit, this.embeddingContextSize);
     const limit = Math.max(1, rawLimit - 4);
     try {
       const tokens = model.tokenize(text);
       if (tokens.length <= limit) {
-        return { ok: true, value: { text } };
+        return {
+          ok: true,
+          value: { input: tokens as EmbeddingInput },
+        };
       }
-      const truncatedText = model.detokenize(tokens.slice(0, limit));
+      const truncatedTokens = tokens.slice(0, limit);
       const shouldWarn =
         mode === "single"
           ? !this.warnedSingleTruncation
@@ -382,7 +497,10 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
           `[llama] Truncated embedding input from ${tokens.length} to ${limit} tokens`
         );
       }
-      return { ok: true, value: { text: truncatedText } };
+      return {
+        ok: true,
+        value: { input: truncatedTokens as EmbeddingInput },
+      };
     } catch (error) {
       return { ok: false, error: inferenceFailedError(this.modelUri, error) };
     }

package/src/llm/nodeLlamaCpp/lifecycle.ts CHANGED

@@ -5,6 +5,10 @@
  * @module src/llm/nodeLlamaCpp/lifecycle
  */
+import type { LlamaOptions } from "node-llama-cpp";
+import { platform } from "node:os";
 import type { ModelConfig } from "../../config/types";
 import type { LlmResult, LoadedModel, ModelType } from "../types";
@@ -17,6 +21,12 @@ import { loadFailedError, outOfMemoryError, timeoutError } from "../errors";
 type Llama = Awaited<ReturnType<typeof import("node-llama-cpp").getLlama>>;
 type LlamaModel = Awaited<ReturnType<Llama["loadModel"]>>;
 export type LlamaGpuMode = "auto" | "metal" | "vulkan" | "cuda" | false;
+export type LlamaBuildMode = "never" | "autoAttempt";
+type LlamaInitOptions = LlamaOptions & {
+  build: LlamaBuildMode;
+  gpu: LlamaGpuMode;
+};
 interface CachedModel {
   uri: string;
@@ -26,7 +36,11 @@ interface CachedModel {
 }
 let invalidGpuModeWarned = false;
+let invalidBuildModeWarned = false;
 let gpuFallbackWarned = false;
+let backendTimeoutWarned = false;
+const DEFAULT_BACKEND_INIT_TIMEOUT_MS = 30_000;
 export function resolveLlamaGpuMode(
   env: NodeJS.ProcessEnv = process.env
@@ -59,6 +73,56 @@ export function resolveLlamaGpuMode(
   return "auto";
 }
+export function resolveLlamaBuildMode(
+  env: NodeJS.ProcessEnv = process.env
+): LlamaBuildMode {
+  const raw = (env.GNO_LLAMA_BUILD ?? "never").trim().toLowerCase();
+  if (
+    !raw ||
+    raw === "never" ||
+    raw === "prebuilt" ||
+    raw === "prebuilt-only"
+  ) {
+    return "never";
+  }
+  if (
+    raw === "autoattempt" ||
+    raw === "auto-attempt" ||
+    raw === "source" ||
+    raw === "build"
+  ) {
+    return "autoAttempt";
+  }
+  if (!invalidBuildModeWarned) {
+    invalidBuildModeWarned = true;
+    console.warn(`[llama] Invalid GNO_LLAMA_BUILD value "${raw}", using never`);
+  }
+  return "never";
+}
+export function resolveLlamaBackendInitTimeoutMs(
+  env: NodeJS.ProcessEnv = process.env
+): number {
+  const raw = env.GNO_LLAMA_INIT_TIMEOUT_MS;
+  if (!raw) {
+    return DEFAULT_BACKEND_INIT_TIMEOUT_MS;
+  }
+  const parsed = Number.parseInt(raw, 10);
+  return Number.isFinite(parsed) && parsed > 0
+    ? parsed
+    : DEFAULT_BACKEND_INIT_TIMEOUT_MS;
+}
+export function shouldRetryLlamaWithCpu(
+  gpu: LlamaGpuMode,
+  platformName = platform()
+): boolean {
+  if (gpu === false) {
+    return false;
+  }
+  return gpu !== "auto" || platformName === "win32";
+}
 // ─────────────────────────────────────────────────────────────────────────────
 // ModelManager
 // ─────────────────────────────────────────────────────────────────────────────
@@ -84,15 +148,21 @@ export class ModelManager {
     if (!this.llama) {
       const { getLlama, LlamaLogLevel } = await import("node-llama-cpp");
       const gpu = resolveLlamaGpuMode();
+      const build = resolveLlamaBuildMode();
+      const timeoutMs = resolveLlamaBackendInitTimeoutMs();
       // Suppress model loading warnings (vocab tokens, pooling type)
       try {
-        this.llama = await getLlama({
-          build: "autoAttempt",
-          gpu,
-          logLevel: LlamaLogLevel.error,
-        });
+        this.llama = await this.getLlamaWithTimeout(
+          getLlama,
+          {
+            build,
+            gpu,
+            logLevel: LlamaLogLevel.error,
+          },
+          timeoutMs
+        );
       } catch (error) {
-        if (gpu === "auto" || gpu === false) {
+        if (!shouldRetryLlamaWithCpu(gpu)) {
           throw error;
         }
         if (!gpuFallbackWarned) {
@@ -103,16 +173,48 @@ export class ModelManager {
             }`
           );
         }
-        this.llama = await getLlama({
-          build: "autoAttempt",
-          gpu: false,
-          logLevel: LlamaLogLevel.error,
-        });
+        this.llama = await this.getLlamaWithTimeout(
+          getLlama,
+          {
+            build,
+            gpu: false,
+            logLevel: LlamaLogLevel.error,
+          },
+          timeoutMs
+        );
       }
     }
     return this.llama;
   }
+  private async getLlamaWithTimeout(
+    getLlama: (options: LlamaInitOptions) => Promise<Llama>,
+    options: LlamaInitOptions,
+    timeoutMs: number
+  ): Promise<Llama> {
+    let timeoutId: ReturnType<typeof setTimeout> | null = null;
+    try {
+      return await Promise.race([
+        getLlama(options),
+        new Promise<never>((_, reject) => {
+          timeoutId = setTimeout(() => {
+            if (!backendTimeoutWarned) {
+              backendTimeoutWarned = true;
+              console.warn(
+                `[llama] Backend initialization timed out after ${timeoutMs}ms`
+              );
+            }
+            reject(new Error(`Backend init timeout after ${timeoutMs}ms`));
+          }, timeoutMs);
+        }),
+      ]);
+    } finally {
+      if (timeoutId) {
+        clearTimeout(timeoutId);
+      }
+    }
+  }
   /**
    * Load a model by path.
    * Uses caching, inflight deduplication, and TTL-based disposal.