npm - @gmickel/gno - Versions diffs - 1.5.1 → 1.5.2 - Mend

@gmickel/gno 1.5.1 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

package/package.json +1 -1
package/src/cli/commands/doctor.ts +7 -5
package/src/llm/nodeLlamaCpp/embedding.ts +71 -15
package/src/llm/nodeLlamaCpp/lifecycle.ts +113 -11

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@gmickel/gno",
-  "version": "1.5.1",
+  "version": "1.5.2",
   "description": "Local semantic search for your documents. Index Markdown, PDF, and Office files with hybrid BM25 + vector search.",
   "keywords": [
     "embeddings",

package/src/cli/commands/doctor.ts CHANGED

@@ -16,6 +16,7 @@ import { getIndexDbPath, getModelsCachePath } from "../../app/constants";
 import { getConfigPaths, isInitialized, loadConfig } from "../../config";
 import { getCodeChunkingStatus } from "../../ingestion/chunker";
 import { ModelCache } from "../../llm/cache";
+import { LlmAdapter } from "../../llm/nodeLlamaCpp/adapter";
 import { getActivePreset } from "../../llm/registry";
 import { loadFts5Snowball } from "../../store/sqlite/fts5-snowball";
 import {
@@ -136,11 +137,10 @@ function checkCodeChunking(): DoctorCheck {
   };
 }
-async function checkNodeLlamaCpp(): Promise<DoctorCheck> {
+async function checkNodeLlamaCpp(config: Config): Promise<DoctorCheck> {
+  const llm = new LlmAdapter(config);
   try {
-    const { getLlama } = await import("node-llama-cpp");
-    // Just check that we can get the llama instance
-    await getLlama();
+    await llm.getManager().getLlama();
     return {
       name: "node-llama-cpp",
       status: "ok",
@@ -153,6 +153,8 @@ async function checkNodeLlamaCpp(): Promise<DoctorCheck> {
       status: "error",
       message: `node-llama-cpp failed: ${message}`,
     };
+  } finally {
+    await llm.dispose();
   }
 }
@@ -330,7 +332,7 @@ export async function doctor(
   checks.push(...modelChecks);
   // node-llama-cpp check
-  checks.push(await checkNodeLlamaCpp());
+  checks.push(await checkNodeLlamaCpp(config));
   // SQLite extension checks
   const sqliteChecks = await checkSqliteExtensions();

package/src/llm/nodeLlamaCpp/embedding.ts CHANGED

@@ -4,6 +4,8 @@
  * @module src/llm/nodeLlamaCpp/embedding
  */
+import { platform, totalmem } from "node:os";
 import type { EmbeddingPort, LlmResult } from "../types";
 import type { ModelManager } from "./lifecycle";
@@ -46,6 +48,58 @@ interface TokenizingModel {
 // gracefully if memory is tight.
 const MAX_EMBEDDING_CONTEXTS = 4;
 const TARGET_CORES_PER_EMBEDDING_CONTEXT = 4;
+const LOW_MEMORY_WINDOWS_THRESHOLD_BYTES = 24 * 1024 * 1024 * 1024;
+const LOW_MEMORY_WINDOWS_CONTEXTS = 1;
+const DEFAULT_EMBEDDING_CONTEXT_SIZE = 2_048;
+function resolveEmbeddingContextPoolOverride(
+  env: NodeJS.ProcessEnv = process.env
+): number | undefined {
+  const raw = env.GNO_EMBED_CONTEXTS;
+  if (!raw) {
+    return undefined;
+  }
+  const parsed = Number.parseInt(raw, 10);
+  if (!(Number.isFinite(parsed) && parsed > 0)) {
+    return undefined;
+  }
+  return Math.max(1, Math.min(MAX_EMBEDDING_CONTEXTS, parsed));
+}
+export function resolveEmbeddingContextPoolSize(options: {
+  gpu: Llama["gpu"];
+  cpuMathCores: number;
+  env?: NodeJS.ProcessEnv;
+  platformName?: NodeJS.Platform;
+  totalMemoryBytes?: number;
+}): number {
+  if (options.gpu !== false) {
+    return 1;
+  }
+  const override = resolveEmbeddingContextPoolOverride(options.env);
+  if (override !== undefined) {
+    return override;
+  }
+  const platformName = options.platformName ?? platform();
+  const totalMemoryBytes = options.totalMemoryBytes ?? totalmem();
+  if (
+    platformName === "win32" &&
+    totalMemoryBytes <= LOW_MEMORY_WINDOWS_THRESHOLD_BYTES
+  ) {
+    return LOW_MEMORY_WINDOWS_CONTEXTS;
+  }
+  const cpuMathCores = Math.max(1, options.cpuMathCores);
+  return Math.max(
+    1,
+    Math.min(
+      MAX_EMBEDDING_CONTEXTS,
+      Math.ceil(cpuMathCores / TARGET_CORES_PER_EMBEDDING_CONTEXT)
+    )
+  );
+}
 // ─────────────────────────────────────────────────────────────────────────────
 // Implementation
@@ -58,6 +112,7 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
   private lifecycleVersion = 0;
   private dims: number | null = null;
   private llamaModel: TokenizingModel | null = null;
+  private embeddingContextSize = DEFAULT_EMBEDDING_CONTEXT_SIZE;
   private warnedSingleTruncation = false;
   private warnedBatchTruncation = false;
   private readonly manager: ModelManager;
@@ -250,18 +305,10 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
   }
   private resolveTargetPoolSize(llama: Llama): number {
-    if (llama.gpu !== false) {
-      return 1;
-    }
-    const cpuMathCores = Math.max(1, llama.cpuMathCores);
-    return Math.max(
-      1,
-      Math.min(
-        MAX_EMBEDDING_CONTEXTS,
-        Math.ceil(cpuMathCores / TARGET_CORES_PER_EMBEDDING_CONTEXT)
-      )
-    );
+    return resolveEmbeddingContextPoolSize({
+      gpu: llama.gpu,
+      cpuMathCores: llama.cpuMathCores,
+    });
   }
   private resolveThreadsPerContext(llama: Llama, poolSize: number): number {
@@ -294,7 +341,12 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
         targetPoolSize
       );
       const contextOptions =
-        llama.gpu === false ? { threads: threadsPerContext } : undefined;
+        llama.gpu === false
+          ? {
+              contextSize: this.embeddingContextSize,
+              threads: threadsPerContext,
+            }
+          : { contextSize: this.embeddingContextSize };
       const contexts: LlamaEmbeddingContext[] = [];
       for (let i = 0; i < targetPoolSize; i += 1) {
@@ -350,16 +402,20 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
     mode: "single" | "batch"
   ): LlmResult<{ text: string }> {
     const model = this.llamaModel;
-    const rawLimit =
+    const modelLimit =
       typeof model?.trainContextSize === "number" &&
       Number.isFinite(model.trainContextSize) &&
       model.trainContextSize > 0
         ? Math.floor(model.trainContextSize)
         : undefined;
-    if (!model || rawLimit === undefined) {
+    if (!model) {
       return { ok: true, value: { text } };
     }
+    const rawLimit =
+      modelLimit === undefined
+        ? this.embeddingContextSize
+        : Math.min(modelLimit, this.embeddingContextSize);
     const limit = Math.max(1, rawLimit - 4);
     try {
       const tokens = model.tokenize(text);

package/src/llm/nodeLlamaCpp/lifecycle.ts CHANGED

@@ -5,6 +5,10 @@
  * @module src/llm/nodeLlamaCpp/lifecycle
  */
+import type { LlamaOptions } from "node-llama-cpp";
+import { platform } from "node:os";
 import type { ModelConfig } from "../../config/types";
 import type { LlmResult, LoadedModel, ModelType } from "../types";
@@ -17,6 +21,12 @@ import { loadFailedError, outOfMemoryError, timeoutError } from "../errors";
 type Llama = Awaited<ReturnType<typeof import("node-llama-cpp").getLlama>>;
 type LlamaModel = Awaited<ReturnType<Llama["loadModel"]>>;
 export type LlamaGpuMode = "auto" | "metal" | "vulkan" | "cuda" | false;
+export type LlamaBuildMode = "never" | "autoAttempt";
+type LlamaInitOptions = LlamaOptions & {
+  build: LlamaBuildMode;
+  gpu: LlamaGpuMode;
+};
 interface CachedModel {
   uri: string;
@@ -26,7 +36,11 @@ interface CachedModel {
 }
 let invalidGpuModeWarned = false;
+let invalidBuildModeWarned = false;
 let gpuFallbackWarned = false;
+let backendTimeoutWarned = false;
+const DEFAULT_BACKEND_INIT_TIMEOUT_MS = 30_000;
 export function resolveLlamaGpuMode(
   env: NodeJS.ProcessEnv = process.env
@@ -59,6 +73,56 @@ export function resolveLlamaGpuMode(
   return "auto";
 }
+export function resolveLlamaBuildMode(
+  env: NodeJS.ProcessEnv = process.env
+): LlamaBuildMode {
+  const raw = (env.GNO_LLAMA_BUILD ?? "never").trim().toLowerCase();
+  if (
+    !raw ||
+    raw === "never" ||
+    raw === "prebuilt" ||
+    raw === "prebuilt-only"
+  ) {
+    return "never";
+  }
+  if (
+    raw === "autoattempt" ||
+    raw === "auto-attempt" ||
+    raw === "source" ||
+    raw === "build"
+  ) {
+    return "autoAttempt";
+  }
+  if (!invalidBuildModeWarned) {
+    invalidBuildModeWarned = true;
+    console.warn(`[llama] Invalid GNO_LLAMA_BUILD value "${raw}", using never`);
+  }
+  return "never";
+}
+export function resolveLlamaBackendInitTimeoutMs(
+  env: NodeJS.ProcessEnv = process.env
+): number {
+  const raw = env.GNO_LLAMA_INIT_TIMEOUT_MS;
+  if (!raw) {
+    return DEFAULT_BACKEND_INIT_TIMEOUT_MS;
+  }
+  const parsed = Number.parseInt(raw, 10);
+  return Number.isFinite(parsed) && parsed > 0
+    ? parsed
+    : DEFAULT_BACKEND_INIT_TIMEOUT_MS;
+}
+export function shouldRetryLlamaWithCpu(
+  gpu: LlamaGpuMode,
+  platformName = platform()
+): boolean {
+  if (gpu === false) {
+    return false;
+  }
+  return gpu !== "auto" || platformName === "win32";
+}
 // ─────────────────────────────────────────────────────────────────────────────
 // ModelManager
 // ─────────────────────────────────────────────────────────────────────────────
@@ -84,15 +148,21 @@ export class ModelManager {
     if (!this.llama) {
       const { getLlama, LlamaLogLevel } = await import("node-llama-cpp");
       const gpu = resolveLlamaGpuMode();
+      const build = resolveLlamaBuildMode();
+      const timeoutMs = resolveLlamaBackendInitTimeoutMs();
       // Suppress model loading warnings (vocab tokens, pooling type)
       try {
-        this.llama = await getLlama({
-          build: "autoAttempt",
-          gpu,
-          logLevel: LlamaLogLevel.error,
-        });
+        this.llama = await this.getLlamaWithTimeout(
+          getLlama,
+          {
+            build,
+            gpu,
+            logLevel: LlamaLogLevel.error,
+          },
+          timeoutMs
+        );
       } catch (error) {
-        if (gpu === "auto" || gpu === false) {
+        if (!shouldRetryLlamaWithCpu(gpu)) {
           throw error;
         }
         if (!gpuFallbackWarned) {
@@ -103,16 +173,48 @@ export class ModelManager {
             }`
           );
         }
-        this.llama = await getLlama({
-          build: "autoAttempt",
-          gpu: false,
-          logLevel: LlamaLogLevel.error,
-        });
+        this.llama = await this.getLlamaWithTimeout(
+          getLlama,
+          {
+            build,
+            gpu: false,
+            logLevel: LlamaLogLevel.error,
+          },
+          timeoutMs
+        );
       }
     }
     return this.llama;
   }
+  private async getLlamaWithTimeout(
+    getLlama: (options: LlamaInitOptions) => Promise<Llama>,
+    options: LlamaInitOptions,
+    timeoutMs: number
+  ): Promise<Llama> {
+    let timeoutId: ReturnType<typeof setTimeout> | null = null;
+    try {
+      return await Promise.race([
+        getLlama(options),
+        new Promise<never>((_, reject) => {
+          timeoutId = setTimeout(() => {
+            if (!backendTimeoutWarned) {
+              backendTimeoutWarned = true;
+              console.warn(
+                `[llama] Backend initialization timed out after ${timeoutMs}ms`
+              );
+            }
+            reject(new Error(`Backend init timeout after ${timeoutMs}ms`));
+          }, timeoutMs);
+        }),
+      ]);
+    } finally {
+      if (timeoutId) {
+        clearTimeout(timeoutId);
+      }
+    }
+  }
   /**
    * Load a model by path.
    * Uses caching, inflight deduplication, and TTL-based disposal.