@gmickel/gno 1.5.1 → 1.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@gmickel/gno",
3
- "version": "1.5.1",
3
+ "version": "1.5.2",
4
4
  "description": "Local semantic search for your documents. Index Markdown, PDF, and Office files with hybrid BM25 + vector search.",
5
5
  "keywords": [
6
6
  "embeddings",
@@ -16,6 +16,7 @@ import { getIndexDbPath, getModelsCachePath } from "../../app/constants";
16
16
  import { getConfigPaths, isInitialized, loadConfig } from "../../config";
17
17
  import { getCodeChunkingStatus } from "../../ingestion/chunker";
18
18
  import { ModelCache } from "../../llm/cache";
19
+ import { LlmAdapter } from "../../llm/nodeLlamaCpp/adapter";
19
20
  import { getActivePreset } from "../../llm/registry";
20
21
  import { loadFts5Snowball } from "../../store/sqlite/fts5-snowball";
21
22
  import {
@@ -136,11 +137,10 @@ function checkCodeChunking(): DoctorCheck {
136
137
  };
137
138
  }
138
139
 
139
- async function checkNodeLlamaCpp(): Promise<DoctorCheck> {
140
+ async function checkNodeLlamaCpp(config: Config): Promise<DoctorCheck> {
141
+ const llm = new LlmAdapter(config);
140
142
  try {
141
- const { getLlama } = await import("node-llama-cpp");
142
- // Just check that we can get the llama instance
143
- await getLlama();
143
+ await llm.getManager().getLlama();
144
144
  return {
145
145
  name: "node-llama-cpp",
146
146
  status: "ok",
@@ -153,6 +153,8 @@ async function checkNodeLlamaCpp(): Promise<DoctorCheck> {
153
153
  status: "error",
154
154
  message: `node-llama-cpp failed: ${message}`,
155
155
  };
156
+ } finally {
157
+ await llm.dispose();
156
158
  }
157
159
  }
158
160
 
@@ -330,7 +332,7 @@ export async function doctor(
330
332
  checks.push(...modelChecks);
331
333
 
332
334
  // node-llama-cpp check
333
- checks.push(await checkNodeLlamaCpp());
335
+ checks.push(await checkNodeLlamaCpp(config));
334
336
 
335
337
  // SQLite extension checks
336
338
  const sqliteChecks = await checkSqliteExtensions();
@@ -4,6 +4,8 @@
4
4
  * @module src/llm/nodeLlamaCpp/embedding
5
5
  */
6
6
 
7
+ import { platform, totalmem } from "node:os";
8
+
7
9
  import type { EmbeddingPort, LlmResult } from "../types";
8
10
  import type { ModelManager } from "./lifecycle";
9
11
 
@@ -46,6 +48,58 @@ interface TokenizingModel {
46
48
  // gracefully if memory is tight.
47
49
  const MAX_EMBEDDING_CONTEXTS = 4;
48
50
  const TARGET_CORES_PER_EMBEDDING_CONTEXT = 4;
51
+ const LOW_MEMORY_WINDOWS_THRESHOLD_BYTES = 24 * 1024 * 1024 * 1024;
52
+ const LOW_MEMORY_WINDOWS_CONTEXTS = 1;
53
+ const DEFAULT_EMBEDDING_CONTEXT_SIZE = 2_048;
54
+
55
+ function resolveEmbeddingContextPoolOverride(
56
+ env: NodeJS.ProcessEnv = process.env
57
+ ): number | undefined {
58
+ const raw = env.GNO_EMBED_CONTEXTS;
59
+ if (!raw) {
60
+ return undefined;
61
+ }
62
+ const parsed = Number.parseInt(raw, 10);
63
+ if (!(Number.isFinite(parsed) && parsed > 0)) {
64
+ return undefined;
65
+ }
66
+ return Math.max(1, Math.min(MAX_EMBEDDING_CONTEXTS, parsed));
67
+ }
68
+
69
+ export function resolveEmbeddingContextPoolSize(options: {
70
+ gpu: Llama["gpu"];
71
+ cpuMathCores: number;
72
+ env?: NodeJS.ProcessEnv;
73
+ platformName?: NodeJS.Platform;
74
+ totalMemoryBytes?: number;
75
+ }): number {
76
+ if (options.gpu !== false) {
77
+ return 1;
78
+ }
79
+
80
+ const override = resolveEmbeddingContextPoolOverride(options.env);
81
+ if (override !== undefined) {
82
+ return override;
83
+ }
84
+
85
+ const platformName = options.platformName ?? platform();
86
+ const totalMemoryBytes = options.totalMemoryBytes ?? totalmem();
87
+ if (
88
+ platformName === "win32" &&
89
+ totalMemoryBytes <= LOW_MEMORY_WINDOWS_THRESHOLD_BYTES
90
+ ) {
91
+ return LOW_MEMORY_WINDOWS_CONTEXTS;
92
+ }
93
+
94
+ const cpuMathCores = Math.max(1, options.cpuMathCores);
95
+ return Math.max(
96
+ 1,
97
+ Math.min(
98
+ MAX_EMBEDDING_CONTEXTS,
99
+ Math.ceil(cpuMathCores / TARGET_CORES_PER_EMBEDDING_CONTEXT)
100
+ )
101
+ );
102
+ }
49
103
 
50
104
  // ─────────────────────────────────────────────────────────────────────────────
51
105
  // Implementation
@@ -58,6 +112,7 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
58
112
  private lifecycleVersion = 0;
59
113
  private dims: number | null = null;
60
114
  private llamaModel: TokenizingModel | null = null;
115
+ private embeddingContextSize = DEFAULT_EMBEDDING_CONTEXT_SIZE;
61
116
  private warnedSingleTruncation = false;
62
117
  private warnedBatchTruncation = false;
63
118
  private readonly manager: ModelManager;
@@ -250,18 +305,10 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
250
305
  }
251
306
 
252
307
  private resolveTargetPoolSize(llama: Llama): number {
253
- if (llama.gpu !== false) {
254
- return 1;
255
- }
256
-
257
- const cpuMathCores = Math.max(1, llama.cpuMathCores);
258
- return Math.max(
259
- 1,
260
- Math.min(
261
- MAX_EMBEDDING_CONTEXTS,
262
- Math.ceil(cpuMathCores / TARGET_CORES_PER_EMBEDDING_CONTEXT)
263
- )
264
- );
308
+ return resolveEmbeddingContextPoolSize({
309
+ gpu: llama.gpu,
310
+ cpuMathCores: llama.cpuMathCores,
311
+ });
265
312
  }
266
313
 
267
314
  private resolveThreadsPerContext(llama: Llama, poolSize: number): number {
@@ -294,7 +341,12 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
294
341
  targetPoolSize
295
342
  );
296
343
  const contextOptions =
297
- llama.gpu === false ? { threads: threadsPerContext } : undefined;
344
+ llama.gpu === false
345
+ ? {
346
+ contextSize: this.embeddingContextSize,
347
+ threads: threadsPerContext,
348
+ }
349
+ : { contextSize: this.embeddingContextSize };
298
350
  const contexts: LlamaEmbeddingContext[] = [];
299
351
 
300
352
  for (let i = 0; i < targetPoolSize; i += 1) {
@@ -350,16 +402,20 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
350
402
  mode: "single" | "batch"
351
403
  ): LlmResult<{ text: string }> {
352
404
  const model = this.llamaModel;
353
- const rawLimit =
405
+ const modelLimit =
354
406
  typeof model?.trainContextSize === "number" &&
355
407
  Number.isFinite(model.trainContextSize) &&
356
408
  model.trainContextSize > 0
357
409
  ? Math.floor(model.trainContextSize)
358
410
  : undefined;
359
- if (!model || rawLimit === undefined) {
411
+ if (!model) {
360
412
  return { ok: true, value: { text } };
361
413
  }
362
414
 
415
+ const rawLimit =
416
+ modelLimit === undefined
417
+ ? this.embeddingContextSize
418
+ : Math.min(modelLimit, this.embeddingContextSize);
363
419
  const limit = Math.max(1, rawLimit - 4);
364
420
  try {
365
421
  const tokens = model.tokenize(text);
@@ -5,6 +5,10 @@
5
5
  * @module src/llm/nodeLlamaCpp/lifecycle
6
6
  */
7
7
 
8
+ import type { LlamaOptions } from "node-llama-cpp";
9
+
10
+ import { platform } from "node:os";
11
+
8
12
  import type { ModelConfig } from "../../config/types";
9
13
  import type { LlmResult, LoadedModel, ModelType } from "../types";
10
14
 
@@ -17,6 +21,12 @@ import { loadFailedError, outOfMemoryError, timeoutError } from "../errors";
17
21
  type Llama = Awaited<ReturnType<typeof import("node-llama-cpp").getLlama>>;
18
22
  type LlamaModel = Awaited<ReturnType<Llama["loadModel"]>>;
19
23
  export type LlamaGpuMode = "auto" | "metal" | "vulkan" | "cuda" | false;
24
+ export type LlamaBuildMode = "never" | "autoAttempt";
25
+
26
+ type LlamaInitOptions = LlamaOptions & {
27
+ build: LlamaBuildMode;
28
+ gpu: LlamaGpuMode;
29
+ };
20
30
 
21
31
  interface CachedModel {
22
32
  uri: string;
@@ -26,7 +36,11 @@ interface CachedModel {
26
36
  }
27
37
 
28
38
  let invalidGpuModeWarned = false;
39
+ let invalidBuildModeWarned = false;
29
40
  let gpuFallbackWarned = false;
41
+ let backendTimeoutWarned = false;
42
+
43
+ const DEFAULT_BACKEND_INIT_TIMEOUT_MS = 30_000;
30
44
 
31
45
  export function resolveLlamaGpuMode(
32
46
  env: NodeJS.ProcessEnv = process.env
@@ -59,6 +73,56 @@ export function resolveLlamaGpuMode(
59
73
  return "auto";
60
74
  }
61
75
 
76
+ export function resolveLlamaBuildMode(
77
+ env: NodeJS.ProcessEnv = process.env
78
+ ): LlamaBuildMode {
79
+ const raw = (env.GNO_LLAMA_BUILD ?? "never").trim().toLowerCase();
80
+ if (
81
+ !raw ||
82
+ raw === "never" ||
83
+ raw === "prebuilt" ||
84
+ raw === "prebuilt-only"
85
+ ) {
86
+ return "never";
87
+ }
88
+ if (
89
+ raw === "autoattempt" ||
90
+ raw === "auto-attempt" ||
91
+ raw === "source" ||
92
+ raw === "build"
93
+ ) {
94
+ return "autoAttempt";
95
+ }
96
+ if (!invalidBuildModeWarned) {
97
+ invalidBuildModeWarned = true;
98
+ console.warn(`[llama] Invalid GNO_LLAMA_BUILD value "${raw}", using never`);
99
+ }
100
+ return "never";
101
+ }
102
+
103
+ export function resolveLlamaBackendInitTimeoutMs(
104
+ env: NodeJS.ProcessEnv = process.env
105
+ ): number {
106
+ const raw = env.GNO_LLAMA_INIT_TIMEOUT_MS;
107
+ if (!raw) {
108
+ return DEFAULT_BACKEND_INIT_TIMEOUT_MS;
109
+ }
110
+ const parsed = Number.parseInt(raw, 10);
111
+ return Number.isFinite(parsed) && parsed > 0
112
+ ? parsed
113
+ : DEFAULT_BACKEND_INIT_TIMEOUT_MS;
114
+ }
115
+
116
+ export function shouldRetryLlamaWithCpu(
117
+ gpu: LlamaGpuMode,
118
+ platformName = platform()
119
+ ): boolean {
120
+ if (gpu === false) {
121
+ return false;
122
+ }
123
+ return gpu !== "auto" || platformName === "win32";
124
+ }
125
+
62
126
  // ─────────────────────────────────────────────────────────────────────────────
63
127
  // ModelManager
64
128
  // ─────────────────────────────────────────────────────────────────────────────
@@ -84,15 +148,21 @@ export class ModelManager {
84
148
  if (!this.llama) {
85
149
  const { getLlama, LlamaLogLevel } = await import("node-llama-cpp");
86
150
  const gpu = resolveLlamaGpuMode();
151
+ const build = resolveLlamaBuildMode();
152
+ const timeoutMs = resolveLlamaBackendInitTimeoutMs();
87
153
  // Suppress model loading warnings (vocab tokens, pooling type)
88
154
  try {
89
- this.llama = await getLlama({
90
- build: "autoAttempt",
91
- gpu,
92
- logLevel: LlamaLogLevel.error,
93
- });
155
+ this.llama = await this.getLlamaWithTimeout(
156
+ getLlama,
157
+ {
158
+ build,
159
+ gpu,
160
+ logLevel: LlamaLogLevel.error,
161
+ },
162
+ timeoutMs
163
+ );
94
164
  } catch (error) {
95
- if (gpu === "auto" || gpu === false) {
165
+ if (!shouldRetryLlamaWithCpu(gpu)) {
96
166
  throw error;
97
167
  }
98
168
  if (!gpuFallbackWarned) {
@@ -103,16 +173,48 @@ export class ModelManager {
103
173
  }`
104
174
  );
105
175
  }
106
- this.llama = await getLlama({
107
- build: "autoAttempt",
108
- gpu: false,
109
- logLevel: LlamaLogLevel.error,
110
- });
176
+ this.llama = await this.getLlamaWithTimeout(
177
+ getLlama,
178
+ {
179
+ build,
180
+ gpu: false,
181
+ logLevel: LlamaLogLevel.error,
182
+ },
183
+ timeoutMs
184
+ );
111
185
  }
112
186
  }
113
187
  return this.llama;
114
188
  }
115
189
 
190
+ private async getLlamaWithTimeout(
191
+ getLlama: (options: LlamaInitOptions) => Promise<Llama>,
192
+ options: LlamaInitOptions,
193
+ timeoutMs: number
194
+ ): Promise<Llama> {
195
+ let timeoutId: ReturnType<typeof setTimeout> | null = null;
196
+ try {
197
+ return await Promise.race([
198
+ getLlama(options),
199
+ new Promise<never>((_, reject) => {
200
+ timeoutId = setTimeout(() => {
201
+ if (!backendTimeoutWarned) {
202
+ backendTimeoutWarned = true;
203
+ console.warn(
204
+ `[llama] Backend initialization timed out after ${timeoutMs}ms`
205
+ );
206
+ }
207
+ reject(new Error(`Backend init timeout after ${timeoutMs}ms`));
208
+ }, timeoutMs);
209
+ }),
210
+ ]);
211
+ } finally {
212
+ if (timeoutId) {
213
+ clearTimeout(timeoutId);
214
+ }
215
+ }
216
+ }
217
+
116
218
  /**
117
219
  * Load a model by path.
118
220
  * Uses caching, inflight deduplication, and TTL-based disposal.