@gmickel/gno 1.5.1 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@gmickel/gno",
3
- "version": "1.5.1",
3
+ "version": "1.6.0",
4
4
  "description": "Local semantic search for your documents. Index Markdown, PDF, and Office files with hybrid BM25 + vector search.",
5
5
  "keywords": [
6
6
  "embeddings",
@@ -74,6 +74,8 @@
74
74
  "bench:code-embeddings:write": "bun scripts/code-embedding-benchmark.ts --write",
75
75
  "bench:general-embeddings": "bun scripts/general-embedding-benchmark.ts",
76
76
  "bench:general-embeddings:write": "bun scripts/general-embedding-benchmark.ts --write",
77
+ "bench:cpu-embeddings": "bun scripts/cpu-embed-autoresearch.ts",
78
+ "bench:cpu-embeddings:native-batch-probe": "bun scripts/native-embedding-batch-probe.ts",
77
79
  "eval:retrieval-candidates": "bun scripts/retrieval-candidate-benchmark.ts",
78
80
  "eval:retrieval-candidates:write": "bun scripts/retrieval-candidate-benchmark.ts --write",
79
81
  "eval:watch": "bun --bun evalite watch",
@@ -16,6 +16,7 @@ import { getIndexDbPath, getModelsCachePath } from "../../app/constants";
16
16
  import { getConfigPaths, isInitialized, loadConfig } from "../../config";
17
17
  import { getCodeChunkingStatus } from "../../ingestion/chunker";
18
18
  import { ModelCache } from "../../llm/cache";
19
+ import { LlmAdapter } from "../../llm/nodeLlamaCpp/adapter";
19
20
  import { getActivePreset } from "../../llm/registry";
20
21
  import { loadFts5Snowball } from "../../store/sqlite/fts5-snowball";
21
22
  import {
@@ -136,11 +137,10 @@ function checkCodeChunking(): DoctorCheck {
136
137
  };
137
138
  }
138
139
 
139
- async function checkNodeLlamaCpp(): Promise<DoctorCheck> {
140
+ async function checkNodeLlamaCpp(config: Config): Promise<DoctorCheck> {
141
+ const llm = new LlmAdapter(config);
140
142
  try {
141
- const { getLlama } = await import("node-llama-cpp");
142
- // Just check that we can get the llama instance
143
- await getLlama();
143
+ await llm.getManager().getLlama();
144
144
  return {
145
145
  name: "node-llama-cpp",
146
146
  status: "ok",
@@ -153,6 +153,8 @@ async function checkNodeLlamaCpp(): Promise<DoctorCheck> {
153
153
  status: "error",
154
154
  message: `node-llama-cpp failed: ${message}`,
155
155
  };
156
+ } finally {
157
+ await llm.dispose();
156
158
  }
157
159
  }
158
160
 
@@ -330,7 +332,7 @@ export async function doctor(
330
332
  checks.push(...modelChecks);
331
333
 
332
334
  // node-llama-cpp check
333
- checks.push(await checkNodeLlamaCpp());
335
+ checks.push(await checkNodeLlamaCpp(config));
334
336
 
335
337
  // SQLite extension checks
336
338
  const sqliteChecks = await checkSqliteExtensions();
@@ -4,6 +4,8 @@
4
4
  * @module src/llm/nodeLlamaCpp/embedding
5
5
  */
6
6
 
7
+ import { platform, totalmem } from "node:os";
8
+
7
9
  import type { EmbeddingPort, LlmResult } from "../types";
8
10
  import type { ModelManager } from "./lifecycle";
9
11
 
@@ -37,6 +39,8 @@ interface TokenizingModel {
37
39
  detokenize(tokens: readonly number[]): string;
38
40
  }
39
41
 
42
+ type EmbeddingInput = Parameters<LlamaEmbeddingContext["getEmbeddingFor"]>[0];
43
+
40
44
  // ─────────────────────────────────────────────────────────────────────────────
41
45
  // Constants
42
46
  // ─────────────────────────────────────────────────────────────────────────────
@@ -44,8 +48,104 @@ interface TokenizingModel {
44
48
  // Aim for a small pool so CPU-only runs can exploit parallel contexts without
45
49
  // multiplying RAM usage too aggressively. Additional contexts fall back
46
50
  // gracefully if memory is tight.
47
- const MAX_EMBEDDING_CONTEXTS = 4;
51
+ const MAX_DEFAULT_EMBEDDING_CONTEXTS = 2;
52
+ const MAX_EMBEDDING_CONTEXTS_OVERRIDE = 4;
48
53
  const TARGET_CORES_PER_EMBEDDING_CONTEXT = 4;
54
+ const CONSTRAINED_WINDOWS_THRESHOLD_BYTES = 16 * 1024 * 1024 * 1024;
55
+ const MID_MEMORY_WINDOWS_THRESHOLD_BYTES = 24 * 1024 * 1024 * 1024;
56
+ const LOW_MEMORY_WINDOWS_CONTEXTS = 1;
57
+ const MID_MEMORY_WINDOWS_CONTEXTS = 2;
58
+ const DEFAULT_EMBEDDING_CONTEXT_SIZE = 2_048;
59
+
60
+ function embeddingVectorToArray(vector: readonly number[]): number[] {
61
+ return Array.isArray(vector) ? (vector as number[]) : Array.from(vector);
62
+ }
63
+
64
+ function resolveEmbeddingContextPoolOverride(
65
+ env: NodeJS.ProcessEnv = process.env
66
+ ): number | undefined {
67
+ const raw = env.GNO_EMBED_CONTEXTS;
68
+ if (!raw) {
69
+ return undefined;
70
+ }
71
+ const parsed = Number.parseInt(raw, 10);
72
+ if (!(Number.isFinite(parsed) && parsed > 0)) {
73
+ return undefined;
74
+ }
75
+ return Math.max(1, Math.min(MAX_EMBEDDING_CONTEXTS_OVERRIDE, parsed));
76
+ }
77
+
78
+ function resolveThreadsPerContextOverride(
79
+ env: NodeJS.ProcessEnv = process.env
80
+ ): number | undefined {
81
+ const raw = env.GNO_EMBED_THREADS;
82
+ if (!raw) {
83
+ return undefined;
84
+ }
85
+ const parsed = Number.parseInt(raw, 10);
86
+ if (!(Number.isFinite(parsed) && parsed > 0)) {
87
+ return undefined;
88
+ }
89
+ return Math.max(1, parsed);
90
+ }
91
+
92
+ function resolveEmbeddingContextSizeOverride(
93
+ env: NodeJS.ProcessEnv = process.env
94
+ ): number | undefined {
95
+ const raw = env.GNO_EMBED_CONTEXT_SIZE;
96
+ if (!raw) {
97
+ return undefined;
98
+ }
99
+ const parsed = Number.parseInt(raw, 10);
100
+ if (!(Number.isFinite(parsed) && parsed > 0)) {
101
+ return undefined;
102
+ }
103
+ return Math.max(128, parsed);
104
+ }
105
+
106
+ export function resolveEmbeddingContextPoolSize(options: {
107
+ gpu: Llama["gpu"];
108
+ cpuMathCores: number;
109
+ env?: NodeJS.ProcessEnv;
110
+ platformName?: NodeJS.Platform;
111
+ totalMemoryBytes?: number;
112
+ }): number {
113
+ if (options.gpu !== false) {
114
+ return 1;
115
+ }
116
+
117
+ const override = resolveEmbeddingContextPoolOverride(options.env);
118
+ if (override !== undefined) {
119
+ return override;
120
+ }
121
+
122
+ const platformName = options.platformName ?? platform();
123
+ const totalMemoryBytes = options.totalMemoryBytes ?? totalmem();
124
+ if (
125
+ platformName === "win32" &&
126
+ totalMemoryBytes < CONSTRAINED_WINDOWS_THRESHOLD_BYTES
127
+ ) {
128
+ return LOW_MEMORY_WINDOWS_CONTEXTS;
129
+ }
130
+
131
+ const cpuMathCores = Math.max(1, options.cpuMathCores);
132
+ const adaptivePoolSize = Math.max(
133
+ 1,
134
+ Math.min(
135
+ MAX_DEFAULT_EMBEDDING_CONTEXTS,
136
+ Math.ceil(cpuMathCores / TARGET_CORES_PER_EMBEDDING_CONTEXT)
137
+ )
138
+ );
139
+
140
+ if (
141
+ platformName === "win32" &&
142
+ totalMemoryBytes < MID_MEMORY_WINDOWS_THRESHOLD_BYTES
143
+ ) {
144
+ return Math.min(MID_MEMORY_WINDOWS_CONTEXTS, adaptivePoolSize);
145
+ }
146
+
147
+ return adaptivePoolSize;
148
+ }
49
149
 
50
150
  // ─────────────────────────────────────────────────────────────────────────────
51
151
  // Implementation
@@ -58,6 +158,7 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
58
158
  private lifecycleVersion = 0;
59
159
  private dims: number | null = null;
60
160
  private llamaModel: TokenizingModel | null = null;
161
+ private embeddingContextSize = DEFAULT_EMBEDDING_CONTEXT_SIZE;
61
162
  private warnedSingleTruncation = false;
62
163
  private warnedBatchTruncation = false;
63
164
  private readonly manager: ModelManager;
@@ -90,9 +191,9 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
90
191
  return { ok: false, error: prepared.error };
91
192
  }
92
193
  const embedding = await this.runOnWorker((worker) =>
93
- worker.context.getEmbeddingFor(prepared.value.text)
194
+ worker.context.getEmbeddingFor(prepared.value.input)
94
195
  );
95
- const vector = Array.from(embedding.vector) as number[];
196
+ const vector = embeddingVectorToArray(embedding.vector);
96
197
 
97
198
  // Cache dimensions on first call
98
199
  if (this.dims === null) {
@@ -116,13 +217,13 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
116
217
  }
117
218
 
118
219
  try {
119
- const preparedTexts: string[] = [];
220
+ const preparedInputs: EmbeddingInput[] = [];
120
221
  for (const text of texts) {
121
222
  const prepared = this.truncateForEmbedding(text, "batch");
122
223
  if (!prepared.ok) {
123
224
  return { ok: false, error: prepared.error };
124
225
  }
125
- preparedTexts.push(prepared.value.text);
226
+ preparedInputs.push(prepared.value.input);
126
227
  }
127
228
 
128
229
  const allResults = Array.from(
@@ -136,16 +237,19 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
136
237
  while (true) {
137
238
  const index = nextIndex;
138
239
  nextIndex += 1;
139
- if (index >= preparedTexts.length) {
240
+ if (index >= preparedInputs.length) {
140
241
  return;
141
242
  }
142
243
 
244
+ const input = preparedInputs[index];
245
+ if (input === undefined) {
246
+ return;
247
+ }
143
248
  const embedding = await this.runOnSpecificWorker(
144
249
  worker,
145
- (current) =>
146
- current.context.getEmbeddingFor(preparedTexts[index] as string)
250
+ (current) => current.context.getEmbeddingFor(input)
147
251
  );
148
- allResults[index] = Array.from(embedding.vector) as number[];
252
+ allResults[index] = embeddingVectorToArray(embedding.vector);
149
253
  }
150
254
  })
151
255
  );
@@ -250,18 +354,10 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
250
354
  }
251
355
 
252
356
  private resolveTargetPoolSize(llama: Llama): number {
253
- if (llama.gpu !== false) {
254
- return 1;
255
- }
256
-
257
- const cpuMathCores = Math.max(1, llama.cpuMathCores);
258
- return Math.max(
259
- 1,
260
- Math.min(
261
- MAX_EMBEDDING_CONTEXTS,
262
- Math.ceil(cpuMathCores / TARGET_CORES_PER_EMBEDDING_CONTEXT)
263
- )
264
- );
357
+ return resolveEmbeddingContextPoolSize({
358
+ gpu: llama.gpu,
359
+ cpuMathCores: llama.cpuMathCores,
360
+ });
265
361
  }
266
362
 
267
363
  private resolveThreadsPerContext(llama: Llama, poolSize: number): number {
@@ -269,6 +365,11 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
269
365
  return 0;
270
366
  }
271
367
 
368
+ const override = resolveThreadsPerContextOverride();
369
+ if (override !== undefined) {
370
+ return override;
371
+ }
372
+
272
373
  return Math.max(1, Math.floor(Math.max(1, llama.cpuMathCores) / poolSize));
273
374
  }
274
375
 
@@ -288,13 +389,20 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
288
389
  this.llamaModel = llamaModel as TokenizingModel;
289
390
  const llama = await this.manager.getLlama();
290
391
  const lifecycleVersion = this.lifecycleVersion;
392
+ this.embeddingContextSize =
393
+ resolveEmbeddingContextSizeOverride() ?? DEFAULT_EMBEDDING_CONTEXT_SIZE;
291
394
  const targetPoolSize = this.resolveTargetPoolSize(llama);
292
395
  const threadsPerContext = this.resolveThreadsPerContext(
293
396
  llama,
294
397
  targetPoolSize
295
398
  );
296
399
  const contextOptions =
297
- llama.gpu === false ? { threads: threadsPerContext } : undefined;
400
+ llama.gpu === false
401
+ ? {
402
+ contextSize: this.embeddingContextSize,
403
+ threads: threadsPerContext,
404
+ }
405
+ : { contextSize: this.embeddingContextSize };
298
406
  const contexts: LlamaEmbeddingContext[] = [];
299
407
 
300
408
  for (let i = 0; i < targetPoolSize; i += 1) {
@@ -348,26 +456,33 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
348
456
  private truncateForEmbedding(
349
457
  text: string,
350
458
  mode: "single" | "batch"
351
- ): LlmResult<{ text: string }> {
459
+ ): LlmResult<{ input: EmbeddingInput }> {
352
460
  const model = this.llamaModel;
353
- const rawLimit =
461
+ const modelLimit =
354
462
  typeof model?.trainContextSize === "number" &&
355
463
  Number.isFinite(model.trainContextSize) &&
356
464
  model.trainContextSize > 0
357
465
  ? Math.floor(model.trainContextSize)
358
466
  : undefined;
359
- if (!model || rawLimit === undefined) {
360
- return { ok: true, value: { text } };
467
+ if (!model) {
468
+ return { ok: true, value: { input: text } };
361
469
  }
362
470
 
471
+ const rawLimit =
472
+ modelLimit === undefined
473
+ ? this.embeddingContextSize
474
+ : Math.min(modelLimit, this.embeddingContextSize);
363
475
  const limit = Math.max(1, rawLimit - 4);
364
476
  try {
365
477
  const tokens = model.tokenize(text);
366
478
  if (tokens.length <= limit) {
367
- return { ok: true, value: { text } };
479
+ return {
480
+ ok: true,
481
+ value: { input: tokens as EmbeddingInput },
482
+ };
368
483
  }
369
484
 
370
- const truncatedText = model.detokenize(tokens.slice(0, limit));
485
+ const truncatedTokens = tokens.slice(0, limit);
371
486
  const shouldWarn =
372
487
  mode === "single"
373
488
  ? !this.warnedSingleTruncation
@@ -382,7 +497,10 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
382
497
  `[llama] Truncated embedding input from ${tokens.length} to ${limit} tokens`
383
498
  );
384
499
  }
385
- return { ok: true, value: { text: truncatedText } };
500
+ return {
501
+ ok: true,
502
+ value: { input: truncatedTokens as EmbeddingInput },
503
+ };
386
504
  } catch (error) {
387
505
  return { ok: false, error: inferenceFailedError(this.modelUri, error) };
388
506
  }
@@ -5,6 +5,10 @@
5
5
  * @module src/llm/nodeLlamaCpp/lifecycle
6
6
  */
7
7
 
8
+ import type { LlamaOptions } from "node-llama-cpp";
9
+
10
+ import { platform } from "node:os";
11
+
8
12
  import type { ModelConfig } from "../../config/types";
9
13
  import type { LlmResult, LoadedModel, ModelType } from "../types";
10
14
 
@@ -17,6 +21,12 @@ import { loadFailedError, outOfMemoryError, timeoutError } from "../errors";
17
21
  type Llama = Awaited<ReturnType<typeof import("node-llama-cpp").getLlama>>;
18
22
  type LlamaModel = Awaited<ReturnType<Llama["loadModel"]>>;
19
23
  export type LlamaGpuMode = "auto" | "metal" | "vulkan" | "cuda" | false;
24
+ export type LlamaBuildMode = "never" | "autoAttempt";
25
+
26
+ type LlamaInitOptions = LlamaOptions & {
27
+ build: LlamaBuildMode;
28
+ gpu: LlamaGpuMode;
29
+ };
20
30
 
21
31
  interface CachedModel {
22
32
  uri: string;
@@ -26,7 +36,11 @@ interface CachedModel {
26
36
  }
27
37
 
28
38
  let invalidGpuModeWarned = false;
39
+ let invalidBuildModeWarned = false;
29
40
  let gpuFallbackWarned = false;
41
+ let backendTimeoutWarned = false;
42
+
43
+ const DEFAULT_BACKEND_INIT_TIMEOUT_MS = 30_000;
30
44
 
31
45
  export function resolveLlamaGpuMode(
32
46
  env: NodeJS.ProcessEnv = process.env
@@ -59,6 +73,56 @@ export function resolveLlamaGpuMode(
59
73
  return "auto";
60
74
  }
61
75
 
76
+ export function resolveLlamaBuildMode(
77
+ env: NodeJS.ProcessEnv = process.env
78
+ ): LlamaBuildMode {
79
+ const raw = (env.GNO_LLAMA_BUILD ?? "never").trim().toLowerCase();
80
+ if (
81
+ !raw ||
82
+ raw === "never" ||
83
+ raw === "prebuilt" ||
84
+ raw === "prebuilt-only"
85
+ ) {
86
+ return "never";
87
+ }
88
+ if (
89
+ raw === "autoattempt" ||
90
+ raw === "auto-attempt" ||
91
+ raw === "source" ||
92
+ raw === "build"
93
+ ) {
94
+ return "autoAttempt";
95
+ }
96
+ if (!invalidBuildModeWarned) {
97
+ invalidBuildModeWarned = true;
98
+ console.warn(`[llama] Invalid GNO_LLAMA_BUILD value "${raw}", using never`);
99
+ }
100
+ return "never";
101
+ }
102
+
103
+ export function resolveLlamaBackendInitTimeoutMs(
104
+ env: NodeJS.ProcessEnv = process.env
105
+ ): number {
106
+ const raw = env.GNO_LLAMA_INIT_TIMEOUT_MS;
107
+ if (!raw) {
108
+ return DEFAULT_BACKEND_INIT_TIMEOUT_MS;
109
+ }
110
+ const parsed = Number.parseInt(raw, 10);
111
+ return Number.isFinite(parsed) && parsed > 0
112
+ ? parsed
113
+ : DEFAULT_BACKEND_INIT_TIMEOUT_MS;
114
+ }
115
+
116
+ export function shouldRetryLlamaWithCpu(
117
+ gpu: LlamaGpuMode,
118
+ platformName = platform()
119
+ ): boolean {
120
+ if (gpu === false) {
121
+ return false;
122
+ }
123
+ return gpu !== "auto" || platformName === "win32";
124
+ }
125
+
62
126
  // ─────────────────────────────────────────────────────────────────────────────
63
127
  // ModelManager
64
128
  // ─────────────────────────────────────────────────────────────────────────────
@@ -84,15 +148,21 @@ export class ModelManager {
84
148
  if (!this.llama) {
85
149
  const { getLlama, LlamaLogLevel } = await import("node-llama-cpp");
86
150
  const gpu = resolveLlamaGpuMode();
151
+ const build = resolveLlamaBuildMode();
152
+ const timeoutMs = resolveLlamaBackendInitTimeoutMs();
87
153
  // Suppress model loading warnings (vocab tokens, pooling type)
88
154
  try {
89
- this.llama = await getLlama({
90
- build: "autoAttempt",
91
- gpu,
92
- logLevel: LlamaLogLevel.error,
93
- });
155
+ this.llama = await this.getLlamaWithTimeout(
156
+ getLlama,
157
+ {
158
+ build,
159
+ gpu,
160
+ logLevel: LlamaLogLevel.error,
161
+ },
162
+ timeoutMs
163
+ );
94
164
  } catch (error) {
95
- if (gpu === "auto" || gpu === false) {
165
+ if (!shouldRetryLlamaWithCpu(gpu)) {
96
166
  throw error;
97
167
  }
98
168
  if (!gpuFallbackWarned) {
@@ -103,16 +173,48 @@ export class ModelManager {
103
173
  }`
104
174
  );
105
175
  }
106
- this.llama = await getLlama({
107
- build: "autoAttempt",
108
- gpu: false,
109
- logLevel: LlamaLogLevel.error,
110
- });
176
+ this.llama = await this.getLlamaWithTimeout(
177
+ getLlama,
178
+ {
179
+ build,
180
+ gpu: false,
181
+ logLevel: LlamaLogLevel.error,
182
+ },
183
+ timeoutMs
184
+ );
111
185
  }
112
186
  }
113
187
  return this.llama;
114
188
  }
115
189
 
190
+ private async getLlamaWithTimeout(
191
+ getLlama: (options: LlamaInitOptions) => Promise<Llama>,
192
+ options: LlamaInitOptions,
193
+ timeoutMs: number
194
+ ): Promise<Llama> {
195
+ let timeoutId: ReturnType<typeof setTimeout> | null = null;
196
+ try {
197
+ return await Promise.race([
198
+ getLlama(options),
199
+ new Promise<never>((_, reject) => {
200
+ timeoutId = setTimeout(() => {
201
+ if (!backendTimeoutWarned) {
202
+ backendTimeoutWarned = true;
203
+ console.warn(
204
+ `[llama] Backend initialization timed out after ${timeoutMs}ms`
205
+ );
206
+ }
207
+ reject(new Error(`Backend init timeout after ${timeoutMs}ms`));
208
+ }, timeoutMs);
209
+ }),
210
+ ]);
211
+ } finally {
212
+ if (timeoutId) {
213
+ clearTimeout(timeoutId);
214
+ }
215
+ }
216
+ }
217
+
116
218
  /**
117
219
  * Load a model by path.
118
220
  * Uses caching, inflight deduplication, and TTL-based disposal.