@gmickel/gno 1.5.2 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@gmickel/gno",
3
- "version": "1.5.2",
3
+ "version": "1.6.0",
4
4
  "description": "Local semantic search for your documents. Index Markdown, PDF, and Office files with hybrid BM25 + vector search.",
5
5
  "keywords": [
6
6
  "embeddings",
@@ -74,6 +74,8 @@
74
74
  "bench:code-embeddings:write": "bun scripts/code-embedding-benchmark.ts --write",
75
75
  "bench:general-embeddings": "bun scripts/general-embedding-benchmark.ts",
76
76
  "bench:general-embeddings:write": "bun scripts/general-embedding-benchmark.ts --write",
77
+ "bench:cpu-embeddings": "bun scripts/cpu-embed-autoresearch.ts",
78
+ "bench:cpu-embeddings:native-batch-probe": "bun scripts/native-embedding-batch-probe.ts",
77
79
  "eval:retrieval-candidates": "bun scripts/retrieval-candidate-benchmark.ts",
78
80
  "eval:retrieval-candidates:write": "bun scripts/retrieval-candidate-benchmark.ts --write",
79
81
  "eval:watch": "bun --bun evalite watch",
@@ -39,6 +39,8 @@ interface TokenizingModel {
39
39
  detokenize(tokens: readonly number[]): string;
40
40
  }
41
41
 
42
+ type EmbeddingInput = Parameters<LlamaEmbeddingContext["getEmbeddingFor"]>[0];
43
+
42
44
  // ─────────────────────────────────────────────────────────────────────────────
43
45
  // Constants
44
46
  // ─────────────────────────────────────────────────────────────────────────────
@@ -46,12 +48,19 @@ interface TokenizingModel {
46
48
  // Aim for a small pool so CPU-only runs can exploit parallel contexts without
47
49
  // multiplying RAM usage too aggressively. Additional contexts fall back
48
50
  // gracefully if memory is tight.
49
- const MAX_EMBEDDING_CONTEXTS = 4;
51
+ const MAX_DEFAULT_EMBEDDING_CONTEXTS = 2;
52
+ const MAX_EMBEDDING_CONTEXTS_OVERRIDE = 4;
50
53
  const TARGET_CORES_PER_EMBEDDING_CONTEXT = 4;
51
- const LOW_MEMORY_WINDOWS_THRESHOLD_BYTES = 24 * 1024 * 1024 * 1024;
54
+ const CONSTRAINED_WINDOWS_THRESHOLD_BYTES = 16 * 1024 * 1024 * 1024;
55
+ const MID_MEMORY_WINDOWS_THRESHOLD_BYTES = 24 * 1024 * 1024 * 1024;
52
56
  const LOW_MEMORY_WINDOWS_CONTEXTS = 1;
57
+ const MID_MEMORY_WINDOWS_CONTEXTS = 2;
53
58
  const DEFAULT_EMBEDDING_CONTEXT_SIZE = 2_048;
54
59
 
60
+ function embeddingVectorToArray(vector: readonly number[]): number[] {
61
+ return Array.isArray(vector) ? (vector as number[]) : Array.from(vector);
62
+ }
63
+
55
64
  function resolveEmbeddingContextPoolOverride(
56
65
  env: NodeJS.ProcessEnv = process.env
57
66
  ): number | undefined {
@@ -63,7 +72,35 @@ function resolveEmbeddingContextPoolOverride(
63
72
  if (!(Number.isFinite(parsed) && parsed > 0)) {
64
73
  return undefined;
65
74
  }
66
- return Math.max(1, Math.min(MAX_EMBEDDING_CONTEXTS, parsed));
75
+ return Math.max(1, Math.min(MAX_EMBEDDING_CONTEXTS_OVERRIDE, parsed));
76
+ }
77
+
78
+ function resolveThreadsPerContextOverride(
79
+ env: NodeJS.ProcessEnv = process.env
80
+ ): number | undefined {
81
+ const raw = env.GNO_EMBED_THREADS;
82
+ if (!raw) {
83
+ return undefined;
84
+ }
85
+ const parsed = Number.parseInt(raw, 10);
86
+ if (!(Number.isFinite(parsed) && parsed > 0)) {
87
+ return undefined;
88
+ }
89
+ return Math.max(1, parsed);
90
+ }
91
+
92
+ function resolveEmbeddingContextSizeOverride(
93
+ env: NodeJS.ProcessEnv = process.env
94
+ ): number | undefined {
95
+ const raw = env.GNO_EMBED_CONTEXT_SIZE;
96
+ if (!raw) {
97
+ return undefined;
98
+ }
99
+ const parsed = Number.parseInt(raw, 10);
100
+ if (!(Number.isFinite(parsed) && parsed > 0)) {
101
+ return undefined;
102
+ }
103
+ return Math.max(128, parsed);
67
104
  }
68
105
 
69
106
  export function resolveEmbeddingContextPoolSize(options: {
@@ -86,19 +123,28 @@ export function resolveEmbeddingContextPoolSize(options: {
86
123
  const totalMemoryBytes = options.totalMemoryBytes ?? totalmem();
87
124
  if (
88
125
  platformName === "win32" &&
89
- totalMemoryBytes <= LOW_MEMORY_WINDOWS_THRESHOLD_BYTES
126
+ totalMemoryBytes < CONSTRAINED_WINDOWS_THRESHOLD_BYTES
90
127
  ) {
91
128
  return LOW_MEMORY_WINDOWS_CONTEXTS;
92
129
  }
93
130
 
94
131
  const cpuMathCores = Math.max(1, options.cpuMathCores);
95
- return Math.max(
132
+ const adaptivePoolSize = Math.max(
96
133
  1,
97
134
  Math.min(
98
- MAX_EMBEDDING_CONTEXTS,
135
+ MAX_DEFAULT_EMBEDDING_CONTEXTS,
99
136
  Math.ceil(cpuMathCores / TARGET_CORES_PER_EMBEDDING_CONTEXT)
100
137
  )
101
138
  );
139
+
140
+ if (
141
+ platformName === "win32" &&
142
+ totalMemoryBytes < MID_MEMORY_WINDOWS_THRESHOLD_BYTES
143
+ ) {
144
+ return Math.min(MID_MEMORY_WINDOWS_CONTEXTS, adaptivePoolSize);
145
+ }
146
+
147
+ return adaptivePoolSize;
102
148
  }
103
149
 
104
150
  // ─────────────────────────────────────────────────────────────────────────────
@@ -145,9 +191,9 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
145
191
  return { ok: false, error: prepared.error };
146
192
  }
147
193
  const embedding = await this.runOnWorker((worker) =>
148
- worker.context.getEmbeddingFor(prepared.value.text)
194
+ worker.context.getEmbeddingFor(prepared.value.input)
149
195
  );
150
- const vector = Array.from(embedding.vector) as number[];
196
+ const vector = embeddingVectorToArray(embedding.vector);
151
197
 
152
198
  // Cache dimensions on first call
153
199
  if (this.dims === null) {
@@ -171,13 +217,13 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
171
217
  }
172
218
 
173
219
  try {
174
- const preparedTexts: string[] = [];
220
+ const preparedInputs: EmbeddingInput[] = [];
175
221
  for (const text of texts) {
176
222
  const prepared = this.truncateForEmbedding(text, "batch");
177
223
  if (!prepared.ok) {
178
224
  return { ok: false, error: prepared.error };
179
225
  }
180
- preparedTexts.push(prepared.value.text);
226
+ preparedInputs.push(prepared.value.input);
181
227
  }
182
228
 
183
229
  const allResults = Array.from(
@@ -191,16 +237,19 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
191
237
  while (true) {
192
238
  const index = nextIndex;
193
239
  nextIndex += 1;
194
- if (index >= preparedTexts.length) {
240
+ if (index >= preparedInputs.length) {
195
241
  return;
196
242
  }
197
243
 
244
+ const input = preparedInputs[index];
245
+ if (input === undefined) {
246
+ return;
247
+ }
198
248
  const embedding = await this.runOnSpecificWorker(
199
249
  worker,
200
- (current) =>
201
- current.context.getEmbeddingFor(preparedTexts[index] as string)
250
+ (current) => current.context.getEmbeddingFor(input)
202
251
  );
203
- allResults[index] = Array.from(embedding.vector) as number[];
252
+ allResults[index] = embeddingVectorToArray(embedding.vector);
204
253
  }
205
254
  })
206
255
  );
@@ -316,6 +365,11 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
316
365
  return 0;
317
366
  }
318
367
 
368
+ const override = resolveThreadsPerContextOverride();
369
+ if (override !== undefined) {
370
+ return override;
371
+ }
372
+
319
373
  return Math.max(1, Math.floor(Math.max(1, llama.cpuMathCores) / poolSize));
320
374
  }
321
375
 
@@ -335,6 +389,8 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
335
389
  this.llamaModel = llamaModel as TokenizingModel;
336
390
  const llama = await this.manager.getLlama();
337
391
  const lifecycleVersion = this.lifecycleVersion;
392
+ this.embeddingContextSize =
393
+ resolveEmbeddingContextSizeOverride() ?? DEFAULT_EMBEDDING_CONTEXT_SIZE;
338
394
  const targetPoolSize = this.resolveTargetPoolSize(llama);
339
395
  const threadsPerContext = this.resolveThreadsPerContext(
340
396
  llama,
@@ -400,7 +456,7 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
400
456
  private truncateForEmbedding(
401
457
  text: string,
402
458
  mode: "single" | "batch"
403
- ): LlmResult<{ text: string }> {
459
+ ): LlmResult<{ input: EmbeddingInput }> {
404
460
  const model = this.llamaModel;
405
461
  const modelLimit =
406
462
  typeof model?.trainContextSize === "number" &&
@@ -409,7 +465,7 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
409
465
  ? Math.floor(model.trainContextSize)
410
466
  : undefined;
411
467
  if (!model) {
412
- return { ok: true, value: { text } };
468
+ return { ok: true, value: { input: text } };
413
469
  }
414
470
 
415
471
  const rawLimit =
@@ -420,10 +476,13 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
420
476
  try {
421
477
  const tokens = model.tokenize(text);
422
478
  if (tokens.length <= limit) {
423
- return { ok: true, value: { text } };
479
+ return {
480
+ ok: true,
481
+ value: { input: tokens as EmbeddingInput },
482
+ };
424
483
  }
425
484
 
426
- const truncatedText = model.detokenize(tokens.slice(0, limit));
485
+ const truncatedTokens = tokens.slice(0, limit);
427
486
  const shouldWarn =
428
487
  mode === "single"
429
488
  ? !this.warnedSingleTruncation
@@ -438,7 +497,10 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
438
497
  `[llama] Truncated embedding input from ${tokens.length} to ${limit} tokens`
439
498
  );
440
499
  }
441
- return { ok: true, value: { text: truncatedText } };
500
+ return {
501
+ ok: true,
502
+ value: { input: truncatedTokens as EmbeddingInput },
503
+ };
442
504
  } catch (error) {
443
505
  return { ok: false, error: inferenceFailedError(this.modelUri, error) };
444
506
  }