@gmickel/gno 1.5.2 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@gmickel/gno",
|
|
3
|
-
"version": "1.5.2",
|
|
3
|
+
"version": "1.6.0",
|
|
4
4
|
"description": "Local semantic search for your documents. Index Markdown, PDF, and Office files with hybrid BM25 + vector search.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"embeddings",
|
|
@@ -74,6 +74,8 @@
|
|
|
74
74
|
"bench:code-embeddings:write": "bun scripts/code-embedding-benchmark.ts --write",
|
|
75
75
|
"bench:general-embeddings": "bun scripts/general-embedding-benchmark.ts",
|
|
76
76
|
"bench:general-embeddings:write": "bun scripts/general-embedding-benchmark.ts --write",
|
|
77
|
+
"bench:cpu-embeddings": "bun scripts/cpu-embed-autoresearch.ts",
|
|
78
|
+
"bench:cpu-embeddings:native-batch-probe": "bun scripts/native-embedding-batch-probe.ts",
|
|
77
79
|
"eval:retrieval-candidates": "bun scripts/retrieval-candidate-benchmark.ts",
|
|
78
80
|
"eval:retrieval-candidates:write": "bun scripts/retrieval-candidate-benchmark.ts --write",
|
|
79
81
|
"eval:watch": "bun --bun evalite watch",
|
|
@@ -39,6 +39,8 @@ interface TokenizingModel {
|
|
|
39
39
|
detokenize(tokens: readonly number[]): string;
|
|
40
40
|
}
|
|
41
41
|
|
|
42
|
+
type EmbeddingInput = Parameters<LlamaEmbeddingContext["getEmbeddingFor"]>[0];
|
|
43
|
+
|
|
42
44
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
43
45
|
// Constants
|
|
44
46
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
@@ -46,12 +48,19 @@ interface TokenizingModel {
|
|
|
46
48
|
// Aim for a small pool so CPU-only runs can exploit parallel contexts without
|
|
47
49
|
// multiplying RAM usage too aggressively. Additional contexts fall back
|
|
48
50
|
// gracefully if memory is tight.
|
|
49
|
-
const
|
|
51
|
+
const MAX_DEFAULT_EMBEDDING_CONTEXTS = 2;
|
|
52
|
+
const MAX_EMBEDDING_CONTEXTS_OVERRIDE = 4;
|
|
50
53
|
const TARGET_CORES_PER_EMBEDDING_CONTEXT = 4;
|
|
51
|
-
const
|
|
54
|
+
const CONSTRAINED_WINDOWS_THRESHOLD_BYTES = 16 * 1024 * 1024 * 1024;
|
|
55
|
+
const MID_MEMORY_WINDOWS_THRESHOLD_BYTES = 24 * 1024 * 1024 * 1024;
|
|
52
56
|
const LOW_MEMORY_WINDOWS_CONTEXTS = 1;
|
|
57
|
+
const MID_MEMORY_WINDOWS_CONTEXTS = 2;
|
|
53
58
|
const DEFAULT_EMBEDDING_CONTEXT_SIZE = 2_048;
|
|
54
59
|
|
|
60
|
+
function embeddingVectorToArray(vector: readonly number[]): number[] {
|
|
61
|
+
return Array.isArray(vector) ? (vector as number[]) : Array.from(vector);
|
|
62
|
+
}
|
|
63
|
+
|
|
55
64
|
function resolveEmbeddingContextPoolOverride(
|
|
56
65
|
env: NodeJS.ProcessEnv = process.env
|
|
57
66
|
): number | undefined {
|
|
@@ -63,7 +72,35 @@ function resolveEmbeddingContextPoolOverride(
|
|
|
63
72
|
if (!(Number.isFinite(parsed) && parsed > 0)) {
|
|
64
73
|
return undefined;
|
|
65
74
|
}
|
|
66
|
-
return Math.max(1, Math.min(
|
|
75
|
+
return Math.max(1, Math.min(MAX_EMBEDDING_CONTEXTS_OVERRIDE, parsed));
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
function resolveThreadsPerContextOverride(
|
|
79
|
+
env: NodeJS.ProcessEnv = process.env
|
|
80
|
+
): number | undefined {
|
|
81
|
+
const raw = env.GNO_EMBED_THREADS;
|
|
82
|
+
if (!raw) {
|
|
83
|
+
return undefined;
|
|
84
|
+
}
|
|
85
|
+
const parsed = Number.parseInt(raw, 10);
|
|
86
|
+
if (!(Number.isFinite(parsed) && parsed > 0)) {
|
|
87
|
+
return undefined;
|
|
88
|
+
}
|
|
89
|
+
return Math.max(1, parsed);
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
function resolveEmbeddingContextSizeOverride(
|
|
93
|
+
env: NodeJS.ProcessEnv = process.env
|
|
94
|
+
): number | undefined {
|
|
95
|
+
const raw = env.GNO_EMBED_CONTEXT_SIZE;
|
|
96
|
+
if (!raw) {
|
|
97
|
+
return undefined;
|
|
98
|
+
}
|
|
99
|
+
const parsed = Number.parseInt(raw, 10);
|
|
100
|
+
if (!(Number.isFinite(parsed) && parsed > 0)) {
|
|
101
|
+
return undefined;
|
|
102
|
+
}
|
|
103
|
+
return Math.max(128, parsed);
|
|
67
104
|
}
|
|
68
105
|
|
|
69
106
|
export function resolveEmbeddingContextPoolSize(options: {
|
|
@@ -86,19 +123,28 @@ export function resolveEmbeddingContextPoolSize(options: {
|
|
|
86
123
|
const totalMemoryBytes = options.totalMemoryBytes ?? totalmem();
|
|
87
124
|
if (
|
|
88
125
|
platformName === "win32" &&
|
|
89
|
-
totalMemoryBytes
|
|
126
|
+
totalMemoryBytes < CONSTRAINED_WINDOWS_THRESHOLD_BYTES
|
|
90
127
|
) {
|
|
91
128
|
return LOW_MEMORY_WINDOWS_CONTEXTS;
|
|
92
129
|
}
|
|
93
130
|
|
|
94
131
|
const cpuMathCores = Math.max(1, options.cpuMathCores);
|
|
95
|
-
|
|
132
|
+
const adaptivePoolSize = Math.max(
|
|
96
133
|
1,
|
|
97
134
|
Math.min(
|
|
98
|
-
|
|
135
|
+
MAX_DEFAULT_EMBEDDING_CONTEXTS,
|
|
99
136
|
Math.ceil(cpuMathCores / TARGET_CORES_PER_EMBEDDING_CONTEXT)
|
|
100
137
|
)
|
|
101
138
|
);
|
|
139
|
+
|
|
140
|
+
if (
|
|
141
|
+
platformName === "win32" &&
|
|
142
|
+
totalMemoryBytes < MID_MEMORY_WINDOWS_THRESHOLD_BYTES
|
|
143
|
+
) {
|
|
144
|
+
return Math.min(MID_MEMORY_WINDOWS_CONTEXTS, adaptivePoolSize);
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
return adaptivePoolSize;
|
|
102
148
|
}
|
|
103
149
|
|
|
104
150
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
@@ -145,9 +191,9 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
145
191
|
return { ok: false, error: prepared.error };
|
|
146
192
|
}
|
|
147
193
|
const embedding = await this.runOnWorker((worker) =>
|
|
148
|
-
worker.context.getEmbeddingFor(prepared.value.text)
|
|
194
|
+
worker.context.getEmbeddingFor(prepared.value.input)
|
|
149
195
|
);
|
|
150
|
-
const vector =
|
|
196
|
+
const vector = embeddingVectorToArray(embedding.vector);
|
|
151
197
|
|
|
152
198
|
// Cache dimensions on first call
|
|
153
199
|
if (this.dims === null) {
|
|
@@ -171,13 +217,13 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
171
217
|
}
|
|
172
218
|
|
|
173
219
|
try {
|
|
174
|
-
const
|
|
220
|
+
const preparedInputs: EmbeddingInput[] = [];
|
|
175
221
|
for (const text of texts) {
|
|
176
222
|
const prepared = this.truncateForEmbedding(text, "batch");
|
|
177
223
|
if (!prepared.ok) {
|
|
178
224
|
return { ok: false, error: prepared.error };
|
|
179
225
|
}
|
|
180
|
-
|
|
226
|
+
preparedInputs.push(prepared.value.input);
|
|
181
227
|
}
|
|
182
228
|
|
|
183
229
|
const allResults = Array.from(
|
|
@@ -191,16 +237,19 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
191
237
|
while (true) {
|
|
192
238
|
const index = nextIndex;
|
|
193
239
|
nextIndex += 1;
|
|
194
|
-
if (index >= preparedTexts.length) {
|
|
240
|
+
if (index >= preparedInputs.length) {
|
|
195
241
|
return;
|
|
196
242
|
}
|
|
197
243
|
|
|
244
|
+
const input = preparedInputs[index];
|
|
245
|
+
if (input === undefined) {
|
|
246
|
+
return;
|
|
247
|
+
}
|
|
198
248
|
const embedding = await this.runOnSpecificWorker(
|
|
199
249
|
worker,
|
|
200
|
-
(current) =>
|
|
201
|
-
current.context.getEmbeddingFor(preparedTexts[index] as string)
|
|
250
|
+
(current) => current.context.getEmbeddingFor(input)
|
|
202
251
|
);
|
|
203
|
-
allResults[index] =
|
|
252
|
+
allResults[index] = embeddingVectorToArray(embedding.vector);
|
|
204
253
|
}
|
|
205
254
|
})
|
|
206
255
|
);
|
|
@@ -316,6 +365,11 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
316
365
|
return 0;
|
|
317
366
|
}
|
|
318
367
|
|
|
368
|
+
const override = resolveThreadsPerContextOverride();
|
|
369
|
+
if (override !== undefined) {
|
|
370
|
+
return override;
|
|
371
|
+
}
|
|
372
|
+
|
|
319
373
|
return Math.max(1, Math.floor(Math.max(1, llama.cpuMathCores) / poolSize));
|
|
320
374
|
}
|
|
321
375
|
|
|
@@ -335,6 +389,8 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
335
389
|
this.llamaModel = llamaModel as TokenizingModel;
|
|
336
390
|
const llama = await this.manager.getLlama();
|
|
337
391
|
const lifecycleVersion = this.lifecycleVersion;
|
|
392
|
+
this.embeddingContextSize =
|
|
393
|
+
resolveEmbeddingContextSizeOverride() ?? DEFAULT_EMBEDDING_CONTEXT_SIZE;
|
|
338
394
|
const targetPoolSize = this.resolveTargetPoolSize(llama);
|
|
339
395
|
const threadsPerContext = this.resolveThreadsPerContext(
|
|
340
396
|
llama,
|
|
@@ -400,7 +456,7 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
400
456
|
private truncateForEmbedding(
|
|
401
457
|
text: string,
|
|
402
458
|
mode: "single" | "batch"
|
|
403
|
-
): LlmResult<{ text: string }> {
|
|
459
|
+
): LlmResult<{ input: EmbeddingInput }> {
|
|
404
460
|
const model = this.llamaModel;
|
|
405
461
|
const modelLimit =
|
|
406
462
|
typeof model?.trainContextSize === "number" &&
|
|
@@ -409,7 +465,7 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
409
465
|
? Math.floor(model.trainContextSize)
|
|
410
466
|
: undefined;
|
|
411
467
|
if (!model) {
|
|
412
|
-
return { ok: true, value: { text } };
|
|
468
|
+
return { ok: true, value: { input: text } };
|
|
413
469
|
}
|
|
414
470
|
|
|
415
471
|
const rawLimit =
|
|
@@ -420,10 +476,13 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
420
476
|
try {
|
|
421
477
|
const tokens = model.tokenize(text);
|
|
422
478
|
if (tokens.length <= limit) {
|
|
423
|
-
return {
|
|
479
|
+
return {
|
|
480
|
+
ok: true,
|
|
481
|
+
value: { input: tokens as EmbeddingInput },
|
|
482
|
+
};
|
|
424
483
|
}
|
|
425
484
|
|
|
426
|
-
const
|
|
485
|
+
const truncatedTokens = tokens.slice(0, limit);
|
|
427
486
|
const shouldWarn =
|
|
428
487
|
mode === "single"
|
|
429
488
|
? !this.warnedSingleTruncation
|
|
@@ -438,7 +497,10 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
438
497
|
`[llama] Truncated embedding input from ${tokens.length} to ${limit} tokens`
|
|
439
498
|
);
|
|
440
499
|
}
|
|
441
|
-
return {
|
|
500
|
+
return {
|
|
501
|
+
ok: true,
|
|
502
|
+
value: { input: truncatedTokens as EmbeddingInput },
|
|
503
|
+
};
|
|
442
504
|
} catch (error) {
|
|
443
505
|
return { ok: false, error: inferenceFailedError(this.modelUri, error) };
|
|
444
506
|
}
|