@gmickel/gno 1.5.1 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@gmickel/gno",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.6.0",
|
|
4
4
|
"description": "Local semantic search for your documents. Index Markdown, PDF, and Office files with hybrid BM25 + vector search.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"embeddings",
|
|
@@ -74,6 +74,8 @@
|
|
|
74
74
|
"bench:code-embeddings:write": "bun scripts/code-embedding-benchmark.ts --write",
|
|
75
75
|
"bench:general-embeddings": "bun scripts/general-embedding-benchmark.ts",
|
|
76
76
|
"bench:general-embeddings:write": "bun scripts/general-embedding-benchmark.ts --write",
|
|
77
|
+
"bench:cpu-embeddings": "bun scripts/cpu-embed-autoresearch.ts",
|
|
78
|
+
"bench:cpu-embeddings:native-batch-probe": "bun scripts/native-embedding-batch-probe.ts",
|
|
77
79
|
"eval:retrieval-candidates": "bun scripts/retrieval-candidate-benchmark.ts",
|
|
78
80
|
"eval:retrieval-candidates:write": "bun scripts/retrieval-candidate-benchmark.ts --write",
|
|
79
81
|
"eval:watch": "bun --bun evalite watch",
|
|
@@ -16,6 +16,7 @@ import { getIndexDbPath, getModelsCachePath } from "../../app/constants";
|
|
|
16
16
|
import { getConfigPaths, isInitialized, loadConfig } from "../../config";
|
|
17
17
|
import { getCodeChunkingStatus } from "../../ingestion/chunker";
|
|
18
18
|
import { ModelCache } from "../../llm/cache";
|
|
19
|
+
import { LlmAdapter } from "../../llm/nodeLlamaCpp/adapter";
|
|
19
20
|
import { getActivePreset } from "../../llm/registry";
|
|
20
21
|
import { loadFts5Snowball } from "../../store/sqlite/fts5-snowball";
|
|
21
22
|
import {
|
|
@@ -136,11 +137,10 @@ function checkCodeChunking(): DoctorCheck {
|
|
|
136
137
|
};
|
|
137
138
|
}
|
|
138
139
|
|
|
139
|
-
async function checkNodeLlamaCpp(): Promise<DoctorCheck> {
|
|
140
|
+
async function checkNodeLlamaCpp(config: Config): Promise<DoctorCheck> {
|
|
141
|
+
const llm = new LlmAdapter(config);
|
|
140
142
|
try {
|
|
141
|
-
|
|
142
|
-
// Just check that we can get the llama instance
|
|
143
|
-
await getLlama();
|
|
143
|
+
await llm.getManager().getLlama();
|
|
144
144
|
return {
|
|
145
145
|
name: "node-llama-cpp",
|
|
146
146
|
status: "ok",
|
|
@@ -153,6 +153,8 @@ async function checkNodeLlamaCpp(): Promise<DoctorCheck> {
|
|
|
153
153
|
status: "error",
|
|
154
154
|
message: `node-llama-cpp failed: ${message}`,
|
|
155
155
|
};
|
|
156
|
+
} finally {
|
|
157
|
+
await llm.dispose();
|
|
156
158
|
}
|
|
157
159
|
}
|
|
158
160
|
|
|
@@ -330,7 +332,7 @@ export async function doctor(
|
|
|
330
332
|
checks.push(...modelChecks);
|
|
331
333
|
|
|
332
334
|
// node-llama-cpp check
|
|
333
|
-
checks.push(await checkNodeLlamaCpp());
|
|
335
|
+
checks.push(await checkNodeLlamaCpp(config));
|
|
334
336
|
|
|
335
337
|
// SQLite extension checks
|
|
336
338
|
const sqliteChecks = await checkSqliteExtensions();
|
|
@@ -4,6 +4,8 @@
|
|
|
4
4
|
* @module src/llm/nodeLlamaCpp/embedding
|
|
5
5
|
*/
|
|
6
6
|
|
|
7
|
+
import { platform, totalmem } from "node:os";
|
|
8
|
+
|
|
7
9
|
import type { EmbeddingPort, LlmResult } from "../types";
|
|
8
10
|
import type { ModelManager } from "./lifecycle";
|
|
9
11
|
|
|
@@ -37,6 +39,8 @@ interface TokenizingModel {
|
|
|
37
39
|
detokenize(tokens: readonly number[]): string;
|
|
38
40
|
}
|
|
39
41
|
|
|
42
|
+
type EmbeddingInput = Parameters<LlamaEmbeddingContext["getEmbeddingFor"]>[0];
|
|
43
|
+
|
|
40
44
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
41
45
|
// Constants
|
|
42
46
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
@@ -44,8 +48,104 @@ interface TokenizingModel {
|
|
|
44
48
|
// Aim for a small pool so CPU-only runs can exploit parallel contexts without
|
|
45
49
|
// multiplying RAM usage too aggressively. Additional contexts fall back
|
|
46
50
|
// gracefully if memory is tight.
|
|
47
|
-
const
|
|
51
|
+
const MAX_DEFAULT_EMBEDDING_CONTEXTS = 2;
|
|
52
|
+
const MAX_EMBEDDING_CONTEXTS_OVERRIDE = 4;
|
|
48
53
|
const TARGET_CORES_PER_EMBEDDING_CONTEXT = 4;
|
|
54
|
+
const CONSTRAINED_WINDOWS_THRESHOLD_BYTES = 16 * 1024 * 1024 * 1024;
|
|
55
|
+
const MID_MEMORY_WINDOWS_THRESHOLD_BYTES = 24 * 1024 * 1024 * 1024;
|
|
56
|
+
const LOW_MEMORY_WINDOWS_CONTEXTS = 1;
|
|
57
|
+
const MID_MEMORY_WINDOWS_CONTEXTS = 2;
|
|
58
|
+
const DEFAULT_EMBEDDING_CONTEXT_SIZE = 2_048;
|
|
59
|
+
|
|
60
|
+
function embeddingVectorToArray(vector: readonly number[]): number[] {
|
|
61
|
+
return Array.isArray(vector) ? (vector as number[]) : Array.from(vector);
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
function resolveEmbeddingContextPoolOverride(
|
|
65
|
+
env: NodeJS.ProcessEnv = process.env
|
|
66
|
+
): number | undefined {
|
|
67
|
+
const raw = env.GNO_EMBED_CONTEXTS;
|
|
68
|
+
if (!raw) {
|
|
69
|
+
return undefined;
|
|
70
|
+
}
|
|
71
|
+
const parsed = Number.parseInt(raw, 10);
|
|
72
|
+
if (!(Number.isFinite(parsed) && parsed > 0)) {
|
|
73
|
+
return undefined;
|
|
74
|
+
}
|
|
75
|
+
return Math.max(1, Math.min(MAX_EMBEDDING_CONTEXTS_OVERRIDE, parsed));
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
function resolveThreadsPerContextOverride(
|
|
79
|
+
env: NodeJS.ProcessEnv = process.env
|
|
80
|
+
): number | undefined {
|
|
81
|
+
const raw = env.GNO_EMBED_THREADS;
|
|
82
|
+
if (!raw) {
|
|
83
|
+
return undefined;
|
|
84
|
+
}
|
|
85
|
+
const parsed = Number.parseInt(raw, 10);
|
|
86
|
+
if (!(Number.isFinite(parsed) && parsed > 0)) {
|
|
87
|
+
return undefined;
|
|
88
|
+
}
|
|
89
|
+
return Math.max(1, parsed);
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
function resolveEmbeddingContextSizeOverride(
|
|
93
|
+
env: NodeJS.ProcessEnv = process.env
|
|
94
|
+
): number | undefined {
|
|
95
|
+
const raw = env.GNO_EMBED_CONTEXT_SIZE;
|
|
96
|
+
if (!raw) {
|
|
97
|
+
return undefined;
|
|
98
|
+
}
|
|
99
|
+
const parsed = Number.parseInt(raw, 10);
|
|
100
|
+
if (!(Number.isFinite(parsed) && parsed > 0)) {
|
|
101
|
+
return undefined;
|
|
102
|
+
}
|
|
103
|
+
return Math.max(128, parsed);
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
export function resolveEmbeddingContextPoolSize(options: {
|
|
107
|
+
gpu: Llama["gpu"];
|
|
108
|
+
cpuMathCores: number;
|
|
109
|
+
env?: NodeJS.ProcessEnv;
|
|
110
|
+
platformName?: NodeJS.Platform;
|
|
111
|
+
totalMemoryBytes?: number;
|
|
112
|
+
}): number {
|
|
113
|
+
if (options.gpu !== false) {
|
|
114
|
+
return 1;
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
const override = resolveEmbeddingContextPoolOverride(options.env);
|
|
118
|
+
if (override !== undefined) {
|
|
119
|
+
return override;
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
const platformName = options.platformName ?? platform();
|
|
123
|
+
const totalMemoryBytes = options.totalMemoryBytes ?? totalmem();
|
|
124
|
+
if (
|
|
125
|
+
platformName === "win32" &&
|
|
126
|
+
totalMemoryBytes < CONSTRAINED_WINDOWS_THRESHOLD_BYTES
|
|
127
|
+
) {
|
|
128
|
+
return LOW_MEMORY_WINDOWS_CONTEXTS;
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
const cpuMathCores = Math.max(1, options.cpuMathCores);
|
|
132
|
+
const adaptivePoolSize = Math.max(
|
|
133
|
+
1,
|
|
134
|
+
Math.min(
|
|
135
|
+
MAX_DEFAULT_EMBEDDING_CONTEXTS,
|
|
136
|
+
Math.ceil(cpuMathCores / TARGET_CORES_PER_EMBEDDING_CONTEXT)
|
|
137
|
+
)
|
|
138
|
+
);
|
|
139
|
+
|
|
140
|
+
if (
|
|
141
|
+
platformName === "win32" &&
|
|
142
|
+
totalMemoryBytes < MID_MEMORY_WINDOWS_THRESHOLD_BYTES
|
|
143
|
+
) {
|
|
144
|
+
return Math.min(MID_MEMORY_WINDOWS_CONTEXTS, adaptivePoolSize);
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
return adaptivePoolSize;
|
|
148
|
+
}
|
|
49
149
|
|
|
50
150
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
51
151
|
// Implementation
|
|
@@ -58,6 +158,7 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
58
158
|
private lifecycleVersion = 0;
|
|
59
159
|
private dims: number | null = null;
|
|
60
160
|
private llamaModel: TokenizingModel | null = null;
|
|
161
|
+
private embeddingContextSize = DEFAULT_EMBEDDING_CONTEXT_SIZE;
|
|
61
162
|
private warnedSingleTruncation = false;
|
|
62
163
|
private warnedBatchTruncation = false;
|
|
63
164
|
private readonly manager: ModelManager;
|
|
@@ -90,9 +191,9 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
90
191
|
return { ok: false, error: prepared.error };
|
|
91
192
|
}
|
|
92
193
|
const embedding = await this.runOnWorker((worker) =>
|
|
93
|
-
worker.context.getEmbeddingFor(prepared.value.
|
|
194
|
+
worker.context.getEmbeddingFor(prepared.value.input)
|
|
94
195
|
);
|
|
95
|
-
const vector =
|
|
196
|
+
const vector = embeddingVectorToArray(embedding.vector);
|
|
96
197
|
|
|
97
198
|
// Cache dimensions on first call
|
|
98
199
|
if (this.dims === null) {
|
|
@@ -116,13 +217,13 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
116
217
|
}
|
|
117
218
|
|
|
118
219
|
try {
|
|
119
|
-
const
|
|
220
|
+
const preparedInputs: EmbeddingInput[] = [];
|
|
120
221
|
for (const text of texts) {
|
|
121
222
|
const prepared = this.truncateForEmbedding(text, "batch");
|
|
122
223
|
if (!prepared.ok) {
|
|
123
224
|
return { ok: false, error: prepared.error };
|
|
124
225
|
}
|
|
125
|
-
|
|
226
|
+
preparedInputs.push(prepared.value.input);
|
|
126
227
|
}
|
|
127
228
|
|
|
128
229
|
const allResults = Array.from(
|
|
@@ -136,16 +237,19 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
136
237
|
while (true) {
|
|
137
238
|
const index = nextIndex;
|
|
138
239
|
nextIndex += 1;
|
|
139
|
-
if (index >=
|
|
240
|
+
if (index >= preparedInputs.length) {
|
|
140
241
|
return;
|
|
141
242
|
}
|
|
142
243
|
|
|
244
|
+
const input = preparedInputs[index];
|
|
245
|
+
if (input === undefined) {
|
|
246
|
+
return;
|
|
247
|
+
}
|
|
143
248
|
const embedding = await this.runOnSpecificWorker(
|
|
144
249
|
worker,
|
|
145
|
-
(current) =>
|
|
146
|
-
current.context.getEmbeddingFor(preparedTexts[index] as string)
|
|
250
|
+
(current) => current.context.getEmbeddingFor(input)
|
|
147
251
|
);
|
|
148
|
-
allResults[index] =
|
|
252
|
+
allResults[index] = embeddingVectorToArray(embedding.vector);
|
|
149
253
|
}
|
|
150
254
|
})
|
|
151
255
|
);
|
|
@@ -250,18 +354,10 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
250
354
|
}
|
|
251
355
|
|
|
252
356
|
private resolveTargetPoolSize(llama: Llama): number {
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
const cpuMathCores = Math.max(1, llama.cpuMathCores);
|
|
258
|
-
return Math.max(
|
|
259
|
-
1,
|
|
260
|
-
Math.min(
|
|
261
|
-
MAX_EMBEDDING_CONTEXTS,
|
|
262
|
-
Math.ceil(cpuMathCores / TARGET_CORES_PER_EMBEDDING_CONTEXT)
|
|
263
|
-
)
|
|
264
|
-
);
|
|
357
|
+
return resolveEmbeddingContextPoolSize({
|
|
358
|
+
gpu: llama.gpu,
|
|
359
|
+
cpuMathCores: llama.cpuMathCores,
|
|
360
|
+
});
|
|
265
361
|
}
|
|
266
362
|
|
|
267
363
|
private resolveThreadsPerContext(llama: Llama, poolSize: number): number {
|
|
@@ -269,6 +365,11 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
269
365
|
return 0;
|
|
270
366
|
}
|
|
271
367
|
|
|
368
|
+
const override = resolveThreadsPerContextOverride();
|
|
369
|
+
if (override !== undefined) {
|
|
370
|
+
return override;
|
|
371
|
+
}
|
|
372
|
+
|
|
272
373
|
return Math.max(1, Math.floor(Math.max(1, llama.cpuMathCores) / poolSize));
|
|
273
374
|
}
|
|
274
375
|
|
|
@@ -288,13 +389,20 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
288
389
|
this.llamaModel = llamaModel as TokenizingModel;
|
|
289
390
|
const llama = await this.manager.getLlama();
|
|
290
391
|
const lifecycleVersion = this.lifecycleVersion;
|
|
392
|
+
this.embeddingContextSize =
|
|
393
|
+
resolveEmbeddingContextSizeOverride() ?? DEFAULT_EMBEDDING_CONTEXT_SIZE;
|
|
291
394
|
const targetPoolSize = this.resolveTargetPoolSize(llama);
|
|
292
395
|
const threadsPerContext = this.resolveThreadsPerContext(
|
|
293
396
|
llama,
|
|
294
397
|
targetPoolSize
|
|
295
398
|
);
|
|
296
399
|
const contextOptions =
|
|
297
|
-
llama.gpu === false
|
|
400
|
+
llama.gpu === false
|
|
401
|
+
? {
|
|
402
|
+
contextSize: this.embeddingContextSize,
|
|
403
|
+
threads: threadsPerContext,
|
|
404
|
+
}
|
|
405
|
+
: { contextSize: this.embeddingContextSize };
|
|
298
406
|
const contexts: LlamaEmbeddingContext[] = [];
|
|
299
407
|
|
|
300
408
|
for (let i = 0; i < targetPoolSize; i += 1) {
|
|
@@ -348,26 +456,33 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
348
456
|
private truncateForEmbedding(
|
|
349
457
|
text: string,
|
|
350
458
|
mode: "single" | "batch"
|
|
351
|
-
): LlmResult<{
|
|
459
|
+
): LlmResult<{ input: EmbeddingInput }> {
|
|
352
460
|
const model = this.llamaModel;
|
|
353
|
-
const
|
|
461
|
+
const modelLimit =
|
|
354
462
|
typeof model?.trainContextSize === "number" &&
|
|
355
463
|
Number.isFinite(model.trainContextSize) &&
|
|
356
464
|
model.trainContextSize > 0
|
|
357
465
|
? Math.floor(model.trainContextSize)
|
|
358
466
|
: undefined;
|
|
359
|
-
if (!model
|
|
360
|
-
return { ok: true, value: { text } };
|
|
467
|
+
if (!model) {
|
|
468
|
+
return { ok: true, value: { input: text } };
|
|
361
469
|
}
|
|
362
470
|
|
|
471
|
+
const rawLimit =
|
|
472
|
+
modelLimit === undefined
|
|
473
|
+
? this.embeddingContextSize
|
|
474
|
+
: Math.min(modelLimit, this.embeddingContextSize);
|
|
363
475
|
const limit = Math.max(1, rawLimit - 4);
|
|
364
476
|
try {
|
|
365
477
|
const tokens = model.tokenize(text);
|
|
366
478
|
if (tokens.length <= limit) {
|
|
367
|
-
return {
|
|
479
|
+
return {
|
|
480
|
+
ok: true,
|
|
481
|
+
value: { input: tokens as EmbeddingInput },
|
|
482
|
+
};
|
|
368
483
|
}
|
|
369
484
|
|
|
370
|
-
const
|
|
485
|
+
const truncatedTokens = tokens.slice(0, limit);
|
|
371
486
|
const shouldWarn =
|
|
372
487
|
mode === "single"
|
|
373
488
|
? !this.warnedSingleTruncation
|
|
@@ -382,7 +497,10 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
382
497
|
`[llama] Truncated embedding input from ${tokens.length} to ${limit} tokens`
|
|
383
498
|
);
|
|
384
499
|
}
|
|
385
|
-
return {
|
|
500
|
+
return {
|
|
501
|
+
ok: true,
|
|
502
|
+
value: { input: truncatedTokens as EmbeddingInput },
|
|
503
|
+
};
|
|
386
504
|
} catch (error) {
|
|
387
505
|
return { ok: false, error: inferenceFailedError(this.modelUri, error) };
|
|
388
506
|
}
|
|
@@ -5,6 +5,10 @@
|
|
|
5
5
|
* @module src/llm/nodeLlamaCpp/lifecycle
|
|
6
6
|
*/
|
|
7
7
|
|
|
8
|
+
import type { LlamaOptions } from "node-llama-cpp";
|
|
9
|
+
|
|
10
|
+
import { platform } from "node:os";
|
|
11
|
+
|
|
8
12
|
import type { ModelConfig } from "../../config/types";
|
|
9
13
|
import type { LlmResult, LoadedModel, ModelType } from "../types";
|
|
10
14
|
|
|
@@ -17,6 +21,12 @@ import { loadFailedError, outOfMemoryError, timeoutError } from "../errors";
|
|
|
17
21
|
type Llama = Awaited<ReturnType<typeof import("node-llama-cpp").getLlama>>;
|
|
18
22
|
type LlamaModel = Awaited<ReturnType<Llama["loadModel"]>>;
|
|
19
23
|
export type LlamaGpuMode = "auto" | "metal" | "vulkan" | "cuda" | false;
|
|
24
|
+
export type LlamaBuildMode = "never" | "autoAttempt";
|
|
25
|
+
|
|
26
|
+
type LlamaInitOptions = LlamaOptions & {
|
|
27
|
+
build: LlamaBuildMode;
|
|
28
|
+
gpu: LlamaGpuMode;
|
|
29
|
+
};
|
|
20
30
|
|
|
21
31
|
interface CachedModel {
|
|
22
32
|
uri: string;
|
|
@@ -26,7 +36,11 @@ interface CachedModel {
|
|
|
26
36
|
}
|
|
27
37
|
|
|
28
38
|
let invalidGpuModeWarned = false;
|
|
39
|
+
let invalidBuildModeWarned = false;
|
|
29
40
|
let gpuFallbackWarned = false;
|
|
41
|
+
let backendTimeoutWarned = false;
|
|
42
|
+
|
|
43
|
+
const DEFAULT_BACKEND_INIT_TIMEOUT_MS = 30_000;
|
|
30
44
|
|
|
31
45
|
export function resolveLlamaGpuMode(
|
|
32
46
|
env: NodeJS.ProcessEnv = process.env
|
|
@@ -59,6 +73,56 @@ export function resolveLlamaGpuMode(
|
|
|
59
73
|
return "auto";
|
|
60
74
|
}
|
|
61
75
|
|
|
76
|
+
export function resolveLlamaBuildMode(
|
|
77
|
+
env: NodeJS.ProcessEnv = process.env
|
|
78
|
+
): LlamaBuildMode {
|
|
79
|
+
const raw = (env.GNO_LLAMA_BUILD ?? "never").trim().toLowerCase();
|
|
80
|
+
if (
|
|
81
|
+
!raw ||
|
|
82
|
+
raw === "never" ||
|
|
83
|
+
raw === "prebuilt" ||
|
|
84
|
+
raw === "prebuilt-only"
|
|
85
|
+
) {
|
|
86
|
+
return "never";
|
|
87
|
+
}
|
|
88
|
+
if (
|
|
89
|
+
raw === "autoattempt" ||
|
|
90
|
+
raw === "auto-attempt" ||
|
|
91
|
+
raw === "source" ||
|
|
92
|
+
raw === "build"
|
|
93
|
+
) {
|
|
94
|
+
return "autoAttempt";
|
|
95
|
+
}
|
|
96
|
+
if (!invalidBuildModeWarned) {
|
|
97
|
+
invalidBuildModeWarned = true;
|
|
98
|
+
console.warn(`[llama] Invalid GNO_LLAMA_BUILD value "${raw}", using never`);
|
|
99
|
+
}
|
|
100
|
+
return "never";
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
export function resolveLlamaBackendInitTimeoutMs(
|
|
104
|
+
env: NodeJS.ProcessEnv = process.env
|
|
105
|
+
): number {
|
|
106
|
+
const raw = env.GNO_LLAMA_INIT_TIMEOUT_MS;
|
|
107
|
+
if (!raw) {
|
|
108
|
+
return DEFAULT_BACKEND_INIT_TIMEOUT_MS;
|
|
109
|
+
}
|
|
110
|
+
const parsed = Number.parseInt(raw, 10);
|
|
111
|
+
return Number.isFinite(parsed) && parsed > 0
|
|
112
|
+
? parsed
|
|
113
|
+
: DEFAULT_BACKEND_INIT_TIMEOUT_MS;
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
export function shouldRetryLlamaWithCpu(
|
|
117
|
+
gpu: LlamaGpuMode,
|
|
118
|
+
platformName = platform()
|
|
119
|
+
): boolean {
|
|
120
|
+
if (gpu === false) {
|
|
121
|
+
return false;
|
|
122
|
+
}
|
|
123
|
+
return gpu !== "auto" || platformName === "win32";
|
|
124
|
+
}
|
|
125
|
+
|
|
62
126
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
63
127
|
// ModelManager
|
|
64
128
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
@@ -84,15 +148,21 @@ export class ModelManager {
|
|
|
84
148
|
if (!this.llama) {
|
|
85
149
|
const { getLlama, LlamaLogLevel } = await import("node-llama-cpp");
|
|
86
150
|
const gpu = resolveLlamaGpuMode();
|
|
151
|
+
const build = resolveLlamaBuildMode();
|
|
152
|
+
const timeoutMs = resolveLlamaBackendInitTimeoutMs();
|
|
87
153
|
// Suppress model loading warnings (vocab tokens, pooling type)
|
|
88
154
|
try {
|
|
89
|
-
this.llama = await
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
155
|
+
this.llama = await this.getLlamaWithTimeout(
|
|
156
|
+
getLlama,
|
|
157
|
+
{
|
|
158
|
+
build,
|
|
159
|
+
gpu,
|
|
160
|
+
logLevel: LlamaLogLevel.error,
|
|
161
|
+
},
|
|
162
|
+
timeoutMs
|
|
163
|
+
);
|
|
94
164
|
} catch (error) {
|
|
95
|
-
if (gpu
|
|
165
|
+
if (!shouldRetryLlamaWithCpu(gpu)) {
|
|
96
166
|
throw error;
|
|
97
167
|
}
|
|
98
168
|
if (!gpuFallbackWarned) {
|
|
@@ -103,16 +173,48 @@ export class ModelManager {
|
|
|
103
173
|
}`
|
|
104
174
|
);
|
|
105
175
|
}
|
|
106
|
-
this.llama = await
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
176
|
+
this.llama = await this.getLlamaWithTimeout(
|
|
177
|
+
getLlama,
|
|
178
|
+
{
|
|
179
|
+
build,
|
|
180
|
+
gpu: false,
|
|
181
|
+
logLevel: LlamaLogLevel.error,
|
|
182
|
+
},
|
|
183
|
+
timeoutMs
|
|
184
|
+
);
|
|
111
185
|
}
|
|
112
186
|
}
|
|
113
187
|
return this.llama;
|
|
114
188
|
}
|
|
115
189
|
|
|
190
|
+
private async getLlamaWithTimeout(
|
|
191
|
+
getLlama: (options: LlamaInitOptions) => Promise<Llama>,
|
|
192
|
+
options: LlamaInitOptions,
|
|
193
|
+
timeoutMs: number
|
|
194
|
+
): Promise<Llama> {
|
|
195
|
+
let timeoutId: ReturnType<typeof setTimeout> | null = null;
|
|
196
|
+
try {
|
|
197
|
+
return await Promise.race([
|
|
198
|
+
getLlama(options),
|
|
199
|
+
new Promise<never>((_, reject) => {
|
|
200
|
+
timeoutId = setTimeout(() => {
|
|
201
|
+
if (!backendTimeoutWarned) {
|
|
202
|
+
backendTimeoutWarned = true;
|
|
203
|
+
console.warn(
|
|
204
|
+
`[llama] Backend initialization timed out after ${timeoutMs}ms`
|
|
205
|
+
);
|
|
206
|
+
}
|
|
207
|
+
reject(new Error(`Backend init timeout after ${timeoutMs}ms`));
|
|
208
|
+
}, timeoutMs);
|
|
209
|
+
}),
|
|
210
|
+
]);
|
|
211
|
+
} finally {
|
|
212
|
+
if (timeoutId) {
|
|
213
|
+
clearTimeout(timeoutId);
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
}
|
|
217
|
+
|
|
116
218
|
/**
|
|
117
219
|
* Load a model by path.
|
|
118
220
|
* Uses caching, inflight deduplication, and TTL-based disposal.
|