@gmickel/gno 1.5.0 → 1.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
package/src/cli/commands/doctor.ts
CHANGED
|
@@ -16,6 +16,7 @@ import { getIndexDbPath, getModelsCachePath } from "../../app/constants";
|
|
|
16
16
|
import { getConfigPaths, isInitialized, loadConfig } from "../../config";
|
|
17
17
|
import { getCodeChunkingStatus } from "../../ingestion/chunker";
|
|
18
18
|
import { ModelCache } from "../../llm/cache";
|
|
19
|
+
import { LlmAdapter } from "../../llm/nodeLlamaCpp/adapter";
|
|
19
20
|
import { getActivePreset } from "../../llm/registry";
|
|
20
21
|
import { loadFts5Snowball } from "../../store/sqlite/fts5-snowball";
|
|
21
22
|
import {
|
|
@@ -136,11 +137,10 @@ function checkCodeChunking(): DoctorCheck {
|
|
|
136
137
|
};
|
|
137
138
|
}
|
|
138
139
|
|
|
139
|
-
async function checkNodeLlamaCpp(): Promise<DoctorCheck> {
|
|
140
|
+
async function checkNodeLlamaCpp(config: Config): Promise<DoctorCheck> {
|
|
141
|
+
const llm = new LlmAdapter(config);
|
|
140
142
|
try {
|
|
141
|
-
|
|
142
|
-
// Just check that we can get the llama instance
|
|
143
|
-
await getLlama();
|
|
143
|
+
await llm.getManager().getLlama();
|
|
144
144
|
return {
|
|
145
145
|
name: "node-llama-cpp",
|
|
146
146
|
status: "ok",
|
|
@@ -153,6 +153,8 @@ async function checkNodeLlamaCpp(): Promise<DoctorCheck> {
|
|
|
153
153
|
status: "error",
|
|
154
154
|
message: `node-llama-cpp failed: ${message}`,
|
|
155
155
|
};
|
|
156
|
+
} finally {
|
|
157
|
+
await llm.dispose();
|
|
156
158
|
}
|
|
157
159
|
}
|
|
158
160
|
|
|
@@ -330,7 +332,7 @@ export async function doctor(
|
|
|
330
332
|
checks.push(...modelChecks);
|
|
331
333
|
|
|
332
334
|
// node-llama-cpp check
|
|
333
|
-
checks.push(await checkNodeLlamaCpp());
|
|
335
|
+
checks.push(await checkNodeLlamaCpp(config));
|
|
334
336
|
|
|
335
337
|
// SQLite extension checks
|
|
336
338
|
const sqliteChecks = await checkSqliteExtensions();
|
package/src/cli/program.ts
CHANGED
|
@@ -1165,8 +1165,6 @@ function wireRetrievalCommands(program: Command): void {
|
|
|
1165
1165
|
|
|
1166
1166
|
function wireMcpCommand(program: Command): void {
|
|
1167
1167
|
// mcp - Start MCP server (stdio transport) or manage MCP configuration
|
|
1168
|
-
// CRITICAL: helpOption(false) on server command prevents --help from writing
|
|
1169
|
-
// to stdout which would corrupt the JSON-RPC stream
|
|
1170
1168
|
const mcpCmd = program
|
|
1171
1169
|
.command("mcp")
|
|
1172
1170
|
.description("MCP server and configuration");
|
|
@@ -1175,7 +1173,6 @@ function wireMcpCommand(program: Command): void {
|
|
|
1175
1173
|
mcpCmd
|
|
1176
1174
|
.command("serve", { isDefault: true })
|
|
1177
1175
|
.description("Start MCP server (stdio transport)")
|
|
1178
|
-
.helpOption(false)
|
|
1179
1176
|
.option(
|
|
1180
1177
|
"--enable-write",
|
|
1181
1178
|
"Enable write operations (capture, add-collection, sync, remove-collection)"
|
|
package/src/llm/nodeLlamaCpp/embedding.ts
CHANGED
|
@@ -4,6 +4,8 @@
|
|
|
4
4
|
* @module src/llm/nodeLlamaCpp/embedding
|
|
5
5
|
*/
|
|
6
6
|
|
|
7
|
+
import { platform, totalmem } from "node:os";
|
|
8
|
+
|
|
7
9
|
import type { EmbeddingPort, LlmResult } from "../types";
|
|
8
10
|
import type { ModelManager } from "./lifecycle";
|
|
9
11
|
|
|
@@ -46,6 +48,58 @@ interface TokenizingModel {
|
|
|
46
48
|
// gracefully if memory is tight.
|
|
47
49
|
const MAX_EMBEDDING_CONTEXTS = 4;
|
|
48
50
|
const TARGET_CORES_PER_EMBEDDING_CONTEXT = 4;
|
|
51
|
+
const LOW_MEMORY_WINDOWS_THRESHOLD_BYTES = 24 * 1024 * 1024 * 1024;
|
|
52
|
+
const LOW_MEMORY_WINDOWS_CONTEXTS = 1;
|
|
53
|
+
const DEFAULT_EMBEDDING_CONTEXT_SIZE = 2_048;
|
|
54
|
+
|
|
55
|
+
function resolveEmbeddingContextPoolOverride(
|
|
56
|
+
env: NodeJS.ProcessEnv = process.env
|
|
57
|
+
): number | undefined {
|
|
58
|
+
const raw = env.GNO_EMBED_CONTEXTS;
|
|
59
|
+
if (!raw) {
|
|
60
|
+
return undefined;
|
|
61
|
+
}
|
|
62
|
+
const parsed = Number.parseInt(raw, 10);
|
|
63
|
+
if (!(Number.isFinite(parsed) && parsed > 0)) {
|
|
64
|
+
return undefined;
|
|
65
|
+
}
|
|
66
|
+
return Math.max(1, Math.min(MAX_EMBEDDING_CONTEXTS, parsed));
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
export function resolveEmbeddingContextPoolSize(options: {
|
|
70
|
+
gpu: Llama["gpu"];
|
|
71
|
+
cpuMathCores: number;
|
|
72
|
+
env?: NodeJS.ProcessEnv;
|
|
73
|
+
platformName?: NodeJS.Platform;
|
|
74
|
+
totalMemoryBytes?: number;
|
|
75
|
+
}): number {
|
|
76
|
+
if (options.gpu !== false) {
|
|
77
|
+
return 1;
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
const override = resolveEmbeddingContextPoolOverride(options.env);
|
|
81
|
+
if (override !== undefined) {
|
|
82
|
+
return override;
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
const platformName = options.platformName ?? platform();
|
|
86
|
+
const totalMemoryBytes = options.totalMemoryBytes ?? totalmem();
|
|
87
|
+
if (
|
|
88
|
+
platformName === "win32" &&
|
|
89
|
+
totalMemoryBytes <= LOW_MEMORY_WINDOWS_THRESHOLD_BYTES
|
|
90
|
+
) {
|
|
91
|
+
return LOW_MEMORY_WINDOWS_CONTEXTS;
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
const cpuMathCores = Math.max(1, options.cpuMathCores);
|
|
95
|
+
return Math.max(
|
|
96
|
+
1,
|
|
97
|
+
Math.min(
|
|
98
|
+
MAX_EMBEDDING_CONTEXTS,
|
|
99
|
+
Math.ceil(cpuMathCores / TARGET_CORES_PER_EMBEDDING_CONTEXT)
|
|
100
|
+
)
|
|
101
|
+
);
|
|
102
|
+
}
|
|
49
103
|
|
|
50
104
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
51
105
|
// Implementation
|
|
@@ -58,6 +112,7 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
58
112
|
private lifecycleVersion = 0;
|
|
59
113
|
private dims: number | null = null;
|
|
60
114
|
private llamaModel: TokenizingModel | null = null;
|
|
115
|
+
private embeddingContextSize = DEFAULT_EMBEDDING_CONTEXT_SIZE;
|
|
61
116
|
private warnedSingleTruncation = false;
|
|
62
117
|
private warnedBatchTruncation = false;
|
|
63
118
|
private readonly manager: ModelManager;
|
|
@@ -250,18 +305,10 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
250
305
|
}
|
|
251
306
|
|
|
252
307
|
private resolveTargetPoolSize(llama: Llama): number {
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
const cpuMathCores = Math.max(1, llama.cpuMathCores);
|
|
258
|
-
return Math.max(
|
|
259
|
-
1,
|
|
260
|
-
Math.min(
|
|
261
|
-
MAX_EMBEDDING_CONTEXTS,
|
|
262
|
-
Math.ceil(cpuMathCores / TARGET_CORES_PER_EMBEDDING_CONTEXT)
|
|
263
|
-
)
|
|
264
|
-
);
|
|
308
|
+
return resolveEmbeddingContextPoolSize({
|
|
309
|
+
gpu: llama.gpu,
|
|
310
|
+
cpuMathCores: llama.cpuMathCores,
|
|
311
|
+
});
|
|
265
312
|
}
|
|
266
313
|
|
|
267
314
|
private resolveThreadsPerContext(llama: Llama, poolSize: number): number {
|
|
@@ -294,7 +341,12 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
294
341
|
targetPoolSize
|
|
295
342
|
);
|
|
296
343
|
const contextOptions =
|
|
297
|
-
llama.gpu === false
|
|
344
|
+
llama.gpu === false
|
|
345
|
+
? {
|
|
346
|
+
contextSize: this.embeddingContextSize,
|
|
347
|
+
threads: threadsPerContext,
|
|
348
|
+
}
|
|
349
|
+
: { contextSize: this.embeddingContextSize };
|
|
298
350
|
const contexts: LlamaEmbeddingContext[] = [];
|
|
299
351
|
|
|
300
352
|
for (let i = 0; i < targetPoolSize; i += 1) {
|
|
@@ -350,16 +402,20 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
350
402
|
mode: "single" | "batch"
|
|
351
403
|
): LlmResult<{ text: string }> {
|
|
352
404
|
const model = this.llamaModel;
|
|
353
|
-
const rawLimit =
|
|
405
|
+
const modelLimit =
|
|
354
406
|
typeof model?.trainContextSize === "number" &&
|
|
355
407
|
Number.isFinite(model.trainContextSize) &&
|
|
356
408
|
model.trainContextSize > 0
|
|
357
409
|
? Math.floor(model.trainContextSize)
|
|
358
410
|
: undefined;
|
|
359
|
-
if (!model
|
|
411
|
+
if (!model) {
|
|
360
412
|
return { ok: true, value: { text } };
|
|
361
413
|
}
|
|
362
414
|
|
|
415
|
+
const rawLimit =
|
|
416
|
+
modelLimit === undefined
|
|
417
|
+
? this.embeddingContextSize
|
|
418
|
+
: Math.min(modelLimit, this.embeddingContextSize);
|
|
363
419
|
const limit = Math.max(1, rawLimit - 4);
|
|
364
420
|
try {
|
|
365
421
|
const tokens = model.tokenize(text);
|
|
package/src/llm/nodeLlamaCpp/lifecycle.ts
CHANGED
|
@@ -5,6 +5,10 @@
|
|
|
5
5
|
* @module src/llm/nodeLlamaCpp/lifecycle
|
|
6
6
|
*/
|
|
7
7
|
|
|
8
|
+
import type { LlamaOptions } from "node-llama-cpp";
|
|
9
|
+
|
|
10
|
+
import { platform } from "node:os";
|
|
11
|
+
|
|
8
12
|
import type { ModelConfig } from "../../config/types";
|
|
9
13
|
import type { LlmResult, LoadedModel, ModelType } from "../types";
|
|
10
14
|
|
|
@@ -17,6 +21,12 @@ import { loadFailedError, outOfMemoryError, timeoutError } from "../errors";
|
|
|
17
21
|
type Llama = Awaited<ReturnType<typeof import("node-llama-cpp").getLlama>>;
|
|
18
22
|
type LlamaModel = Awaited<ReturnType<Llama["loadModel"]>>;
|
|
19
23
|
export type LlamaGpuMode = "auto" | "metal" | "vulkan" | "cuda" | false;
|
|
24
|
+
export type LlamaBuildMode = "never" | "autoAttempt";
|
|
25
|
+
|
|
26
|
+
type LlamaInitOptions = LlamaOptions & {
|
|
27
|
+
build: LlamaBuildMode;
|
|
28
|
+
gpu: LlamaGpuMode;
|
|
29
|
+
};
|
|
20
30
|
|
|
21
31
|
interface CachedModel {
|
|
22
32
|
uri: string;
|
|
@@ -26,7 +36,11 @@ interface CachedModel {
|
|
|
26
36
|
}
|
|
27
37
|
|
|
28
38
|
let invalidGpuModeWarned = false;
|
|
39
|
+
let invalidBuildModeWarned = false;
|
|
29
40
|
let gpuFallbackWarned = false;
|
|
41
|
+
let backendTimeoutWarned = false;
|
|
42
|
+
|
|
43
|
+
const DEFAULT_BACKEND_INIT_TIMEOUT_MS = 30_000;
|
|
30
44
|
|
|
31
45
|
export function resolveLlamaGpuMode(
|
|
32
46
|
env: NodeJS.ProcessEnv = process.env
|
|
@@ -59,6 +73,56 @@ export function resolveLlamaGpuMode(
|
|
|
59
73
|
return "auto";
|
|
60
74
|
}
|
|
61
75
|
|
|
76
|
+
export function resolveLlamaBuildMode(
|
|
77
|
+
env: NodeJS.ProcessEnv = process.env
|
|
78
|
+
): LlamaBuildMode {
|
|
79
|
+
const raw = (env.GNO_LLAMA_BUILD ?? "never").trim().toLowerCase();
|
|
80
|
+
if (
|
|
81
|
+
!raw ||
|
|
82
|
+
raw === "never" ||
|
|
83
|
+
raw === "prebuilt" ||
|
|
84
|
+
raw === "prebuilt-only"
|
|
85
|
+
) {
|
|
86
|
+
return "never";
|
|
87
|
+
}
|
|
88
|
+
if (
|
|
89
|
+
raw === "autoattempt" ||
|
|
90
|
+
raw === "auto-attempt" ||
|
|
91
|
+
raw === "source" ||
|
|
92
|
+
raw === "build"
|
|
93
|
+
) {
|
|
94
|
+
return "autoAttempt";
|
|
95
|
+
}
|
|
96
|
+
if (!invalidBuildModeWarned) {
|
|
97
|
+
invalidBuildModeWarned = true;
|
|
98
|
+
console.warn(`[llama] Invalid GNO_LLAMA_BUILD value "${raw}", using never`);
|
|
99
|
+
}
|
|
100
|
+
return "never";
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
export function resolveLlamaBackendInitTimeoutMs(
|
|
104
|
+
env: NodeJS.ProcessEnv = process.env
|
|
105
|
+
): number {
|
|
106
|
+
const raw = env.GNO_LLAMA_INIT_TIMEOUT_MS;
|
|
107
|
+
if (!raw) {
|
|
108
|
+
return DEFAULT_BACKEND_INIT_TIMEOUT_MS;
|
|
109
|
+
}
|
|
110
|
+
const parsed = Number.parseInt(raw, 10);
|
|
111
|
+
return Number.isFinite(parsed) && parsed > 0
|
|
112
|
+
? parsed
|
|
113
|
+
: DEFAULT_BACKEND_INIT_TIMEOUT_MS;
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
export function shouldRetryLlamaWithCpu(
|
|
117
|
+
gpu: LlamaGpuMode,
|
|
118
|
+
platformName = platform()
|
|
119
|
+
): boolean {
|
|
120
|
+
if (gpu === false) {
|
|
121
|
+
return false;
|
|
122
|
+
}
|
|
123
|
+
return gpu !== "auto" || platformName === "win32";
|
|
124
|
+
}
|
|
125
|
+
|
|
62
126
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
63
127
|
// ModelManager
|
|
64
128
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
@@ -84,15 +148,21 @@ export class ModelManager {
|
|
|
84
148
|
if (!this.llama) {
|
|
85
149
|
const { getLlama, LlamaLogLevel } = await import("node-llama-cpp");
|
|
86
150
|
const gpu = resolveLlamaGpuMode();
|
|
151
|
+
const build = resolveLlamaBuildMode();
|
|
152
|
+
const timeoutMs = resolveLlamaBackendInitTimeoutMs();
|
|
87
153
|
// Suppress model loading warnings (vocab tokens, pooling type)
|
|
88
154
|
try {
|
|
89
|
-
this.llama = await
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
155
|
+
this.llama = await this.getLlamaWithTimeout(
|
|
156
|
+
getLlama,
|
|
157
|
+
{
|
|
158
|
+
build,
|
|
159
|
+
gpu,
|
|
160
|
+
logLevel: LlamaLogLevel.error,
|
|
161
|
+
},
|
|
162
|
+
timeoutMs
|
|
163
|
+
);
|
|
94
164
|
} catch (error) {
|
|
95
|
-
if (gpu
|
|
165
|
+
if (!shouldRetryLlamaWithCpu(gpu)) {
|
|
96
166
|
throw error;
|
|
97
167
|
}
|
|
98
168
|
if (!gpuFallbackWarned) {
|
|
@@ -103,16 +173,48 @@ export class ModelManager {
|
|
|
103
173
|
}`
|
|
104
174
|
);
|
|
105
175
|
}
|
|
106
|
-
this.llama = await
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
176
|
+
this.llama = await this.getLlamaWithTimeout(
|
|
177
|
+
getLlama,
|
|
178
|
+
{
|
|
179
|
+
build,
|
|
180
|
+
gpu: false,
|
|
181
|
+
logLevel: LlamaLogLevel.error,
|
|
182
|
+
},
|
|
183
|
+
timeoutMs
|
|
184
|
+
);
|
|
111
185
|
}
|
|
112
186
|
}
|
|
113
187
|
return this.llama;
|
|
114
188
|
}
|
|
115
189
|
|
|
190
|
+
private async getLlamaWithTimeout(
|
|
191
|
+
getLlama: (options: LlamaInitOptions) => Promise<Llama>,
|
|
192
|
+
options: LlamaInitOptions,
|
|
193
|
+
timeoutMs: number
|
|
194
|
+
): Promise<Llama> {
|
|
195
|
+
let timeoutId: ReturnType<typeof setTimeout> | null = null;
|
|
196
|
+
try {
|
|
197
|
+
return await Promise.race([
|
|
198
|
+
getLlama(options),
|
|
199
|
+
new Promise<never>((_, reject) => {
|
|
200
|
+
timeoutId = setTimeout(() => {
|
|
201
|
+
if (!backendTimeoutWarned) {
|
|
202
|
+
backendTimeoutWarned = true;
|
|
203
|
+
console.warn(
|
|
204
|
+
`[llama] Backend initialization timed out after ${timeoutMs}ms`
|
|
205
|
+
);
|
|
206
|
+
}
|
|
207
|
+
reject(new Error(`Backend init timeout after ${timeoutMs}ms`));
|
|
208
|
+
}, timeoutMs);
|
|
209
|
+
}),
|
|
210
|
+
]);
|
|
211
|
+
} finally {
|
|
212
|
+
if (timeoutId) {
|
|
213
|
+
clearTimeout(timeoutId);
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
}
|
|
217
|
+
|
|
116
218
|
/**
|
|
117
219
|
* Load a model by path.
|
|
118
220
|
* Uses caching, inflight deduplication, and TTL-based disposal.
|