@gmickel/gno 0.27.3 → 0.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@gmickel/gno",
3
- "version": "0.27.3",
3
+ "version": "0.28.0",
4
4
  "description": "Local semantic search for your documents. Index Markdown, PDF, and Office files with hybrid BM25 + vector search.",
5
5
  "keywords": [
6
6
  "embeddings",
@@ -24,23 +24,32 @@ type LlamaEmbeddingContext = Awaited<
24
24
  ReturnType<LlamaModel["createEmbeddingContext"]>
25
25
  >;
26
26
 
27
+ type Llama = Awaited<ReturnType<typeof import("node-llama-cpp").getLlama>>;
28
+
29
+ interface EmbeddingWorker {
30
+ context: LlamaEmbeddingContext;
31
+ pending: number;
32
+ }
33
+
27
34
  // ─────────────────────────────────────────────────────────────────────────────
28
35
  // Constants
29
36
  // ─────────────────────────────────────────────────────────────────────────────
30
37
 
31
- // Max concurrent embedding operations per batch to avoid overwhelming the context.
32
- // node-llama-cpp contexts may not handle high concurrency well; this provides
33
- // a safe default while still allowing parallelism within chunks.
34
- const MAX_CONCURRENT_EMBEDDINGS = 16;
38
+ // Aim for a small pool so CPU-only runs can exploit parallel contexts without
39
+ // multiplying RAM usage too aggressively. Additional contexts fall back
40
+ // gracefully if memory is tight.
41
+ const MAX_EMBEDDING_CONTEXTS = 4;
42
+ const TARGET_CORES_PER_EMBEDDING_CONTEXT = 4;
35
43
 
36
44
  // ─────────────────────────────────────────────────────────────────────────────
37
45
  // Implementation
38
46
  // ─────────────────────────────────────────────────────────────────────────────
39
47
 
40
48
  export class NodeLlamaCppEmbedding implements EmbeddingPort {
41
- private context: LlamaEmbeddingContext | null = null;
42
- private contextPromise: Promise<LlmResult<LlamaEmbeddingContext>> | null =
49
+ private workers: EmbeddingWorker[] = [];
50
+ private contextsPromise: Promise<LlmResult<LlamaEmbeddingContext[]>> | null =
43
51
  null;
52
+ private lifecycleVersion = 0;
44
53
  private dims: number | null = null;
45
54
  private readonly manager: ModelManager;
46
55
  readonly modelUri: string;
@@ -53,21 +62,23 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
53
62
  }
54
63
 
55
64
  async init(): Promise<LlmResult<void>> {
56
- const ctx = await this.getContext();
57
- if (!ctx.ok) {
58
- return ctx;
65
+ const contexts = await this.getContexts();
66
+ if (!contexts.ok) {
67
+ return contexts;
59
68
  }
60
69
  return { ok: true, value: undefined };
61
70
  }
62
71
 
63
72
  async embed(text: string): Promise<LlmResult<number[]>> {
64
- const ctx = await this.getContext();
65
- if (!ctx.ok) {
66
- return ctx;
73
+ const contexts = await this.getContexts();
74
+ if (!contexts.ok) {
75
+ return contexts;
67
76
  }
68
77
 
69
78
  try {
70
- const embedding = await ctx.value.getEmbeddingFor(text);
79
+ const embedding = await this.runOnWorker((worker) =>
80
+ worker.context.getEmbeddingFor(text)
81
+ );
71
82
  const vector = Array.from(embedding.vector) as number[];
72
83
 
73
84
  // Cache dimensions on first call
@@ -82,9 +93,9 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
82
93
  }
83
94
 
84
95
  async embedBatch(texts: string[]): Promise<LlmResult<number[][]>> {
85
- const ctx = await this.getContext();
86
- if (!ctx.ok) {
87
- return ctx;
96
+ const contexts = await this.getContexts();
97
+ if (!contexts.ok) {
98
+ return contexts;
88
99
  }
89
100
 
90
101
  if (texts.length === 0) {
@@ -92,39 +103,40 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
92
103
  }
93
104
 
94
105
  try {
95
- // Process in chunks to avoid overwhelming the embedding context.
96
- // node-llama-cpp v3.x only exposes getEmbeddingFor (single text), not a native
97
- // batch method. We use allSettled within chunks to ensure all in-flight ops
98
- // complete before returning (prevents orphaned operations on early failure).
99
- const allResults: number[][] = [];
100
-
101
- for (let i = 0; i < texts.length; i += MAX_CONCURRENT_EMBEDDINGS) {
102
- const chunk = texts.slice(i, i + MAX_CONCURRENT_EMBEDDINGS);
103
- const settled = await Promise.allSettled(
104
- chunk.map((text) => ctx.value.getEmbeddingFor(text))
105
- );
106
-
107
- // Check for any failures in this chunk
108
- const firstRejection = settled.find(
109
- (r): r is PromiseRejectedResult => r.status === "rejected"
110
- );
111
- if (firstRejection) {
112
- return {
113
- ok: false,
114
- error: inferenceFailedError(this.modelUri, firstRejection.reason),
115
- };
116
- }
117
-
118
- // Extract results from this chunk (cast safe after rejection check)
119
- const chunkResults = (
120
- settled as Array<
121
- PromiseFulfilledResult<
122
- Awaited<ReturnType<typeof ctx.value.getEmbeddingFor>>
123
- >
124
- >
125
- ).map((r) => Array.from(r.value.vector) as number[]);
126
-
127
- allResults.push(...chunkResults);
106
+ const allResults = Array.from(
107
+ { length: texts.length },
108
+ () => [] as number[]
109
+ );
110
+ let nextIndex = 0;
111
+
112
+ const settled = await Promise.allSettled(
113
+ this.workers.map(async (worker) => {
114
+ while (true) {
115
+ const index = nextIndex;
116
+ nextIndex += 1;
117
+ if (index >= texts.length) {
118
+ return;
119
+ }
120
+
121
+ const embedding = await this.runOnSpecificWorker(
122
+ worker,
123
+ (current) =>
124
+ current.context.getEmbeddingFor(texts[index] as string)
125
+ );
126
+ allResults[index] = Array.from(embedding.vector) as number[];
127
+ }
128
+ })
129
+ );
130
+
131
+ const firstRejection = settled.find(
132
+ (result): result is PromiseRejectedResult =>
133
+ result.status === "rejected"
134
+ );
135
+ if (firstRejection) {
136
+ return {
137
+ ok: false,
138
+ error: inferenceFailedError(this.modelUri, firstRejection.reason),
139
+ };
128
140
  }
129
141
 
130
142
  // Cache dimensions from first result
@@ -147,15 +159,17 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
147
159
  }
148
160
 
149
161
  async dispose(): Promise<void> {
150
- // Clear promise first to prevent reuse of disposed context
151
- this.contextPromise = null;
152
- if (this.context) {
162
+ this.lifecycleVersion += 1;
163
+ this.contextsPromise = null;
164
+ const workers = this.workers;
165
+ this.workers = [];
166
+
167
+ for (const worker of workers) {
153
168
  try {
154
- await this.context.dispose();
169
+ await worker.context.dispose();
155
170
  } catch {
156
171
  // Ignore disposal errors
157
172
  }
158
- this.context = null;
159
173
  }
160
174
  }
161
175
 
@@ -163,46 +177,147 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
163
177
  // Private
164
178
  // ───────────────────────────────────────────────────────────────────────────
165
179
 
166
- private getContext(): Promise<LlmResult<LlamaEmbeddingContext>> {
167
- // Return cached context
168
- if (this.context) {
169
- return Promise.resolve({ ok: true, value: this.context });
180
+ private async runOnWorker<T>(
181
+ task: (worker: EmbeddingWorker) => Promise<T>
182
+ ): Promise<T> {
183
+ const worker = this.getLeastBusyWorker();
184
+ return this.runOnSpecificWorker(worker, task);
185
+ }
186
+
187
+ private async runOnSpecificWorker<T>(
188
+ worker: EmbeddingWorker,
189
+ task: (worker: EmbeddingWorker) => Promise<T>
190
+ ): Promise<T> {
191
+ worker.pending += 1;
192
+ try {
193
+ return await task(worker);
194
+ } finally {
195
+ worker.pending -= 1;
196
+ }
197
+ }
198
+
199
+ private getLeastBusyWorker(): EmbeddingWorker {
200
+ const firstWorker = this.workers[0];
201
+ if (!firstWorker) {
202
+ throw new Error("Embedding context not initialized");
203
+ }
204
+
205
+ let bestWorker = firstWorker;
206
+ for (const worker of this.workers) {
207
+ if (worker.pending < bestWorker.pending) {
208
+ bestWorker = worker;
209
+ }
210
+ }
211
+ return bestWorker;
212
+ }
213
+
214
+ private getContexts(): Promise<LlmResult<LlamaEmbeddingContext[]>> {
215
+ if (this.workers.length > 0) {
216
+ return Promise.resolve({
217
+ ok: true,
218
+ value: this.workers.map((worker) => worker.context),
219
+ });
170
220
  }
171
221
 
172
- // Reuse in-flight promise to prevent concurrent context creation
173
- if (this.contextPromise) {
174
- return this.contextPromise;
222
+ if (this.contextsPromise) {
223
+ return this.contextsPromise;
175
224
  }
176
225
 
177
- this.contextPromise = this.createContext();
178
- return this.contextPromise;
226
+ this.contextsPromise = this.createContexts();
227
+ return this.contextsPromise;
179
228
  }
180
229
 
181
- private async createContext(): Promise<LlmResult<LlamaEmbeddingContext>> {
230
+ private resolveTargetPoolSize(llama: Llama): number {
231
+ if (llama.gpu !== false) {
232
+ return 1;
233
+ }
234
+
235
+ const cpuMathCores = Math.max(1, llama.cpuMathCores);
236
+ return Math.max(
237
+ 1,
238
+ Math.min(
239
+ MAX_EMBEDDING_CONTEXTS,
240
+ Math.ceil(cpuMathCores / TARGET_CORES_PER_EMBEDDING_CONTEXT)
241
+ )
242
+ );
243
+ }
244
+
245
+ private resolveThreadsPerContext(llama: Llama, poolSize: number): number {
246
+ if (llama.gpu !== false) {
247
+ return 0;
248
+ }
249
+
250
+ return Math.max(1, Math.floor(Math.max(1, llama.cpuMathCores) / poolSize));
251
+ }
252
+
253
+ private async createContexts(): Promise<LlmResult<LlamaEmbeddingContext[]>> {
182
254
  const model = await this.manager.loadModel(
183
255
  this.modelPath,
184
256
  this.modelUri,
185
257
  "embed"
186
258
  );
187
259
  if (!model.ok) {
188
- this.contextPromise = null; // Allow retry
260
+ this.contextsPromise = null;
189
261
  return model;
190
262
  }
191
263
 
192
264
  try {
193
- // Cast to access createEmbeddingContext
194
265
  const llamaModel = model.value.model as LlamaModel;
195
- this.context = await llamaModel.createEmbeddingContext();
266
+ const llama = await this.manager.getLlama();
267
+ const lifecycleVersion = this.lifecycleVersion;
268
+ const targetPoolSize = this.resolveTargetPoolSize(llama);
269
+ const threadsPerContext = this.resolveThreadsPerContext(
270
+ llama,
271
+ targetPoolSize
272
+ );
273
+ const contextOptions =
274
+ llama.gpu === false ? { threads: threadsPerContext } : undefined;
275
+ const contexts: LlamaEmbeddingContext[] = [];
276
+
277
+ for (let i = 0; i < targetPoolSize; i += 1) {
278
+ try {
279
+ const context =
280
+ await llamaModel.createEmbeddingContext(contextOptions);
281
+ contexts.push(context);
282
+ } catch (error) {
283
+ if (contexts.length === 0) {
284
+ this.contextsPromise = null;
285
+ return {
286
+ ok: false,
287
+ error: inferenceFailedError(this.modelUri, error),
288
+ };
289
+ }
290
+ break;
291
+ }
292
+ }
293
+
294
+ if (lifecycleVersion !== this.lifecycleVersion) {
295
+ for (const context of contexts) {
296
+ try {
297
+ await context.dispose();
298
+ } catch {
299
+ // Ignore disposal errors
300
+ }
301
+ }
302
+ return {
303
+ ok: false,
304
+ error: inferenceFailedError(
305
+ this.modelUri,
306
+ new Error("Embedding context disposed during initialization")
307
+ ),
308
+ };
309
+ }
310
+
311
+ this.workers = contexts.map((context) => ({ context, pending: 0 }));
196
312
 
197
- // Cache dimensions from model (available without running embed)
198
313
  const size = llamaModel.embeddingVectorSize;
199
314
  if (this.dims === null && typeof size === "number" && size > 0) {
200
315
  this.dims = size;
201
316
  }
202
317
 
203
- return { ok: true, value: this.context };
318
+ return { ok: true, value: contexts };
204
319
  } catch (e) {
205
- this.contextPromise = null; // Allow retry
320
+ this.contextsPromise = null;
206
321
  return { ok: false, error: inferenceFailedError(this.modelUri, e) };
207
322
  }
208
323
  }