@gmickel/gno 0.27.3 → 0.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/llm/nodeLlamaCpp/embedding.ts +185 -70
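package/package.json
CHANGED
(hunk not captured in this extract; +1 -1 per the summary above)
package/src/llm/nodeLlamaCpp/embedding.ts
CHANGED
|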
@@ -24,23 +24,32 @@ type LlamaEmbeddingContext = Awaited<
|
|
|
24
24
|
ReturnType<LlamaModel["createEmbeddingContext"]>
|
|
25
25
|
>;
|
|
26
26
|
|
|
27
|
+
type Llama = Awaited<ReturnType<typeof import("node-llama-cpp").getLlama>>;
|
|
28
|
+
|
|
29
|
+
interface EmbeddingWorker {
|
|
30
|
+
context: LlamaEmbeddingContext;
|
|
31
|
+
pending: number;
|
|
32
|
+
}
|
|
33
|
+
|
|
27
34
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
28
35
|
// Constants
|
|
29
36
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
30
37
|
|
|
31
|
-
//
|
|
32
|
-
//
|
|
33
|
-
//
|
|
34
|
-
const
|
|
38
|
+
// Aim for a small pool so CPU-only runs can exploit parallel contexts without
|
|
39
|
+
// multiplying RAM usage too aggressively. Additional contexts fall back
|
|
40
|
+
// gracefully if memory is tight.
|
|
41
|
+
const MAX_EMBEDDING_CONTEXTS = 4;
|
|
42
|
+
const TARGET_CORES_PER_EMBEDDING_CONTEXT = 4;
|
|
35
43
|
|
|
36
44
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
37
45
|
// Implementation
|
|
38
46
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
39
47
|
|
|
40
48
|
export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
41
|
-
private
|
|
42
|
-
private
|
|
49
|
+
private workers: EmbeddingWorker[] = [];
|
|
50
|
+
private contextsPromise: Promise<LlmResult<LlamaEmbeddingContext[]>> | null =
|
|
43
51
|
null;
|
|
52
|
+
private lifecycleVersion = 0;
|
|
44
53
|
private dims: number | null = null;
|
|
45
54
|
private readonly manager: ModelManager;
|
|
46
55
|
readonly modelUri: string;
|
|
@@ -53,21 +62,23 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
53
62
|
}
|
|
54
63
|
|
|
55
64
|
async init(): Promise<LlmResult<void>> {
|
|
56
|
-
const
|
|
57
|
-
if (!
|
|
58
|
-
return
|
|
65
|
+
const contexts = await this.getContexts();
|
|
66
|
+
if (!contexts.ok) {
|
|
67
|
+
return contexts;
|
|
59
68
|
}
|
|
60
69
|
return { ok: true, value: undefined };
|
|
61
70
|
}
|
|
62
71
|
|
|
63
72
|
async embed(text: string): Promise<LlmResult<number[]>> {
|
|
64
|
-
const
|
|
65
|
-
if (!
|
|
66
|
-
return
|
|
73
|
+
const contexts = await this.getContexts();
|
|
74
|
+
if (!contexts.ok) {
|
|
75
|
+
return contexts;
|
|
67
76
|
}
|
|
68
77
|
|
|
69
78
|
try {
|
|
70
|
-
const embedding = await
|
|
79
|
+
const embedding = await this.runOnWorker((worker) =>
|
|
80
|
+
worker.context.getEmbeddingFor(text)
|
|
81
|
+
);
|
|
71
82
|
const vector = Array.from(embedding.vector) as number[];
|
|
72
83
|
|
|
73
84
|
// Cache dimensions on first call
|
|
@@ -82,9 +93,9 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
82
93
|
}
|
|
83
94
|
|
|
84
95
|
async embedBatch(texts: string[]): Promise<LlmResult<number[][]>> {
|
|
85
|
-
const
|
|
86
|
-
if (!
|
|
87
|
-
return
|
|
96
|
+
const contexts = await this.getContexts();
|
|
97
|
+
if (!contexts.ok) {
|
|
98
|
+
return contexts;
|
|
88
99
|
}
|
|
89
100
|
|
|
90
101
|
if (texts.length === 0) {
|
|
@@ -92,39 +103,40 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
92
103
|
}
|
|
93
104
|
|
|
94
105
|
try {
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
106
|
+
const allResults = Array.from(
|
|
107
|
+
{ length: texts.length },
|
|
108
|
+
() => [] as number[]
|
|
109
|
+
);
|
|
110
|
+
let nextIndex = 0;
|
|
111
|
+
|
|
112
|
+
const settled = await Promise.allSettled(
|
|
113
|
+
this.workers.map(async (worker) => {
|
|
114
|
+
while (true) {
|
|
115
|
+
const index = nextIndex;
|
|
116
|
+
nextIndex += 1;
|
|
117
|
+
if (index >= texts.length) {
|
|
118
|
+
return;
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
const embedding = await this.runOnSpecificWorker(
|
|
122
|
+
worker,
|
|
123
|
+
(current) =>
|
|
124
|
+
current.context.getEmbeddingFor(texts[index] as string)
|
|
125
|
+
);
|
|
126
|
+
allResults[index] = Array.from(embedding.vector) as number[];
|
|
127
|
+
}
|
|
128
|
+
})
|
|
129
|
+
);
|
|
130
|
+
|
|
131
|
+
const firstRejection = settled.find(
|
|
132
|
+
(result): result is PromiseRejectedResult =>
|
|
133
|
+
result.status === "rejected"
|
|
134
|
+
);
|
|
135
|
+
if (firstRejection) {
|
|
136
|
+
return {
|
|
137
|
+
ok: false,
|
|
138
|
+
error: inferenceFailedError(this.modelUri, firstRejection.reason),
|
|
139
|
+
};
|
|
128
140
|
}
|
|
129
141
|
|
|
130
142
|
// Cache dimensions from first result
|
|
@@ -147,15 +159,17 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
147
159
|
}
|
|
148
160
|
|
|
149
161
|
async dispose(): Promise<void> {
|
|
150
|
-
|
|
151
|
-
this.
|
|
152
|
-
|
|
162
|
+
this.lifecycleVersion += 1;
|
|
163
|
+
this.contextsPromise = null;
|
|
164
|
+
const workers = this.workers;
|
|
165
|
+
this.workers = [];
|
|
166
|
+
|
|
167
|
+
for (const worker of workers) {
|
|
153
168
|
try {
|
|
154
|
-
await
|
|
169
|
+
await worker.context.dispose();
|
|
155
170
|
} catch {
|
|
156
171
|
// Ignore disposal errors
|
|
157
172
|
}
|
|
158
|
-
this.context = null;
|
|
159
173
|
}
|
|
160
174
|
}
|
|
161
175
|
|
|
@@ -163,46 +177,147 @@ export class NodeLlamaCppEmbedding implements EmbeddingPort {
|
|
|
163
177
|
// Private
|
|
164
178
|
// ───────────────────────────────────────────────────────────────────────────
|
|
165
179
|
|
|
166
|
-
private
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
180
|
+
private async runOnWorker<T>(
|
|
181
|
+
task: (worker: EmbeddingWorker) => Promise<T>
|
|
182
|
+
): Promise<T> {
|
|
183
|
+
const worker = this.getLeastBusyWorker();
|
|
184
|
+
return this.runOnSpecificWorker(worker, task);
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
private async runOnSpecificWorker<T>(
|
|
188
|
+
worker: EmbeddingWorker,
|
|
189
|
+
task: (worker: EmbeddingWorker) => Promise<T>
|
|
190
|
+
): Promise<T> {
|
|
191
|
+
worker.pending += 1;
|
|
192
|
+
try {
|
|
193
|
+
return await task(worker);
|
|
194
|
+
} finally {
|
|
195
|
+
worker.pending -= 1;
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
private getLeastBusyWorker(): EmbeddingWorker {
|
|
200
|
+
const firstWorker = this.workers[0];
|
|
201
|
+
if (!firstWorker) {
|
|
202
|
+
throw new Error("Embedding context not initialized");
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
let bestWorker = firstWorker;
|
|
206
|
+
for (const worker of this.workers) {
|
|
207
|
+
if (worker.pending < bestWorker.pending) {
|
|
208
|
+
bestWorker = worker;
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
return bestWorker;
|
|
212
|
+
}
|
|
213
|
+
|
|
214
|
+
private getContexts(): Promise<LlmResult<LlamaEmbeddingContext[]>> {
|
|
215
|
+
if (this.workers.length > 0) {
|
|
216
|
+
return Promise.resolve({
|
|
217
|
+
ok: true,
|
|
218
|
+
value: this.workers.map((worker) => worker.context),
|
|
219
|
+
});
|
|
170
220
|
}
|
|
171
221
|
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
return this.contextPromise;
|
|
222
|
+
if (this.contextsPromise) {
|
|
223
|
+
return this.contextsPromise;
|
|
175
224
|
}
|
|
176
225
|
|
|
177
|
-
this.
|
|
178
|
-
return this.
|
|
226
|
+
this.contextsPromise = this.createContexts();
|
|
227
|
+
return this.contextsPromise;
|
|
179
228
|
}
|
|
180
229
|
|
|
181
|
-
private
|
|
230
|
+
private resolveTargetPoolSize(llama: Llama): number {
|
|
231
|
+
if (llama.gpu !== false) {
|
|
232
|
+
return 1;
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
const cpuMathCores = Math.max(1, llama.cpuMathCores);
|
|
236
|
+
return Math.max(
|
|
237
|
+
1,
|
|
238
|
+
Math.min(
|
|
239
|
+
MAX_EMBEDDING_CONTEXTS,
|
|
240
|
+
Math.ceil(cpuMathCores / TARGET_CORES_PER_EMBEDDING_CONTEXT)
|
|
241
|
+
)
|
|
242
|
+
);
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
private resolveThreadsPerContext(llama: Llama, poolSize: number): number {
|
|
246
|
+
if (llama.gpu !== false) {
|
|
247
|
+
return 0;
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
return Math.max(1, Math.floor(Math.max(1, llama.cpuMathCores) / poolSize));
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
private async createContexts(): Promise<LlmResult<LlamaEmbeddingContext[]>> {
|
|
182
254
|
const model = await this.manager.loadModel(
|
|
183
255
|
this.modelPath,
|
|
184
256
|
this.modelUri,
|
|
185
257
|
"embed"
|
|
186
258
|
);
|
|
187
259
|
if (!model.ok) {
|
|
188
|
-
this.
|
|
260
|
+
this.contextsPromise = null;
|
|
189
261
|
return model;
|
|
190
262
|
}
|
|
191
263
|
|
|
192
264
|
try {
|
|
193
|
-
// Cast to access createEmbeddingContext
|
|
194
265
|
const llamaModel = model.value.model as LlamaModel;
|
|
195
|
-
|
|
266
|
+
const llama = await this.manager.getLlama();
|
|
267
|
+
const lifecycleVersion = this.lifecycleVersion;
|
|
268
|
+
const targetPoolSize = this.resolveTargetPoolSize(llama);
|
|
269
|
+
const threadsPerContext = this.resolveThreadsPerContext(
|
|
270
|
+
llama,
|
|
271
|
+
targetPoolSize
|
|
272
|
+
);
|
|
273
|
+
const contextOptions =
|
|
274
|
+
llama.gpu === false ? { threads: threadsPerContext } : undefined;
|
|
275
|
+
const contexts: LlamaEmbeddingContext[] = [];
|
|
276
|
+
|
|
277
|
+
for (let i = 0; i < targetPoolSize; i += 1) {
|
|
278
|
+
try {
|
|
279
|
+
const context =
|
|
280
|
+
await llamaModel.createEmbeddingContext(contextOptions);
|
|
281
|
+
contexts.push(context);
|
|
282
|
+
} catch (error) {
|
|
283
|
+
if (contexts.length === 0) {
|
|
284
|
+
this.contextsPromise = null;
|
|
285
|
+
return {
|
|
286
|
+
ok: false,
|
|
287
|
+
error: inferenceFailedError(this.modelUri, error),
|
|
288
|
+
};
|
|
289
|
+
}
|
|
290
|
+
break;
|
|
291
|
+
}
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
if (lifecycleVersion !== this.lifecycleVersion) {
|
|
295
|
+
for (const context of contexts) {
|
|
296
|
+
try {
|
|
297
|
+
await context.dispose();
|
|
298
|
+
} catch {
|
|
299
|
+
// Ignore disposal errors
|
|
300
|
+
}
|
|
301
|
+
}
|
|
302
|
+
return {
|
|
303
|
+
ok: false,
|
|
304
|
+
error: inferenceFailedError(
|
|
305
|
+
this.modelUri,
|
|
306
|
+
new Error("Embedding context disposed during initialization")
|
|
307
|
+
),
|
|
308
|
+
};
|
|
309
|
+
}
|
|
310
|
+
|
|
311
|
+
this.workers = contexts.map((context) => ({ context, pending: 0 }));
|
|
196
312
|
|
|
197
|
-
// Cache dimensions from model (available without running embed)
|
|
198
313
|
const size = llamaModel.embeddingVectorSize;
|
|
199
314
|
if (this.dims === null && typeof size === "number" && size > 0) {
|
|
200
315
|
this.dims = size;
|
|
201
316
|
}
|
|
202
317
|
|
|
203
|
-
return { ok: true, value:
|
|
318
|
+
return { ok: true, value: contexts };
|
|
204
319
|
} catch (e) {
|
|
205
|
-
this.
|
|
320
|
+
this.contextsPromise = null;
|
|
206
321
|
return { ok: false, error: inferenceFailedError(this.modelUri, e) };
|
|
207
322
|
}
|
|
208
323
|
}
|