@gmickel/gno 1.5.2 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/package.json +5 -2
- package/src/cli/commands/doctor.ts +179 -1
- package/src/cli/commands/embed.ts +217 -242
- package/src/embed/backlog.ts +92 -45
- package/src/embed/fingerprint.ts +37 -0
- package/src/embed/retry.ts +137 -0
- package/src/llm/nodeLlamaCpp/embedding.ts +81 -19
- package/src/sdk/embed.ts +134 -59
- package/src/store/migrations/008-vector-fingerprints.ts +25 -0
- package/src/store/migrations/index.ts +2 -1
- package/src/store/sqlite/adapter.ts +20 -6
- package/src/store/types.ts +1 -0
- package/src/store/vector/freshness.ts +34 -0
- package/src/store/vector/sqlite-vec.ts +5 -2
- package/src/store/vector/stats.ts +20 -2
- package/src/store/vector/types.ts +3 -0
|
@@ -17,11 +17,17 @@ import {
|
|
|
17
17
|
isInitialized,
|
|
18
18
|
loadConfig,
|
|
19
19
|
} from "../../config";
|
|
20
|
-
import {
|
|
20
|
+
import { getEmbeddingFingerprint } from "../../embed/fingerprint";
|
|
21
|
+
import {
|
|
22
|
+
addUniqueSamples,
|
|
23
|
+
chunkRetryKey,
|
|
24
|
+
embedAndStoreBatch,
|
|
25
|
+
MAX_EMBED_CHUNK_ATTEMPTS,
|
|
26
|
+
type EmbedStoreBatchResult,
|
|
27
|
+
} from "../../embed/retry";
|
|
21
28
|
import { LlmAdapter } from "../../llm/nodeLlamaCpp/adapter";
|
|
22
29
|
import { resolveDownloadPolicy } from "../../llm/policy";
|
|
23
30
|
import { resolveModelUri } from "../../llm/registry";
|
|
24
|
-
import { formatDocForEmbedding } from "../../pipeline/contextual";
|
|
25
31
|
import { SqliteAdapter } from "../../store/sqlite/adapter";
|
|
26
32
|
import { err, ok } from "../../store/types";
|
|
27
33
|
import {
|
|
@@ -29,7 +35,6 @@ import {
|
|
|
29
35
|
createVectorIndexPort,
|
|
30
36
|
createVectorStatsPort,
|
|
31
37
|
type VectorIndexPort,
|
|
32
|
-
type VectorRow,
|
|
33
38
|
type VectorStatsPort,
|
|
34
39
|
} from "../../store/vector";
|
|
35
40
|
import { getGlobals } from "../program";
|
|
@@ -92,26 +97,6 @@ function formatDuration(seconds: number): string {
|
|
|
92
97
|
return `${mins}m ${secs.toFixed(0)}s`;
|
|
93
98
|
}
|
|
94
99
|
|
|
95
|
-
function formatLlmFailure(
|
|
96
|
-
error: { message: string; cause?: unknown } | undefined
|
|
97
|
-
): string {
|
|
98
|
-
if (!error) {
|
|
99
|
-
return "Unknown embedding failure";
|
|
100
|
-
}
|
|
101
|
-
const cause =
|
|
102
|
-
error.cause &&
|
|
103
|
-
typeof error.cause === "object" &&
|
|
104
|
-
"message" in error.cause &&
|
|
105
|
-
typeof error.cause.message === "string"
|
|
106
|
-
? error.cause.message
|
|
107
|
-
: typeof error.cause === "string"
|
|
108
|
-
? error.cause
|
|
109
|
-
: "";
|
|
110
|
-
return cause && cause !== error.message
|
|
111
|
-
? `${error.message} - ${cause}`
|
|
112
|
-
: error.message;
|
|
113
|
-
}
|
|
114
|
-
|
|
115
100
|
function isDisposedBatchError(message: string): boolean {
|
|
116
101
|
return message.toLowerCase().includes("object is disposed");
|
|
117
102
|
}
|
|
@@ -168,23 +153,155 @@ async function processBatches(ctx: BatchContext): Promise<BatchResult> {
|
|
|
168
153
|
const errorSamples: string[] = [];
|
|
169
154
|
let suggestion: string | undefined;
|
|
170
155
|
let cursor: Cursor | undefined;
|
|
156
|
+
const retryQueue = new Map<string, { item: BacklogItem; attempts: number }>();
|
|
157
|
+
const embedFingerprint = getEmbeddingFingerprint({
|
|
158
|
+
modelUri: ctx.modelUri,
|
|
159
|
+
dimensions: ctx.vectorIndex.dimensions,
|
|
160
|
+
});
|
|
171
161
|
|
|
172
162
|
const pushErrorSamples = (samples: string[]): void => {
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
163
|
+
addUniqueSamples(errorSamples, samples);
|
|
164
|
+
};
|
|
165
|
+
|
|
166
|
+
const enqueueRetryItems = (items: BacklogItem[], attempts: number): void => {
|
|
167
|
+
for (const item of items) {
|
|
168
|
+
const key = chunkRetryKey(item);
|
|
169
|
+
const existing = retryQueue.get(key);
|
|
170
|
+
retryQueue.set(key, {
|
|
171
|
+
item,
|
|
172
|
+
attempts: Math.max(existing?.attempts ?? 0, attempts),
|
|
173
|
+
});
|
|
174
|
+
}
|
|
175
|
+
};
|
|
176
|
+
|
|
177
|
+
const writeBatchDiagnostics = (
|
|
178
|
+
batch: BacklogItem[],
|
|
179
|
+
result: EmbedStoreBatchResult
|
|
180
|
+
): void => {
|
|
181
|
+
if (ctx.verbose && result.batchFailed) {
|
|
182
|
+
const titles = batch
|
|
183
|
+
.slice(0, 3)
|
|
184
|
+
.map((item) => item.title ?? item.mirrorHash.slice(0, 8))
|
|
185
|
+
.join(", ");
|
|
186
|
+
process.stderr.write(
|
|
187
|
+
`\n[embed] Batch fallback (${batch.length} chunks: ${titles}${batch.length > 3 ? "..." : ""}): ${result.batchError ?? "unknown batch error"}\n`
|
|
188
|
+
);
|
|
189
|
+
}
|
|
190
|
+
if (ctx.verbose && result.errorSamples.length > 0) {
|
|
191
|
+
for (const sample of result.errorSamples) {
|
|
192
|
+
process.stderr.write(`\n[embed] Sample failure: ${sample}\n`);
|
|
176
193
|
}
|
|
177
|
-
|
|
178
|
-
|
|
194
|
+
}
|
|
195
|
+
};
|
|
196
|
+
|
|
197
|
+
const processStoreBatch = async (
|
|
198
|
+
batch: BacklogItem[]
|
|
199
|
+
): Promise<EmbedStoreBatchResult> => {
|
|
200
|
+
let result = await embedAndStoreBatch({
|
|
201
|
+
embedPort: ctx.embedPort,
|
|
202
|
+
vectorIndex: ctx.vectorIndex,
|
|
203
|
+
items: batch,
|
|
204
|
+
modelUri: ctx.modelUri,
|
|
205
|
+
embedFingerprint,
|
|
206
|
+
});
|
|
207
|
+
|
|
208
|
+
if (
|
|
209
|
+
ctx.recreateEmbedPort &&
|
|
210
|
+
result.retryItems.length === batch.length &&
|
|
211
|
+
result.batchError &&
|
|
212
|
+
isDisposedBatchError(result.batchError)
|
|
213
|
+
) {
|
|
214
|
+
if (ctx.verbose) {
|
|
215
|
+
process.stderr.write(
|
|
216
|
+
"\n[embed] Embedding port disposed; recreating model/contexts and retrying batch once\n"
|
|
217
|
+
);
|
|
218
|
+
}
|
|
219
|
+
const recreated = await ctx.recreateEmbedPort();
|
|
220
|
+
if (recreated.ok) {
|
|
221
|
+
ctx.embedPort = recreated.value;
|
|
222
|
+
result = await embedAndStoreBatch({
|
|
223
|
+
embedPort: ctx.embedPort,
|
|
224
|
+
vectorIndex: ctx.vectorIndex,
|
|
225
|
+
items: batch,
|
|
226
|
+
modelUri: ctx.modelUri,
|
|
227
|
+
embedFingerprint,
|
|
228
|
+
});
|
|
229
|
+
if (ctx.verbose && result.embedded > 0) {
|
|
230
|
+
process.stderr.write("\n[embed] Retry after port reset succeeded\n");
|
|
231
|
+
}
|
|
179
232
|
}
|
|
180
233
|
}
|
|
234
|
+
|
|
235
|
+
return result;
|
|
236
|
+
};
|
|
237
|
+
|
|
238
|
+
const renderProgress = (): void => {
|
|
239
|
+
if (!ctx.showProgress) {
|
|
240
|
+
return;
|
|
241
|
+
}
|
|
242
|
+
const embeddedDisplay = Math.min(embedded, ctx.totalToEmbed);
|
|
243
|
+
const completed = Math.min(embedded + errors, ctx.totalToEmbed);
|
|
244
|
+
const pct = (completed / ctx.totalToEmbed) * 100;
|
|
245
|
+
const elapsed = (Date.now() - startTime) / 1000;
|
|
246
|
+
const rate = embedded / Math.max(elapsed, 0.001);
|
|
247
|
+
const eta =
|
|
248
|
+
Math.max(0, ctx.totalToEmbed - completed) / Math.max(rate, 0.001);
|
|
249
|
+
process.stdout.write(
|
|
250
|
+
`\rEmbedding: ${embeddedDisplay.toLocaleString()}/${ctx.totalToEmbed.toLocaleString()} (${pct.toFixed(1)}%) | ${rate.toFixed(1)} chunks/s | ETA ${formatDuration(eta)}`
|
|
251
|
+
);
|
|
252
|
+
};
|
|
253
|
+
|
|
254
|
+
const drainRetryQueue = async (): Promise<number> => {
|
|
255
|
+
if (retryQueue.size === 0) {
|
|
256
|
+
return 0;
|
|
257
|
+
}
|
|
258
|
+
let retryEmbedded = 0;
|
|
259
|
+
const entries = [...retryQueue.values()].filter(
|
|
260
|
+
(entry) => entry.attempts < MAX_EMBED_CHUNK_ATTEMPTS
|
|
261
|
+
);
|
|
262
|
+
for (let idx = 0; idx < entries.length; idx += ctx.batchSize) {
|
|
263
|
+
const slice = entries.slice(idx, idx + ctx.batchSize);
|
|
264
|
+
for (const entry of slice) {
|
|
265
|
+
retryQueue.delete(chunkRetryKey(entry.item));
|
|
266
|
+
entry.attempts += 1;
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
const retryResult = await processStoreBatch(
|
|
270
|
+
slice.map((entry) => entry.item)
|
|
271
|
+
);
|
|
272
|
+
writeBatchDiagnostics(
|
|
273
|
+
slice.map((entry) => entry.item),
|
|
274
|
+
retryResult
|
|
275
|
+
);
|
|
276
|
+
pushErrorSamples(retryResult.errorSamples);
|
|
277
|
+
suggestion ||= retryResult.suggestion;
|
|
278
|
+
embedded += retryResult.embedded;
|
|
279
|
+
errors += retryResult.errors;
|
|
280
|
+
retryEmbedded += retryResult.embedded;
|
|
281
|
+
|
|
282
|
+
const retryByKey = new Set(
|
|
283
|
+
retryResult.retryItems.map((item) => chunkRetryKey(item))
|
|
284
|
+
);
|
|
285
|
+
for (const entry of slice) {
|
|
286
|
+
if (!retryByKey.has(chunkRetryKey(entry.item))) {
|
|
287
|
+
continue;
|
|
288
|
+
}
|
|
289
|
+
if (entry.attempts >= MAX_EMBED_CHUNK_ATTEMPTS) {
|
|
290
|
+
errors += 1;
|
|
291
|
+
} else {
|
|
292
|
+
retryQueue.set(chunkRetryKey(entry.item), entry);
|
|
293
|
+
}
|
|
294
|
+
}
|
|
295
|
+
renderProgress();
|
|
296
|
+
}
|
|
297
|
+
return retryEmbedded;
|
|
181
298
|
};
|
|
182
299
|
|
|
183
300
|
while (embedded + errors < ctx.totalToEmbed) {
|
|
184
301
|
// Get next batch using seek pagination (cursor-based)
|
|
185
302
|
const batchResult = ctx.force
|
|
186
303
|
? await getActiveChunks(ctx.db, ctx.batchSize, cursor, ctx.collection)
|
|
187
|
-
: await ctx.stats.getBacklog(ctx.modelUri, {
|
|
304
|
+
: await ctx.stats.getBacklog(ctx.modelUri, embedFingerprint, {
|
|
188
305
|
limit: ctx.batchSize,
|
|
189
306
|
after: cursor,
|
|
190
307
|
collection: ctx.collection,
|
|
@@ -205,189 +322,30 @@ async function processBatches(ctx: BatchContext): Promise<BatchResult> {
|
|
|
205
322
|
cursor = { mirrorHash: lastItem.mirrorHash, seq: lastItem.seq };
|
|
206
323
|
}
|
|
207
324
|
|
|
208
|
-
|
|
209
|
-
const
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
process.stderr.write(
|
|
220
|
-
"\n[embed] Embedding port disposed; recreating model/contexts and retrying batch once\n"
|
|
221
|
-
);
|
|
222
|
-
}
|
|
223
|
-
const recreated = await ctx.recreateEmbedPort();
|
|
224
|
-
if (recreated.ok) {
|
|
225
|
-
ctx.embedPort = recreated.value;
|
|
226
|
-
const retryResult = await embedTextsWithRecovery(
|
|
227
|
-
ctx.embedPort,
|
|
228
|
-
batch.map((b) =>
|
|
229
|
-
formatDocForEmbedding(b.text, b.title ?? undefined, ctx.modelUri)
|
|
230
|
-
)
|
|
231
|
-
);
|
|
232
|
-
if (retryResult.ok) {
|
|
233
|
-
if (ctx.verbose) {
|
|
234
|
-
process.stderr.write(
|
|
235
|
-
"\n[embed] Retry after port reset succeeded\n"
|
|
236
|
-
);
|
|
237
|
-
}
|
|
238
|
-
pushErrorSamples(retryResult.value.failureSamples);
|
|
239
|
-
suggestion ||= retryResult.value.retrySuggestion;
|
|
240
|
-
|
|
241
|
-
const retryVectors: VectorRow[] = [];
|
|
242
|
-
for (const [idx, item] of batch.entries()) {
|
|
243
|
-
const embedding = retryResult.value.vectors[idx];
|
|
244
|
-
if (!embedding) {
|
|
245
|
-
errors += 1;
|
|
246
|
-
continue;
|
|
247
|
-
}
|
|
248
|
-
retryVectors.push({
|
|
249
|
-
mirrorHash: item.mirrorHash,
|
|
250
|
-
seq: item.seq,
|
|
251
|
-
model: ctx.modelUri,
|
|
252
|
-
embedding: new Float32Array(embedding),
|
|
253
|
-
});
|
|
254
|
-
}
|
|
255
|
-
|
|
256
|
-
if (retryVectors.length === 0) {
|
|
257
|
-
if (ctx.verbose) {
|
|
258
|
-
process.stderr.write(
|
|
259
|
-
"\n[embed] No recoverable embeddings in retry batch\n"
|
|
260
|
-
);
|
|
261
|
-
}
|
|
262
|
-
continue;
|
|
263
|
-
}
|
|
264
|
-
|
|
265
|
-
const retryStoreResult =
|
|
266
|
-
await ctx.vectorIndex.upsertVectors(retryVectors);
|
|
267
|
-
if (!retryStoreResult.ok) {
|
|
268
|
-
if (ctx.verbose) {
|
|
269
|
-
process.stderr.write(
|
|
270
|
-
`\n[embed] Store failed: ${retryStoreResult.error.message}\n`
|
|
271
|
-
);
|
|
272
|
-
}
|
|
273
|
-
pushErrorSamples([retryStoreResult.error.message]);
|
|
274
|
-
suggestion ??=
|
|
275
|
-
"Store write failed. Rerun `gno embed` once more; if it repeats, run `gno doctor` and `gno vec sync`.";
|
|
276
|
-
errors += retryVectors.length;
|
|
277
|
-
continue;
|
|
278
|
-
}
|
|
279
|
-
|
|
280
|
-
embedded += retryVectors.length;
|
|
281
|
-
if (ctx.showProgress) {
|
|
282
|
-
const embeddedDisplay = Math.min(embedded, ctx.totalToEmbed);
|
|
283
|
-
const completed = Math.min(embedded + errors, ctx.totalToEmbed);
|
|
284
|
-
const pct = (completed / ctx.totalToEmbed) * 100;
|
|
285
|
-
const elapsed = (Date.now() - startTime) / 1000;
|
|
286
|
-
const rate = embedded / Math.max(elapsed, 0.001);
|
|
287
|
-
const eta =
|
|
288
|
-
Math.max(0, ctx.totalToEmbed - completed) /
|
|
289
|
-
Math.max(rate, 0.001);
|
|
290
|
-
process.stdout.write(
|
|
291
|
-
`\rEmbedding: ${embeddedDisplay.toLocaleString()}/${ctx.totalToEmbed.toLocaleString()} (${pct.toFixed(1)}%) | ${rate.toFixed(1)} chunks/s | ETA ${formatDuration(eta)}`
|
|
292
|
-
);
|
|
293
|
-
}
|
|
294
|
-
continue;
|
|
295
|
-
}
|
|
296
|
-
}
|
|
297
|
-
}
|
|
298
|
-
|
|
299
|
-
if (ctx.verbose) {
|
|
300
|
-
const err = batchEmbedResult.error;
|
|
301
|
-
const cause = err.cause;
|
|
302
|
-
const causeMsg =
|
|
303
|
-
cause && typeof cause === "object" && "message" in cause
|
|
304
|
-
? (cause as { message: string }).message
|
|
305
|
-
: typeof cause === "string"
|
|
306
|
-
? cause
|
|
307
|
-
: "";
|
|
308
|
-
const titles = batch
|
|
309
|
-
.slice(0, 3)
|
|
310
|
-
.map((b) => b.title ?? b.mirrorHash.slice(0, 8))
|
|
311
|
-
.join(", ");
|
|
312
|
-
process.stderr.write(
|
|
313
|
-
`\n[embed] Batch failed (${batch.length} chunks: ${titles}${batch.length > 3 ? "..." : ""}): ${err.message}${causeMsg ? ` - ${causeMsg}` : ""}\n`
|
|
314
|
-
);
|
|
315
|
-
}
|
|
316
|
-
pushErrorSamples([formattedError]);
|
|
317
|
-
suggestion =
|
|
318
|
-
"Try rerunning the same command. If failures persist, rerun with `gno --verbose embed --batch-size 1` to isolate failing chunks.";
|
|
319
|
-
errors += batch.length;
|
|
320
|
-
continue;
|
|
321
|
-
}
|
|
322
|
-
|
|
323
|
-
if (ctx.verbose && batchEmbedResult.value.batchFailed) {
|
|
324
|
-
const titles = batch
|
|
325
|
-
.slice(0, 3)
|
|
326
|
-
.map((b) => b.title ?? b.mirrorHash.slice(0, 8))
|
|
327
|
-
.join(", ");
|
|
328
|
-
process.stderr.write(
|
|
329
|
-
`\n[embed] Batch fallback (${batch.length} chunks: ${titles}${batch.length > 3 ? "..." : ""}): ${batchEmbedResult.value.batchError ?? "unknown batch error"}\n`
|
|
330
|
-
);
|
|
331
|
-
}
|
|
332
|
-
pushErrorSamples(batchEmbedResult.value.failureSamples);
|
|
333
|
-
suggestion ||= batchEmbedResult.value.retrySuggestion;
|
|
334
|
-
if (ctx.verbose && batchEmbedResult.value.failureSamples.length > 0) {
|
|
335
|
-
for (const sample of batchEmbedResult.value.failureSamples) {
|
|
336
|
-
process.stderr.write(`\n[embed] Sample failure: ${sample}\n`);
|
|
337
|
-
}
|
|
338
|
-
}
|
|
339
|
-
|
|
340
|
-
const vectors: VectorRow[] = [];
|
|
341
|
-
for (const [idx, item] of batch.entries()) {
|
|
342
|
-
const embedding = batchEmbedResult.value.vectors[idx];
|
|
343
|
-
if (!embedding) {
|
|
344
|
-
errors += 1;
|
|
345
|
-
continue;
|
|
346
|
-
}
|
|
347
|
-
vectors.push({
|
|
348
|
-
mirrorHash: item.mirrorHash,
|
|
349
|
-
seq: item.seq,
|
|
350
|
-
model: ctx.modelUri,
|
|
351
|
-
embedding: new Float32Array(embedding),
|
|
352
|
-
});
|
|
325
|
+
const beforeEmbedded = embedded;
|
|
326
|
+
const batchStoreResult = await processStoreBatch(batch);
|
|
327
|
+
writeBatchDiagnostics(batch, batchStoreResult);
|
|
328
|
+
pushErrorSamples(batchStoreResult.errorSamples);
|
|
329
|
+
suggestion ||= batchStoreResult.suggestion;
|
|
330
|
+
embedded += batchStoreResult.embedded;
|
|
331
|
+
errors += batchStoreResult.errors;
|
|
332
|
+
enqueueRetryItems(batchStoreResult.retryItems, 1);
|
|
333
|
+
|
|
334
|
+
if (embedded > beforeEmbedded) {
|
|
335
|
+
await drainRetryQueue();
|
|
353
336
|
}
|
|
354
337
|
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
process.stderr.write("\n[embed] No recoverable embeddings in batch\n");
|
|
358
|
-
}
|
|
359
|
-
continue;
|
|
360
|
-
}
|
|
338
|
+
renderProgress();
|
|
339
|
+
}
|
|
361
340
|
|
|
362
|
-
|
|
363
|
-
if (!storeResult.ok) {
|
|
364
|
-
if (ctx.verbose) {
|
|
365
|
-
process.stderr.write(
|
|
366
|
-
`\n[embed] Store failed: ${storeResult.error.message}\n`
|
|
367
|
-
);
|
|
368
|
-
}
|
|
369
|
-
pushErrorSamples([storeResult.error.message]);
|
|
370
|
-
suggestion ??=
|
|
371
|
-
"Store write failed. Rerun `gno embed` once more; if it repeats, run `gno doctor` and `gno vec sync`.";
|
|
372
|
-
errors += vectors.length;
|
|
373
|
-
continue;
|
|
374
|
-
}
|
|
341
|
+
await drainRetryQueue();
|
|
375
342
|
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
const pct = (completed / ctx.totalToEmbed) * 100;
|
|
383
|
-
const elapsed = (Date.now() - startTime) / 1000;
|
|
384
|
-
const rate = embedded / Math.max(elapsed, 0.001);
|
|
385
|
-
const eta =
|
|
386
|
-
Math.max(0, ctx.totalToEmbed - completed) / Math.max(rate, 0.001);
|
|
387
|
-
process.stdout.write(
|
|
388
|
-
`\rEmbedding: ${embeddedDisplay.toLocaleString()}/${ctx.totalToEmbed.toLocaleString()} (${pct.toFixed(1)}%) | ${rate.toFixed(1)} chunks/s | ETA ${formatDuration(eta)}`
|
|
389
|
-
);
|
|
390
|
-
}
|
|
343
|
+
if (retryQueue.size > 0) {
|
|
344
|
+
errors += retryQueue.size;
|
|
345
|
+
pushErrorSamples(["Some chunks failed after same-run retry attempts"]);
|
|
346
|
+
suggestion ??=
|
|
347
|
+
"Some chunks failed after retry. Rerun `gno --verbose embed --batch-size 1` to isolate failing chunks.";
|
|
348
|
+
retryQueue.clear();
|
|
391
349
|
}
|
|
392
350
|
|
|
393
351
|
if (ctx.showProgress) {
|
|
@@ -488,41 +446,26 @@ export async function embed(options: EmbedOptions = {}): Promise<EmbedResult> {
|
|
|
488
446
|
// Create stats port for backlog detection
|
|
489
447
|
const stats: VectorStatsPort = createVectorStatsPort(db);
|
|
490
448
|
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
}
|
|
499
|
-
|
|
500
|
-
const totalToEmbed = backlogResult.value;
|
|
501
|
-
|
|
502
|
-
if (totalToEmbed === 0) {
|
|
503
|
-
const vecAvailable = await checkVecAvailable(db);
|
|
504
|
-
return {
|
|
505
|
-
success: true,
|
|
506
|
-
embedded: 0,
|
|
507
|
-
errors: 0,
|
|
508
|
-
duration: 0,
|
|
509
|
-
model: modelUri,
|
|
510
|
-
searchAvailable: vecAvailable,
|
|
511
|
-
errorSamples: [],
|
|
512
|
-
};
|
|
513
|
-
}
|
|
449
|
+
let totalToEmbed = 0;
|
|
450
|
+
if (force) {
|
|
451
|
+
const forceCount = await getActiveChunkCount(db, options.collection);
|
|
452
|
+
if (!forceCount.ok) {
|
|
453
|
+
return { success: false, error: forceCount.error.message };
|
|
454
|
+
}
|
|
455
|
+
totalToEmbed = forceCount.value;
|
|
514
456
|
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
457
|
+
if (totalToEmbed === 0 || dryRun) {
|
|
458
|
+
const vecAvailable = await checkVecAvailable(db);
|
|
459
|
+
return {
|
|
460
|
+
success: true,
|
|
461
|
+
embedded: totalToEmbed,
|
|
462
|
+
errors: 0,
|
|
463
|
+
duration: 0,
|
|
464
|
+
model: modelUri,
|
|
465
|
+
searchAvailable: vecAvailable,
|
|
466
|
+
errorSamples: [],
|
|
467
|
+
};
|
|
468
|
+
}
|
|
526
469
|
}
|
|
527
470
|
|
|
528
471
|
// Create LLM adapter and embedding port with auto-download
|
|
@@ -592,6 +535,38 @@ export async function embed(options: EmbedOptions = {}): Promise<EmbedResult> {
|
|
|
592
535
|
}
|
|
593
536
|
vectorIndex = vectorResult.value;
|
|
594
537
|
|
|
538
|
+
if (!force) {
|
|
539
|
+
const embedFingerprint = getEmbeddingFingerprint({
|
|
540
|
+
modelUri,
|
|
541
|
+
dimensions,
|
|
542
|
+
});
|
|
543
|
+
const backlogResult = await stats.countBacklog(
|
|
544
|
+
modelUri,
|
|
545
|
+
embedFingerprint,
|
|
546
|
+
{
|
|
547
|
+
collection: options.collection,
|
|
548
|
+
}
|
|
549
|
+
);
|
|
550
|
+
|
|
551
|
+
if (!backlogResult.ok) {
|
|
552
|
+
return { success: false, error: backlogResult.error.message };
|
|
553
|
+
}
|
|
554
|
+
|
|
555
|
+
totalToEmbed = backlogResult.value;
|
|
556
|
+
|
|
557
|
+
if (totalToEmbed === 0 || dryRun) {
|
|
558
|
+
return {
|
|
559
|
+
success: true,
|
|
560
|
+
embedded: totalToEmbed,
|
|
561
|
+
errors: 0,
|
|
562
|
+
duration: 0,
|
|
563
|
+
model: modelUri,
|
|
564
|
+
searchAvailable: vectorIndex.searchAvailable,
|
|
565
|
+
errorSamples: [],
|
|
566
|
+
};
|
|
567
|
+
}
|
|
568
|
+
}
|
|
569
|
+
|
|
595
570
|
// Process batches
|
|
596
571
|
const result = await processBatches({
|
|
597
572
|
db,
|
package/src/embed/backlog.ts
CHANGED
|
@@ -10,13 +10,16 @@ import type { StoreResult } from "../store/types";
|
|
|
10
10
|
import type {
|
|
11
11
|
BacklogItem,
|
|
12
12
|
VectorIndexPort,
|
|
13
|
-
VectorRow,
|
|
14
13
|
VectorStatsPort,
|
|
15
14
|
} from "../store/vector";
|
|
16
15
|
|
|
17
|
-
import { formatDocForEmbedding } from "../pipeline/contextual";
|
|
18
16
|
import { err, ok } from "../store/types";
|
|
19
|
-
import {
|
|
17
|
+
import { getEmbeddingFingerprint } from "./fingerprint";
|
|
18
|
+
import {
|
|
19
|
+
chunkRetryKey,
|
|
20
|
+
embedAndStoreBatch,
|
|
21
|
+
MAX_EMBED_CHUNK_ATTEMPTS,
|
|
22
|
+
} from "./retry";
|
|
20
23
|
|
|
21
24
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
22
25
|
// Types
|
|
@@ -56,19 +59,86 @@ export async function embedBacklog(
|
|
|
56
59
|
): Promise<StoreResult<EmbedBacklogResult>> {
|
|
57
60
|
const { statsPort, embedPort, vectorIndex, modelUri, collection } = deps;
|
|
58
61
|
const batchSize = deps.batchSize ?? 32;
|
|
62
|
+
const embedFingerprint = getEmbeddingFingerprint({
|
|
63
|
+
modelUri,
|
|
64
|
+
dimensions: vectorIndex.dimensions,
|
|
65
|
+
});
|
|
59
66
|
|
|
60
67
|
let embedded = 0;
|
|
61
68
|
let errors = 0;
|
|
62
69
|
let cursor: Cursor | undefined;
|
|
70
|
+
const retryQueue = new Map<string, { item: BacklogItem; attempts: number }>();
|
|
71
|
+
|
|
72
|
+
const enqueueRetryItems = (items: BacklogItem[], attempts: number): void => {
|
|
73
|
+
for (const item of items) {
|
|
74
|
+
const key = chunkRetryKey(item);
|
|
75
|
+
const existing = retryQueue.get(key);
|
|
76
|
+
retryQueue.set(key, {
|
|
77
|
+
item,
|
|
78
|
+
attempts: Math.max(existing?.attempts ?? 0, attempts),
|
|
79
|
+
});
|
|
80
|
+
}
|
|
81
|
+
};
|
|
82
|
+
|
|
83
|
+
const drainRetryQueue = async (): Promise<number> => {
|
|
84
|
+
if (retryQueue.size === 0) {
|
|
85
|
+
return 0;
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
let retryEmbedded = 0;
|
|
89
|
+
const entries = [...retryQueue.values()].filter(
|
|
90
|
+
(entry) => entry.attempts < MAX_EMBED_CHUNK_ATTEMPTS
|
|
91
|
+
);
|
|
92
|
+
|
|
93
|
+
for (let idx = 0; idx < entries.length; idx += batchSize) {
|
|
94
|
+
const slice = entries.slice(idx, idx + batchSize);
|
|
95
|
+
for (const entry of slice) {
|
|
96
|
+
retryQueue.delete(chunkRetryKey(entry.item));
|
|
97
|
+
entry.attempts += 1;
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
const retryResult = await embedAndStoreBatch({
|
|
101
|
+
embedPort,
|
|
102
|
+
vectorIndex,
|
|
103
|
+
items: slice.map((entry) => entry.item),
|
|
104
|
+
modelUri,
|
|
105
|
+
embedFingerprint,
|
|
106
|
+
});
|
|
107
|
+
|
|
108
|
+
embedded += retryResult.embedded;
|
|
109
|
+
errors += retryResult.errors;
|
|
110
|
+
retryEmbedded += retryResult.embedded;
|
|
111
|
+
|
|
112
|
+
const retryByKey = new Set(
|
|
113
|
+
retryResult.retryItems.map((item) => chunkRetryKey(item))
|
|
114
|
+
);
|
|
115
|
+
for (const entry of slice) {
|
|
116
|
+
if (!retryByKey.has(chunkRetryKey(entry.item))) {
|
|
117
|
+
continue;
|
|
118
|
+
}
|
|
119
|
+
if (entry.attempts >= MAX_EMBED_CHUNK_ATTEMPTS) {
|
|
120
|
+
errors += 1;
|
|
121
|
+
} else {
|
|
122
|
+
retryQueue.set(chunkRetryKey(entry.item), entry);
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
return retryEmbedded;
|
|
128
|
+
};
|
|
63
129
|
|
|
64
130
|
try {
|
|
65
131
|
while (true) {
|
|
66
132
|
// Get next batch using seek pagination
|
|
67
|
-
const batchResult = await statsPort.getBacklog(
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
133
|
+
const batchResult = await statsPort.getBacklog(
|
|
134
|
+
modelUri,
|
|
135
|
+
embedFingerprint,
|
|
136
|
+
{
|
|
137
|
+
limit: batchSize,
|
|
138
|
+
after: cursor,
|
|
139
|
+
collection,
|
|
140
|
+
}
|
|
141
|
+
);
|
|
72
142
|
|
|
73
143
|
if (!batchResult.ok) {
|
|
74
144
|
return err("QUERY_FAILED", batchResult.error.message);
|
|
@@ -85,48 +155,25 @@ export async function embedBacklog(
|
|
|
85
155
|
cursor = { mirrorHash: lastItem.mirrorHash, seq: lastItem.seq };
|
|
86
156
|
}
|
|
87
157
|
|
|
88
|
-
|
|
89
|
-
const
|
|
158
|
+
const beforeEmbedded = embedded;
|
|
159
|
+
const batchStoreResult = await embedAndStoreBatch({
|
|
90
160
|
embedPort,
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
);
|
|
99
|
-
|
|
100
|
-
if (!embedResult.ok) {
|
|
101
|
-
errors += batch.length;
|
|
102
|
-
continue;
|
|
103
|
-
}
|
|
104
|
-
|
|
105
|
-
const vectors: VectorRow[] = [];
|
|
106
|
-
for (const [idx, item] of batch.entries()) {
|
|
107
|
-
const embedding = embedResult.value.vectors[idx];
|
|
108
|
-
if (!embedding) {
|
|
109
|
-
errors += 1;
|
|
110
|
-
continue;
|
|
111
|
-
}
|
|
112
|
-
vectors.push({
|
|
113
|
-
mirrorHash: item.mirrorHash,
|
|
114
|
-
seq: item.seq,
|
|
115
|
-
model: modelUri,
|
|
116
|
-
embedding: new Float32Array(embedding),
|
|
117
|
-
});
|
|
118
|
-
}
|
|
161
|
+
vectorIndex,
|
|
162
|
+
items: batch,
|
|
163
|
+
modelUri,
|
|
164
|
+
embedFingerprint,
|
|
165
|
+
});
|
|
166
|
+
embedded += batchStoreResult.embedded;
|
|
167
|
+
errors += batchStoreResult.errors;
|
|
168
|
+
enqueueRetryItems(batchStoreResult.retryItems, 1);
|
|
119
169
|
|
|
120
|
-
if (
|
|
121
|
-
|
|
122
|
-
if (!storeResult.ok) {
|
|
123
|
-
errors += vectors.length;
|
|
124
|
-
continue;
|
|
125
|
-
}
|
|
126
|
-
embedded += vectors.length;
|
|
170
|
+
if (embedded > beforeEmbedded) {
|
|
171
|
+
await drainRetryQueue();
|
|
127
172
|
}
|
|
128
173
|
}
|
|
129
174
|
|
|
175
|
+
await drainRetryQueue();
|
|
176
|
+
|
|
130
177
|
// Sync vec index once at end if any vec0 writes failed
|
|
131
178
|
let syncError: string | undefined;
|
|
132
179
|
if (vectorIndex.vecDirty) {
|