@gmickel/gno 1.6.0 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/package.json +3 -2
- package/src/cli/commands/doctor.ts +179 -1
- package/src/cli/commands/embed.ts +217 -242
- package/src/embed/backlog.ts +92 -45
- package/src/embed/fingerprint.ts +37 -0
- package/src/embed/retry.ts +137 -0
- package/src/sdk/embed.ts +134 -59
- package/src/store/migrations/008-vector-fingerprints.ts +25 -0
- package/src/store/migrations/index.ts +2 -1
- package/src/store/sqlite/adapter.ts +20 -6
- package/src/store/types.ts +1 -0
- package/src/store/vector/freshness.ts +34 -0
- package/src/store/vector/sqlite-vec.ts +5 -2
- package/src/store/vector/stats.ts +20 -2
- package/src/store/vector/types.ts +3 -0
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Embedding freshness fingerprint.
|
|
3
|
+
*
|
|
4
|
+
* @module src/embed/fingerprint
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { getEmbeddingCompatibilityProfile } from "../llm/embedding-compatibility";
|
|
8
|
+
|
|
9
|
+
/** Version tag for the contextual text format; hashed into every fingerprint under `contextualFormatting`, so changing it changes the digest. */
export const EMBEDDING_CONTEXTUAL_FORMAT_VERSION = "contextual-embedding-v1";
|
|
10
|
+
/** Version tag for the chunking strategy; hashed into every fingerprint under `chunking`, so changing it changes the digest. */
export const EMBEDDING_CHUNKING_STRATEGY_VERSION = "markdown-char-semantic-v1";
|
|
11
|
+
|
|
12
|
+
/** Inputs that identify one embedding configuration for fingerprinting. */
export interface EmbeddingFingerprintInput {
|
|
13
|
+
/** Embedding model URI; hashed verbatim and also used to look up the model's compatibility profile. */
modelUri: string;
|
|
14
|
+
/** Vector dimensionality when known; hashed as `null` when omitted. */
dimensions?: number;
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
/**
 * Compute the freshness fingerprint for an embedding configuration.
 *
 * The fingerprint is the SHA-256 hex digest of a JSON document built from the
 * chunking and contextual-format version tags, the vector dimensions (`null`
 * when not supplied), the model URI, and four fields of the model's
 * compatibility profile. Any change to one of those inputs yields a
 * different digest.
 *
 * @param input - Model URI and optional vector dimensions.
 * @returns Lowercase hex SHA-256 digest of the serialized configuration.
 */
export function getEmbeddingFingerprint(
  input: EmbeddingFingerprintInput
): string {
  const { modelUri, dimensions } = input;
  const { batchEmbeddingTrusted, documentFormat, id, queryFormat } =
    getEmbeddingCompatibilityProfile(modelUri);

  // Key order is part of the hashed bytes: reordering these properties would
  // change every fingerprint.
  const serialized = JSON.stringify({
    chunking: EMBEDDING_CHUNKING_STRATEGY_VERSION,
    contextualFormatting: EMBEDDING_CONTEXTUAL_FORMAT_VERSION,
    dimensions: dimensions ?? null,
    modelUri,
    profile: {
      batchEmbeddingTrusted,
      documentFormat,
      id,
      queryFormat,
    },
  });

  const hasher = new Bun.CryptoHasher("sha256");
  hasher.update(serialized);
  return hasher.digest("hex");
}
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
import type { EmbeddingPort } from "../llm/types";
|
|
2
|
+
import type { BacklogItem, VectorIndexPort, VectorRow } from "../store/vector";
|
|
3
|
+
|
|
4
|
+
import { formatDocForEmbedding } from "../pipeline/contextual";
|
|
5
|
+
import { embedTextsWithRecovery } from "./batch";
|
|
6
|
+
|
|
7
|
+
/** Attempt ceiling per chunk: callers only re-queue a chunk while its attempt count is below this value. */
export const MAX_EMBED_CHUNK_ATTEMPTS = 2;
|
|
8
|
+
/** Maximum number of distinct failure messages `addUniqueSamples` lets a target list hold. */
export const MAX_EMBED_FAILURE_SAMPLES = 5;
|
|
9
|
+
|
|
10
|
+
/** Outcome of embedding one batch of backlog items and writing the vectors to the store. */
export interface EmbedStoreBatchResult {
|
|
11
|
+
/** Number of vectors written to the store by this call (0 when nothing was stored). */
embedded: number;
|
|
12
|
+
/** Items counted as failed outright: the whole batch on a non-retryable embedding failure, or the vectors whose store write failed. */
errors: number;
|
|
13
|
+
/** Items the caller may try again: the whole batch on a retryable failure, otherwise the items that came back without an embedding. */
retryItems: BacklogItem[];
|
|
14
|
+
/** Failure messages collected for reporting. */
errorSamples: string[];
|
|
15
|
+
/** Optional remediation hint for the user. */
suggestion?: string;
|
|
16
|
+
/** True when the embedding call itself failed; otherwise forwarded from the embedding result. */
batchFailed: boolean;
|
|
17
|
+
/** Batch-level error message, when there is one. */
batchError?: string;
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
/**
 * Build the lookup key that identifies a chunk in retry bookkeeping.
 *
 * The key is the mirror hash and the sequence number joined by a NUL
 * character.
 *
 * @param item - Any object carrying the chunk's `mirrorHash` and `seq`.
 * @returns The composite key string.
 */
export function chunkRetryKey(item: Pick<BacklogItem, "mirrorHash" | "seq">) {
  const { mirrorHash, seq } = item;
  return [mirrorHash, seq].join("\0");
}
|
|
23
|
+
|
|
24
|
+
/**
 * Append failure messages to `target`, skipping duplicates and stopping once
 * `target` holds `MAX_EMBED_FAILURE_SAMPLES` entries.
 *
 * `target` is mutated in place; `samples` is only read.
 *
 * @param target - Accumulator list that receives new messages.
 * @param samples - Candidate messages, considered in order.
 */
export function addUniqueSamples(target: string[], samples: string[]): void {
  for (const candidate of samples) {
    // Cap reached: nothing further can be added, so stop scanning.
    if (target.length >= MAX_EMBED_FAILURE_SAMPLES) {
      return;
    }
    if (target.includes(candidate)) {
      continue;
    }
    target.push(candidate);
  }
}
|
|
34
|
+
|
|
35
|
+
/**
 * Render an embedding failure as a single human-readable line.
 *
 * The error's own message is used as the base. When the error carries a
 * `cause` that is either a string or an object with a string `message`, and
 * that text is non-empty and differs from the base message, it is appended
 * as `"<message> - <cause>"`.
 *
 * @param error - Failure to describe, or `undefined` when none was reported.
 * @returns The formatted message, or `"Unknown embedding failure"` when
 *   `error` is missing.
 */
export function formatLlmFailure(
  error: { message: string; cause?: unknown } | undefined
): string {
  if (error === undefined || error === null) {
    return "Unknown embedding failure";
  }

  const { message, cause } = error;

  // Extract a textual detail from the cause, if it has one.
  let detail = "";
  if (typeof cause === "string") {
    detail = cause;
  } else if (
    typeof cause === "object" &&
    cause !== null &&
    "message" in cause &&
    typeof cause.message === "string"
  ) {
    detail = cause.message;
  }

  // An empty or repeated detail adds nothing to the message.
  if (detail === "" || detail === message) {
    return message;
  }
  return `${message} - ${detail}`;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Embed one batch of backlog items and persist the resulting vectors.
 *
 * Steps:
 * 1. Format each item's text (with its title, if any) for the given model and
 *    request embeddings through `embedTextsWithRecovery`.
 * 2. If the embedding call fails, report the whole batch either as retryable
 *    (`retryItems` = all items, `errors` = 0) or as failed (`errors` = batch
 *    size), depending on the error's `retryable` flag.
 * 3. Otherwise pair items with the returned vectors by position; items with
 *    no vector are returned in `retryItems`.
 * 4. Upsert the vectors. A failed write reports every vector as an error.
 *
 * @param params.embedPort - Port used to compute embeddings.
 * @param params.vectorIndex - Port used to upsert the vectors.
 * @param params.items - Backlog items to embed.
 * @param params.modelUri - Model URI recorded on each stored row.
 * @param params.embedFingerprint - Fingerprint recorded on each stored row.
 * @returns Counts, retry candidates and diagnostics for the batch.
 */
export async function embedAndStoreBatch(params: {
  embedPort: EmbeddingPort;
  vectorIndex: VectorIndexPort;
  items: BacklogItem[];
  modelUri: string;
  embedFingerprint: string;
}): Promise<EmbedStoreBatchResult> {
  const { embedPort, vectorIndex, items, modelUri, embedFingerprint } = params;

  const texts = items.map((item) =>
    formatDocForEmbedding(item.text, item.title ?? undefined, modelUri)
  );
  const embedResult = await embedTextsWithRecovery(embedPort, texts);

  if (!embedResult.ok) {
    const failure = embedResult.error;
    const message = formatLlmFailure(failure);

    if (failure.retryable) {
      // Retryable: hand the whole batch back without counting errors yet.
      return {
        embedded: 0,
        errors: 0,
        retryItems: items,
        errorSamples: [message],
        suggestion:
          "Try rerunning the same command. If failures persist, rerun with `gno --verbose embed --batch-size 1` to isolate failing chunks.",
        batchFailed: true,
        batchError: message,
      };
    }

    return {
      embedded: 0,
      errors: items.length,
      retryItems: [],
      errorSamples: [message],
      suggestion: failure.suggestion,
      batchFailed: true,
      batchError: message,
    };
  }

  const outcome = embedResult.value;
  const rows: VectorRow[] = [];
  const missing: BacklogItem[] = [];

  // Vectors are matched to items by position; a gap means that item must be
  // retried.
  items.forEach((item, position) => {
    const embedding = outcome.vectors[position];
    if (embedding) {
      rows.push({
        mirrorHash: item.mirrorHash,
        seq: item.seq,
        model: modelUri,
        embedFingerprint,
        embedding: new Float32Array(embedding),
      });
    } else {
      missing.push(item);
    }
  });

  // Result shape shared by "nothing to store" and "stored successfully".
  const withoutStoreError = (embeddedCount: number): EmbedStoreBatchResult => ({
    embedded: embeddedCount,
    errors: 0,
    retryItems: missing,
    errorSamples: outcome.failureSamples,
    suggestion: outcome.retrySuggestion,
    batchFailed: outcome.batchFailed,
    batchError: outcome.batchError,
  });

  if (rows.length === 0) {
    return withoutStoreError(0);
  }

  const stored = await vectorIndex.upsertVectors(rows);
  if (stored.ok) {
    return withoutStoreError(rows.length);
  }

  return {
    embedded: 0,
    errors: rows.length,
    retryItems: missing,
    errorSamples: [stored.error.message],
    suggestion:
      "Store write failed. Rerun `gno embed` once more; if it repeats, run `gno doctor` and `gno vec sync`.",
    batchFailed: outcome.batchFailed,
    batchError: outcome.batchError,
  };
}
|
package/src/sdk/embed.ts
CHANGED
|
@@ -19,15 +19,15 @@ import type {
|
|
|
19
19
|
import type { GnoEmbedOptions, GnoEmbedResult } from "./types";
|
|
20
20
|
|
|
21
21
|
import { embedBacklog } from "../embed";
|
|
22
|
-
import {
|
|
22
|
+
import { getEmbeddingFingerprint } from "../embed/fingerprint";
|
|
23
|
+
import {
|
|
24
|
+
chunkRetryKey,
|
|
25
|
+
embedAndStoreBatch,
|
|
26
|
+
MAX_EMBED_CHUNK_ATTEMPTS,
|
|
27
|
+
} from "../embed/retry";
|
|
23
28
|
import { resolveModelUri } from "../llm/registry";
|
|
24
|
-
import { formatDocForEmbedding } from "../pipeline/contextual";
|
|
25
29
|
import { err, ok } from "../store/types";
|
|
26
|
-
import {
|
|
27
|
-
createVectorIndexPort,
|
|
28
|
-
createVectorStatsPort,
|
|
29
|
-
type VectorRow,
|
|
30
|
-
} from "../store/vector";
|
|
30
|
+
import { createVectorIndexPort, createVectorStatsPort } from "../store/vector";
|
|
31
31
|
import { sdkError } from "./errors";
|
|
32
32
|
|
|
33
33
|
interface EmbedRuntimeOptions {
|
|
@@ -121,6 +121,68 @@ async function forceEmbedAll(
|
|
|
121
121
|
let embedded = 0;
|
|
122
122
|
let errors = 0;
|
|
123
123
|
let cursor: { mirrorHash: string; seq: number } | undefined;
|
|
124
|
+
const retryQueue = new Map<string, { item: BacklogItem; attempts: number }>();
|
|
125
|
+
const embedFingerprint = getEmbeddingFingerprint({
|
|
126
|
+
modelUri,
|
|
127
|
+
dimensions: vectorIndex.dimensions,
|
|
128
|
+
});
|
|
129
|
+
|
|
130
|
+
const enqueueRetryItems = (items: BacklogItem[], attempts: number): void => {
|
|
131
|
+
for (const item of items) {
|
|
132
|
+
const key = chunkRetryKey(item);
|
|
133
|
+
const existing = retryQueue.get(key);
|
|
134
|
+
retryQueue.set(key, {
|
|
135
|
+
item,
|
|
136
|
+
attempts: Math.max(existing?.attempts ?? 0, attempts),
|
|
137
|
+
});
|
|
138
|
+
}
|
|
139
|
+
};
|
|
140
|
+
|
|
141
|
+
const drainRetryQueue = async (): Promise<number> => {
|
|
142
|
+
if (retryQueue.size === 0) {
|
|
143
|
+
return 0;
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
let retryEmbedded = 0;
|
|
147
|
+
const entries = [...retryQueue.values()].filter(
|
|
148
|
+
(entry) => entry.attempts < MAX_EMBED_CHUNK_ATTEMPTS
|
|
149
|
+
);
|
|
150
|
+
|
|
151
|
+
for (let idx = 0; idx < entries.length; idx += batchSize) {
|
|
152
|
+
const slice = entries.slice(idx, idx + batchSize);
|
|
153
|
+
for (const entry of slice) {
|
|
154
|
+
retryQueue.delete(chunkRetryKey(entry.item));
|
|
155
|
+
entry.attempts += 1;
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
const retryResult = await embedAndStoreBatch({
|
|
159
|
+
embedPort,
|
|
160
|
+
vectorIndex,
|
|
161
|
+
items: slice.map((entry) => entry.item),
|
|
162
|
+
modelUri,
|
|
163
|
+
embedFingerprint,
|
|
164
|
+
});
|
|
165
|
+
embedded += retryResult.embedded;
|
|
166
|
+
errors += retryResult.errors;
|
|
167
|
+
retryEmbedded += retryResult.embedded;
|
|
168
|
+
|
|
169
|
+
const retryByKey = new Set(
|
|
170
|
+
retryResult.retryItems.map((item) => chunkRetryKey(item))
|
|
171
|
+
);
|
|
172
|
+
for (const entry of slice) {
|
|
173
|
+
if (!retryByKey.has(chunkRetryKey(entry.item))) {
|
|
174
|
+
continue;
|
|
175
|
+
}
|
|
176
|
+
if (entry.attempts >= MAX_EMBED_CHUNK_ATTEMPTS) {
|
|
177
|
+
errors += 1;
|
|
178
|
+
} else {
|
|
179
|
+
retryQueue.set(chunkRetryKey(entry.item), entry);
|
|
180
|
+
}
|
|
181
|
+
}
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
return retryEmbedded;
|
|
185
|
+
};
|
|
124
186
|
|
|
125
187
|
while (true) {
|
|
126
188
|
const batchResult = await getActiveChunks(db, batchSize, cursor);
|
|
@@ -140,45 +202,27 @@ async function forceEmbedAll(
|
|
|
140
202
|
cursor = { mirrorHash: lastItem.mirrorHash, seq: lastItem.seq };
|
|
141
203
|
}
|
|
142
204
|
|
|
143
|
-
const
|
|
205
|
+
const beforeEmbedded = embedded;
|
|
206
|
+
const embedResult = await embedAndStoreBatch({
|
|
144
207
|
embedPort,
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
);
|
|
153
|
-
|
|
154
|
-
if (!embedResult.ok) {
|
|
155
|
-
errors += batch.length;
|
|
156
|
-
continue;
|
|
157
|
-
}
|
|
208
|
+
vectorIndex,
|
|
209
|
+
items: batch,
|
|
210
|
+
modelUri,
|
|
211
|
+
embedFingerprint,
|
|
212
|
+
});
|
|
213
|
+
embedded += embedResult.embedded;
|
|
214
|
+
errors += embedResult.errors;
|
|
215
|
+
enqueueRetryItems(embedResult.retryItems, 1);
|
|
158
216
|
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
const embedding = embedResult.value.vectors[idx];
|
|
162
|
-
if (!embedding) {
|
|
163
|
-
errors += 1;
|
|
164
|
-
continue;
|
|
165
|
-
}
|
|
166
|
-
vectors.push({
|
|
167
|
-
mirrorHash: item.mirrorHash,
|
|
168
|
-
seq: item.seq,
|
|
169
|
-
model: modelUri,
|
|
170
|
-
embedding: new Float32Array(embedding),
|
|
171
|
-
});
|
|
217
|
+
if (embedded > beforeEmbedded) {
|
|
218
|
+
await drainRetryQueue();
|
|
172
219
|
}
|
|
220
|
+
}
|
|
173
221
|
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
continue;
|
|
179
|
-
}
|
|
180
|
-
embedded += vectors.length;
|
|
181
|
-
}
|
|
222
|
+
await drainRetryQueue();
|
|
223
|
+
if (retryQueue.size > 0) {
|
|
224
|
+
errors += retryQueue.size;
|
|
225
|
+
retryQueue.clear();
|
|
182
226
|
}
|
|
183
227
|
|
|
184
228
|
if (vectorIndex.vecDirty) {
|
|
@@ -217,24 +261,25 @@ export async function runEmbed(
|
|
|
217
261
|
const db = runtime.store.getRawDb();
|
|
218
262
|
const stats: VectorStatsPort = createVectorStatsPort(db);
|
|
219
263
|
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
264
|
+
let totalToEmbed = 0;
|
|
265
|
+
if (force) {
|
|
266
|
+
const forceCount = await getActiveChunkCount(db);
|
|
267
|
+
if (!forceCount.ok) {
|
|
268
|
+
throw sdkError("STORE", forceCount.error.message, {
|
|
269
|
+
cause: forceCount.error.cause,
|
|
270
|
+
});
|
|
271
|
+
}
|
|
228
272
|
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
273
|
+
totalToEmbed = forceCount.value;
|
|
274
|
+
if (totalToEmbed === 0 || dryRun) {
|
|
275
|
+
return {
|
|
276
|
+
embedded: totalToEmbed,
|
|
277
|
+
errors: 0,
|
|
278
|
+
duration: 0,
|
|
279
|
+
model: modelUri,
|
|
280
|
+
searchAvailable: await checkVecAvailable(db),
|
|
281
|
+
};
|
|
282
|
+
}
|
|
238
283
|
}
|
|
239
284
|
|
|
240
285
|
const embedResult = await runtime.llm.createEmbeddingPort(modelUri, {
|
|
@@ -266,6 +311,36 @@ export async function runEmbed(
|
|
|
266
311
|
}
|
|
267
312
|
|
|
268
313
|
const vectorIndex = vectorResult.value;
|
|
314
|
+
if (!force) {
|
|
315
|
+
const embedFingerprint = getEmbeddingFingerprint({
|
|
316
|
+
modelUri,
|
|
317
|
+
dimensions: vectorIndex.dimensions,
|
|
318
|
+
});
|
|
319
|
+
const backlogResult = await stats.countBacklog(
|
|
320
|
+
modelUri,
|
|
321
|
+
embedFingerprint,
|
|
322
|
+
{
|
|
323
|
+
collection: options.collection,
|
|
324
|
+
}
|
|
325
|
+
);
|
|
326
|
+
if (!backlogResult.ok) {
|
|
327
|
+
throw sdkError("STORE", backlogResult.error.message, {
|
|
328
|
+
cause: backlogResult.error.cause,
|
|
329
|
+
});
|
|
330
|
+
}
|
|
331
|
+
|
|
332
|
+
totalToEmbed = backlogResult.value;
|
|
333
|
+
if (totalToEmbed === 0 || dryRun) {
|
|
334
|
+
return {
|
|
335
|
+
embedded: totalToEmbed,
|
|
336
|
+
errors: 0,
|
|
337
|
+
duration: 0,
|
|
338
|
+
model: modelUri,
|
|
339
|
+
searchAvailable: vectorIndex.searchAvailable,
|
|
340
|
+
};
|
|
341
|
+
}
|
|
342
|
+
}
|
|
343
|
+
|
|
269
344
|
const startedAt = Date.now();
|
|
270
345
|
let result: { embedded: number; errors: number };
|
|
271
346
|
if (force) {
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Migration: vector embedding freshness fingerprints.
|
|
3
|
+
*
|
|
4
|
+
* @module src/store/migrations/008-vector-fingerprints
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import type { Database } from "bun:sqlite";
|
|
8
|
+
|
|
9
|
+
import type { Migration } from "./runner";
|
|
10
|
+
|
|
11
|
+
/**
 * Migration 8: add the `embed_fingerprint` column to `content_vectors` and an
 * index covering the freshness lookup columns.
 *
 * Existing rows receive the empty string as their fingerprint.
 *
 * Both steps are idempotent: the column is only added when it is not already
 * present, and the index uses `IF NOT EXISTS`. This keeps a re-run after a
 * partially applied migration from failing with "duplicate column name".
 */
export const migration: Migration = {
  version: 8,
  name: "vector_fingerprints",

  up(db: Database): void {
    // SQLite has no `ADD COLUMN IF NOT EXISTS`, so inspect the table first.
    const columns = db
      .prepare("PRAGMA table_info(content_vectors)")
      .all() as { name: string }[];
    const hasFingerprint = columns.some(
      (column) => column.name === "embed_fingerprint"
    );

    if (!hasFingerprint) {
      db.exec(`
        ALTER TABLE content_vectors ADD COLUMN embed_fingerprint TEXT NOT NULL DEFAULT ''
      `);
    }

    db.exec(`
      CREATE INDEX IF NOT EXISTS idx_vectors_freshness
      ON content_vectors(model, embed_fingerprint, mirror_hash, seq, embedded_at)
    `);
  },
};
|
|
@@ -21,6 +21,7 @@ import { migration as m004 } from "./004-doc-links";
|
|
|
21
21
|
import { migration as m005 } from "./005-graph-indexes";
|
|
22
22
|
import { migration as m006 } from "./006-document-metadata";
|
|
23
23
|
import { migration as m007 } from "./007-document-date-fields";
|
|
24
|
+
import { migration as m008 } from "./008-vector-fingerprints";
|
|
24
25
|
|
|
25
26
|
/** All migrations in order */
|
|
26
|
-
export const migrations = [m001, m002, m003, m004, m005, m006, m007];
|
|
27
|
+
export const migrations = [m001, m002, m003, m004, m005, m006, m007, m008];
|
|
@@ -53,6 +53,7 @@ import { analyzeGraphCommunities } from "../../core/graph-analysis";
|
|
|
53
53
|
import { normalizeWikiName, stripWikiMdExt } from "../../core/links";
|
|
54
54
|
import { migrations, runMigrations } from "../migrations";
|
|
55
55
|
import { err, ok } from "../types";
|
|
56
|
+
import { getStoredEmbeddingFingerprint } from "../vector/freshness";
|
|
56
57
|
import { modelTableName } from "../vector/sqlite-vec";
|
|
57
58
|
import { loadFts5Snowball } from "./fts5-snowball";
|
|
58
59
|
|
|
@@ -3065,10 +3066,14 @@ export class SqliteAdapter implements StorePort, SqliteDbProvider {
|
|
|
3065
3066
|
|
|
3066
3067
|
async getStatus(options?: {
|
|
3067
3068
|
embedModel?: string;
|
|
3069
|
+
embedFingerprint?: string;
|
|
3068
3070
|
}): Promise<StoreResult<IndexStatus>> {
|
|
3069
3071
|
try {
|
|
3070
3072
|
const db = this.ensureOpen();
|
|
3071
3073
|
const embedModel = options?.embedModel ?? null;
|
|
3074
|
+
const embedFingerprint =
|
|
3075
|
+
options?.embedFingerprint ??
|
|
3076
|
+
(embedModel ? getStoredEmbeddingFingerprint(db, embedModel) : null);
|
|
3072
3077
|
|
|
3073
3078
|
// Get version
|
|
3074
3079
|
const versionRow = db
|
|
@@ -3097,7 +3102,7 @@ export class SqliteAdapter implements StorePort, SqliteDbProvider {
|
|
|
3097
3102
|
}
|
|
3098
3103
|
|
|
3099
3104
|
const collectionStats = db
|
|
3100
|
-
.query<CollectionStat, [string | null, string | null]>(
|
|
3105
|
+
.query<CollectionStat, [string | null, string | null, string | null]>(
|
|
3101
3106
|
`
|
|
3102
3107
|
SELECT
|
|
3103
3108
|
c.name,
|
|
@@ -3120,7 +3125,10 @@ export class SqliteAdapter implements StorePort, SqliteDbProvider {
|
|
|
3120
3125
|
SELECT 1 FROM content_vectors cv
|
|
3121
3126
|
WHERE cv.mirror_hash = cc.mirror_hash
|
|
3122
3127
|
AND cv.seq = cc.seq
|
|
3123
|
-
AND (? IS NULL OR
|
|
3128
|
+
AND (? IS NULL OR (
|
|
3129
|
+
cv.model = ?
|
|
3130
|
+
AND cv.embed_fingerprint = ?
|
|
3131
|
+
))
|
|
3124
3132
|
AND cv.embedded_at >= cc.created_at
|
|
3125
3133
|
)) as embedded_count
|
|
3126
3134
|
FROM collections c
|
|
@@ -3128,7 +3136,7 @@ export class SqliteAdapter implements StorePort, SqliteDbProvider {
|
|
|
3128
3136
|
GROUP BY c.name, c.path
|
|
3129
3137
|
`
|
|
3130
3138
|
)
|
|
3131
|
-
.all(embedModel, embedModel);
|
|
3139
|
+
.all(embedModel, embedModel, embedFingerprint);
|
|
3132
3140
|
|
|
3133
3141
|
// Get totals
|
|
3134
3142
|
const totalsRow = db
|
|
@@ -3152,7 +3160,10 @@ export class SqliteAdapter implements StorePort, SqliteDbProvider {
|
|
|
3152
3160
|
// Embedding backlog: chunks from active docs without vectors
|
|
3153
3161
|
// Uses EXISTS to avoid duplicates when multiple docs share mirror_hash
|
|
3154
3162
|
const backlogRow = db
|
|
3155
|
-
.query<
|
|
3163
|
+
.query<
|
|
3164
|
+
{ count: number },
|
|
3165
|
+
[string | null, string | null, string | null]
|
|
3166
|
+
>(
|
|
3156
3167
|
`
|
|
3157
3168
|
SELECT COUNT(*) as count FROM content_chunks c
|
|
3158
3169
|
WHERE EXISTS (
|
|
@@ -3163,12 +3174,15 @@ export class SqliteAdapter implements StorePort, SqliteDbProvider {
|
|
|
3163
3174
|
SELECT 1 FROM content_vectors v
|
|
3164
3175
|
WHERE v.mirror_hash = c.mirror_hash
|
|
3165
3176
|
AND v.seq = c.seq
|
|
3166
|
-
AND (? IS NULL OR
|
|
3177
|
+
AND (? IS NULL OR (
|
|
3178
|
+
v.model = ?
|
|
3179
|
+
AND v.embed_fingerprint = ?
|
|
3180
|
+
))
|
|
3167
3181
|
AND v.embedded_at >= c.created_at
|
|
3168
3182
|
)
|
|
3169
3183
|
`
|
|
3170
3184
|
)
|
|
3171
|
-
.get(embedModel, embedModel);
|
|
3185
|
+
.get(embedModel, embedModel, embedFingerprint);
|
|
3172
3186
|
|
|
3173
3187
|
// Recent errors (last 24h)
|
|
3174
3188
|
const recentErrorsRow = db
|
package/src/store/types.ts
CHANGED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Vector freshness helpers.
|
|
3
|
+
*
|
|
4
|
+
* @module src/store/vector/freshness
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import type { Database } from "bun:sqlite";
|
|
8
|
+
|
|
9
|
+
import { getEmbeddingFingerprint } from "../../embed/fingerprint";
|
|
10
|
+
|
|
11
|
+
/**
 * Infer the vector dimensionality stored for a model.
 *
 * Reads the embedding blob of one `content_vectors` row for the model and
 * divides its byte length by the size of a 32-bit float.
 *
 * @param db - Open database handle.
 * @param model - Model identifier to look up.
 * @returns The number of float32 components in the sampled blob, or
 *   `undefined` when the model has no row or the row has no embedding.
 */
export function getStoredEmbeddingDimensions(
  db: Database,
  model: string
): number | undefined {
  const statement = db.prepare(
    "SELECT embedding FROM content_vectors WHERE model = ? LIMIT 1"
  );
  const sample = statement.get(model) as { embedding: Uint8Array } | undefined;
  const blob = sample?.embedding;

  return blob ? blob.byteLength / Float32Array.BYTES_PER_ELEMENT : undefined;
}
|
|
25
|
+
|
|
26
|
+
/**
 * Compute the embedding fingerprint for a model, using the dimensionality
 * inferred from vectors already stored for it.
 *
 * When no stored vector is found the dimensions are passed as `undefined`.
 *
 * @param db - Open database handle.
 * @param modelUri - Model URI to fingerprint.
 * @returns The fingerprint produced by `getEmbeddingFingerprint`.
 */
export function getStoredEmbeddingFingerprint(
  db: Database,
  modelUri: string
): string {
  const dimensions = getStoredEmbeddingDimensions(db, modelUri);
  return getEmbeddingFingerprint({ modelUri, dimensions });
}
|
|
@@ -116,8 +116,10 @@ export async function createVectorIndexPort(
|
|
|
116
116
|
|
|
117
117
|
// Prepared statements for content_vectors table
|
|
118
118
|
const upsertVectorStmt = db.prepare(`
|
|
119
|
-
INSERT OR REPLACE INTO content_vectors (
|
|
120
|
-
|
|
119
|
+
INSERT OR REPLACE INTO content_vectors (
|
|
120
|
+
mirror_hash, seq, model, embed_fingerprint, embedding, embedded_at
|
|
121
|
+
)
|
|
122
|
+
VALUES (?, ?, ?, ?, ?, datetime('now'))
|
|
121
123
|
`);
|
|
122
124
|
|
|
123
125
|
const deleteVectorStmt = db.prepare(`
|
|
@@ -172,6 +174,7 @@ export async function createVectorIndexPort(
|
|
|
172
174
|
row.mirrorHash,
|
|
173
175
|
row.seq,
|
|
174
176
|
row.model,
|
|
177
|
+
row.embedFingerprint,
|
|
175
178
|
encodeEmbedding(row.embedding)
|
|
176
179
|
);
|
|
177
180
|
}
|
|
@@ -65,6 +65,7 @@ export function createVectorStatsPort(db: Database): VectorStatsPort {
|
|
|
65
65
|
|
|
66
66
|
countBacklog(
|
|
67
67
|
model: string,
|
|
68
|
+
embedFingerprint: string,
|
|
68
69
|
options?: { collection?: string }
|
|
69
70
|
): Promise<StoreResult<number>> {
|
|
70
71
|
try {
|
|
@@ -80,10 +81,13 @@ export function createVectorStatsPort(db: Database): VectorStatsPort {
|
|
|
80
81
|
WHERE v.mirror_hash = c.mirror_hash
|
|
81
82
|
AND v.seq = c.seq
|
|
82
83
|
AND v.model = ?
|
|
84
|
+
AND v.embed_fingerprint = ?
|
|
83
85
|
AND v.embedded_at >= c.created_at
|
|
84
86
|
)
|
|
85
87
|
`;
|
|
86
|
-
const result = db
|
|
88
|
+
const result = db
|
|
89
|
+
.prepare(sql)
|
|
90
|
+
.get(...activeDoc.params, model, embedFingerprint) as {
|
|
87
91
|
count: number;
|
|
88
92
|
};
|
|
89
93
|
return Promise.resolve(ok(result.count));
|
|
@@ -99,6 +103,7 @@ export function createVectorStatsPort(db: Database): VectorStatsPort {
|
|
|
99
103
|
|
|
100
104
|
getBacklog(
|
|
101
105
|
model: string,
|
|
106
|
+
embedFingerprint: string,
|
|
102
107
|
options?: {
|
|
103
108
|
limit?: number;
|
|
104
109
|
after?: { mirrorHash: string; seq: number };
|
|
@@ -123,6 +128,7 @@ export function createVectorStatsPort(db: Database): VectorStatsPort {
|
|
|
123
128
|
WHERE v.mirror_hash = c.mirror_hash
|
|
124
129
|
AND v.seq = c.seq
|
|
125
130
|
AND v.model = ?
|
|
131
|
+
AND v.embed_fingerprint = ?
|
|
126
132
|
) THEN 'new'
|
|
127
133
|
ELSE 'changed'
|
|
128
134
|
END as reason
|
|
@@ -133,6 +139,7 @@ export function createVectorStatsPort(db: Database): VectorStatsPort {
|
|
|
133
139
|
WHERE v.mirror_hash = c.mirror_hash
|
|
134
140
|
AND v.seq = c.seq
|
|
135
141
|
AND v.model = ?
|
|
142
|
+
AND v.embed_fingerprint = ?
|
|
136
143
|
AND v.embedded_at >= c.created_at
|
|
137
144
|
)
|
|
138
145
|
AND (c.mirror_hash > ? OR (c.mirror_hash = ? AND c.seq > ?))
|
|
@@ -148,6 +155,7 @@ export function createVectorStatsPort(db: Database): VectorStatsPort {
|
|
|
148
155
|
WHERE v.mirror_hash = c.mirror_hash
|
|
149
156
|
AND v.seq = c.seq
|
|
150
157
|
AND v.model = ?
|
|
158
|
+
AND v.embed_fingerprint = ?
|
|
151
159
|
) THEN 'new'
|
|
152
160
|
ELSE 'changed'
|
|
153
161
|
END as reason
|
|
@@ -158,6 +166,7 @@ export function createVectorStatsPort(db: Database): VectorStatsPort {
|
|
|
158
166
|
WHERE v.mirror_hash = c.mirror_hash
|
|
159
167
|
AND v.seq = c.seq
|
|
160
168
|
AND v.model = ?
|
|
169
|
+
AND v.embed_fingerprint = ?
|
|
161
170
|
AND v.embedded_at >= c.created_at
|
|
162
171
|
)
|
|
163
172
|
ORDER BY c.mirror_hash, c.seq
|
|
@@ -167,14 +176,23 @@ export function createVectorStatsPort(db: Database): VectorStatsPort {
|
|
|
167
176
|
const params = after
|
|
168
177
|
? [
|
|
169
178
|
model,
|
|
179
|
+
embedFingerprint,
|
|
170
180
|
...activeDoc.params,
|
|
171
181
|
model,
|
|
182
|
+
embedFingerprint,
|
|
172
183
|
after.mirrorHash,
|
|
173
184
|
after.mirrorHash,
|
|
174
185
|
after.seq,
|
|
175
186
|
limit,
|
|
176
187
|
]
|
|
177
|
-
: [
|
|
188
|
+
: [
|
|
189
|
+
model,
|
|
190
|
+
embedFingerprint,
|
|
191
|
+
...activeDoc.params,
|
|
192
|
+
model,
|
|
193
|
+
embedFingerprint,
|
|
194
|
+
limit,
|
|
195
|
+
];
|
|
178
196
|
|
|
179
197
|
const results = db.prepare(sql).all(...params) as BacklogItem[];
|
|
180
198
|
return Promise.resolve(ok(results));
|