@gmickel/gno 1.5.2 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/package.json +5 -2
- package/src/cli/commands/doctor.ts +179 -1
- package/src/cli/commands/embed.ts +217 -242
- package/src/embed/backlog.ts +92 -45
- package/src/embed/fingerprint.ts +37 -0
- package/src/embed/retry.ts +137 -0
- package/src/llm/nodeLlamaCpp/embedding.ts +81 -19
- package/src/sdk/embed.ts +134 -59
- package/src/store/migrations/008-vector-fingerprints.ts +25 -0
- package/src/store/migrations/index.ts +2 -1
- package/src/store/sqlite/adapter.ts +20 -6
- package/src/store/types.ts +1 -0
- package/src/store/vector/freshness.ts +34 -0
- package/src/store/vector/sqlite-vec.ts +5 -2
- package/src/store/vector/stats.ts +20 -2
- package/src/store/vector/types.ts +3 -0
package/README.md
CHANGED
|
@@ -99,7 +99,7 @@ gno daemon --detach # headless continuous indexing (background; --status / --st
|
|
|
99
99
|
- **Retrieval Quality Upgrade**: stronger BM25 lexical handling, code-aware chunking, terminal result hyperlinks, and per-collection model overrides
|
|
100
100
|
- **Code Embedding Benchmarks**: new benchmark workflow across canonical, real-GNO, and pinned OSS slices for comparing alternate embedding models
|
|
101
101
|
- **Default Embed Model**: built-in presets now use `Qwen3-Embedding-0.6B-GGUF` after it beat `bge-m3` on both code and multilingual prose benchmark lanes
|
|
102
|
-
- **Regression Fixes**: tightened phrase/negation/hyphen/underscore BM25 behavior, cleaned non-TTY hyperlink output, improved `gno doctor` chunking visibility, and fixed the embedding autoresearch harness
|
|
102
|
+
- **Regression Fixes**: tightened phrase/negation/hyphen/underscore BM25 behavior, cleaned non-TTY hyperlink output, improved `gno doctor` chunking and embedding fingerprint visibility, and fixed the embedding autoresearch harness
|
|
103
103
|
|
|
104
104
|
### Upgrading Existing Collections
|
|
105
105
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@gmickel/gno",
|
|
3
|
-
"version": "1.5.2",
|
|
3
|
+
"version": "1.7.0",
|
|
4
4
|
"description": "Local semantic search for your documents. Index Markdown, PDF, and Office files with hybrid BM25 + vector search.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"embeddings",
|
|
@@ -60,6 +60,7 @@
|
|
|
60
60
|
"test:web": "bun test test/serve/public --timeout 30000",
|
|
61
61
|
"test:e2e": "bun scripts/web-ui-smoke.ts",
|
|
62
62
|
"test:e2e:install": "bunx playwright install chromium",
|
|
63
|
+
"test:package": "bun scripts/package-smoke.ts",
|
|
63
64
|
"test:watch": "bun test --watch",
|
|
64
65
|
"test:coverage": "bun test --coverage",
|
|
65
66
|
"test:coverage:html": "bun test --coverage --html",
|
|
@@ -74,6 +75,8 @@
|
|
|
74
75
|
"bench:code-embeddings:write": "bun scripts/code-embedding-benchmark.ts --write",
|
|
75
76
|
"bench:general-embeddings": "bun scripts/general-embedding-benchmark.ts",
|
|
76
77
|
"bench:general-embeddings:write": "bun scripts/general-embedding-benchmark.ts --write",
|
|
78
|
+
"bench:cpu-embeddings": "bun scripts/cpu-embed-autoresearch.ts",
|
|
79
|
+
"bench:cpu-embeddings:native-batch-probe": "bun scripts/native-embedding-batch-probe.ts",
|
|
77
80
|
"eval:retrieval-candidates": "bun scripts/retrieval-candidate-benchmark.ts",
|
|
78
81
|
"eval:retrieval-candidates:write": "bun scripts/retrieval-candidate-benchmark.ts --write",
|
|
79
82
|
"eval:watch": "bun --bun evalite watch",
|
|
@@ -128,7 +131,7 @@
|
|
|
128
131
|
"version:patch": "npm version patch --no-git-tag-version",
|
|
129
132
|
"version:minor": "npm version minor --no-git-tag-version",
|
|
130
133
|
"version:major": "npm version major --no-git-tag-version",
|
|
131
|
-
"prerelease": "bun run lint:check && bun test",
|
|
134
|
+
"prerelease": "bun run lint:check && bun test && bun run docs:verify && bun run test:package",
|
|
132
135
|
"release:dry-run": "gh workflow run publish.yml -f publish=false",
|
|
133
136
|
"release:trigger": "gh workflow run publish.yml -f publish=true",
|
|
134
137
|
"prepare": "lefthook install"
|
|
package/src/cli/commands/doctor.ts
CHANGED
|
@@ -17,13 +17,15 @@ import { getConfigPaths, isInitialized, loadConfig } from "../../config";
|
|
|
17
17
|
import { getCodeChunkingStatus } from "../../ingestion/chunker";
|
|
18
18
|
import { ModelCache } from "../../llm/cache";
|
|
19
19
|
import { LlmAdapter } from "../../llm/nodeLlamaCpp/adapter";
|
|
20
|
-
import { getActivePreset } from "../../llm/registry";
|
|
20
|
+
import { getActivePreset, resolveModelUri } from "../../llm/registry";
|
|
21
|
+
import { SqliteAdapter } from "../../store/sqlite/adapter";
|
|
21
22
|
import { loadFts5Snowball } from "../../store/sqlite/fts5-snowball";
|
|
22
23
|
import {
|
|
23
24
|
getCustomSqlitePath,
|
|
24
25
|
getExtensionLoadingMode,
|
|
25
26
|
getLoadAttempts,
|
|
26
27
|
} from "../../store/sqlite/setup";
|
|
28
|
+
import { getStoredEmbeddingFingerprint } from "../../store/vector/freshness";
|
|
27
29
|
|
|
28
30
|
// ─────────────────────────────────────────────────────────────────────────────
|
|
29
31
|
// Types
|
|
@@ -37,6 +39,25 @@ export interface DoctorCheck {
|
|
|
37
39
|
message: string;
|
|
38
40
|
/** Additional diagnostic details (shown in verbose/json output) */
|
|
39
41
|
details?: string[];
|
|
42
|
+
/** Embedding fingerprint diagnostics for machine consumers */
|
|
43
|
+
embeddingFingerprint?: EmbeddingFingerprintHealth;
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
export interface EmbeddingFingerprintGroup {
|
|
47
|
+
model: string;
|
|
48
|
+
fingerprint: string;
|
|
49
|
+
count: number;
|
|
50
|
+
current: boolean;
|
|
51
|
+
legacy: boolean;
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
export interface EmbeddingFingerprintHealth {
|
|
55
|
+
model: string;
|
|
56
|
+
currentFingerprint: string;
|
|
57
|
+
pendingChunks: number;
|
|
58
|
+
legacyChunks: number;
|
|
59
|
+
mixedGroups: number;
|
|
60
|
+
groups: EmbeddingFingerprintGroup[];
|
|
40
61
|
}
|
|
41
62
|
|
|
42
63
|
export interface DoctorOptions {
|
|
@@ -137,6 +158,160 @@ function checkCodeChunking(): DoctorCheck {
|
|
|
137
158
|
};
|
|
138
159
|
}
|
|
139
160
|
|
|
161
|
+
const FINGERPRINT_DISPLAY_LENGTH = 12;
|
|
162
|
+
|
|
163
|
+
function shortFingerprint(fingerprint: string): string {
|
|
164
|
+
return fingerprint.slice(0, FINGERPRINT_DISPLAY_LENGTH);
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
function describeFingerprintGroup(group: EmbeddingFingerprintGroup): string {
|
|
168
|
+
if (group.current) {
|
|
169
|
+
return "current";
|
|
170
|
+
}
|
|
171
|
+
if (group.legacy) {
|
|
172
|
+
return "legacy";
|
|
173
|
+
}
|
|
174
|
+
return "stale";
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
async function checkEmbeddingFingerprints(
|
|
178
|
+
config: Config
|
|
179
|
+
): Promise<DoctorCheck> {
|
|
180
|
+
const dbPath = getIndexDbPath();
|
|
181
|
+
try {
|
|
182
|
+
await stat(dbPath);
|
|
183
|
+
} catch {
|
|
184
|
+
return {
|
|
185
|
+
name: "embedding-fingerprint",
|
|
186
|
+
status: "warn",
|
|
187
|
+
message: "Database not found. Run: gno init",
|
|
188
|
+
};
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
const store = new SqliteAdapter();
|
|
192
|
+
const paths = getConfigPaths();
|
|
193
|
+
store.setConfigPath(paths.configFile);
|
|
194
|
+
|
|
195
|
+
const openResult = await store.open(dbPath, config.ftsTokenizer);
|
|
196
|
+
if (!openResult.ok) {
|
|
197
|
+
return {
|
|
198
|
+
name: "embedding-fingerprint",
|
|
199
|
+
status: "warn",
|
|
200
|
+
message: `Fingerprint health unavailable: ${openResult.error.message}`,
|
|
201
|
+
details: ["Run: gno doctor --json", "Then run: gno embed"],
|
|
202
|
+
};
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
try {
|
|
206
|
+
const db = store.getRawDb();
|
|
207
|
+
const model = resolveModelUri(config, "embed");
|
|
208
|
+
const currentFingerprint = getStoredEmbeddingFingerprint(db, model);
|
|
209
|
+
const statusResult = await store.getStatus({ embedModel: model });
|
|
210
|
+
if (!statusResult.ok) {
|
|
211
|
+
return {
|
|
212
|
+
name: "embedding-fingerprint",
|
|
213
|
+
status: "warn",
|
|
214
|
+
message: `Fingerprint health unavailable: ${statusResult.error.message}`,
|
|
215
|
+
details: ["Run: gno embed"],
|
|
216
|
+
};
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
const legacyChunks =
|
|
220
|
+
db
|
|
221
|
+
.query<{ count: number }, [string]>(
|
|
222
|
+
`
|
|
223
|
+
SELECT COUNT(*) as count
|
|
224
|
+
FROM content_vectors v
|
|
225
|
+
JOIN content_chunks c
|
|
226
|
+
ON c.mirror_hash = v.mirror_hash
|
|
227
|
+
AND c.seq = v.seq
|
|
228
|
+
WHERE v.model = ?
|
|
229
|
+
AND v.embed_fingerprint = ''
|
|
230
|
+
AND EXISTS (
|
|
231
|
+
SELECT 1 FROM documents d
|
|
232
|
+
WHERE d.mirror_hash = c.mirror_hash
|
|
233
|
+
AND d.active = 1
|
|
234
|
+
)
|
|
235
|
+
`
|
|
236
|
+
)
|
|
237
|
+
.get(model)?.count ?? 0;
|
|
238
|
+
|
|
239
|
+
const groups = db
|
|
240
|
+
.query<{ model: string; fingerprint: string; count: number }, []>(
|
|
241
|
+
`
|
|
242
|
+
SELECT
|
|
243
|
+
v.model as model,
|
|
244
|
+
v.embed_fingerprint as fingerprint,
|
|
245
|
+
COUNT(*) as count
|
|
246
|
+
FROM content_vectors v
|
|
247
|
+
JOIN content_chunks c
|
|
248
|
+
ON c.mirror_hash = v.mirror_hash
|
|
249
|
+
AND c.seq = v.seq
|
|
250
|
+
WHERE EXISTS (
|
|
251
|
+
SELECT 1 FROM documents d
|
|
252
|
+
WHERE d.mirror_hash = c.mirror_hash
|
|
253
|
+
AND d.active = 1
|
|
254
|
+
)
|
|
255
|
+
GROUP BY v.model, v.embed_fingerprint
|
|
256
|
+
ORDER BY count DESC, v.model ASC, v.embed_fingerprint ASC
|
|
257
|
+
`
|
|
258
|
+
)
|
|
259
|
+
.all()
|
|
260
|
+
.map((group) => ({
|
|
261
|
+
model: group.model,
|
|
262
|
+
fingerprint: group.fingerprint,
|
|
263
|
+
count: group.count,
|
|
264
|
+
current:
|
|
265
|
+
group.model === model && group.fingerprint === currentFingerprint,
|
|
266
|
+
legacy: group.fingerprint === "",
|
|
267
|
+
}));
|
|
268
|
+
|
|
269
|
+
const mixedGroups = groups.length;
|
|
270
|
+
const pendingChunks = statusResult.value.embeddingBacklog;
|
|
271
|
+
const hasWarnings =
|
|
272
|
+
pendingChunks > 0 || legacyChunks > 0 || mixedGroups > 1;
|
|
273
|
+
|
|
274
|
+
const health: EmbeddingFingerprintHealth = {
|
|
275
|
+
model,
|
|
276
|
+
currentFingerprint,
|
|
277
|
+
pendingChunks,
|
|
278
|
+
legacyChunks,
|
|
279
|
+
mixedGroups,
|
|
280
|
+
groups,
|
|
281
|
+
};
|
|
282
|
+
|
|
283
|
+
const message =
|
|
284
|
+
`current ${shortFingerprint(currentFingerprint)}, ` +
|
|
285
|
+
`${pendingChunks} pending/stale, ${legacyChunks} legacy, ` +
|
|
286
|
+
`${mixedGroups} group${mixedGroups === 1 ? "" : "s"}`;
|
|
287
|
+
const details: string[] = [];
|
|
288
|
+
|
|
289
|
+
if (hasWarnings) {
|
|
290
|
+
details.push("Run: gno embed");
|
|
291
|
+
details.push("If vectors still look stale, run: gno embed --force");
|
|
292
|
+
for (const group of groups) {
|
|
293
|
+
const label = describeFingerprintGroup(group);
|
|
294
|
+
const fingerprint = group.legacy
|
|
295
|
+
? "(empty)"
|
|
296
|
+
: shortFingerprint(group.fingerprint);
|
|
297
|
+
details.push(
|
|
298
|
+
`${label}: ${group.count} chunks model=${group.model} fingerprint=${fingerprint}`
|
|
299
|
+
);
|
|
300
|
+
}
|
|
301
|
+
}
|
|
302
|
+
|
|
303
|
+
return {
|
|
304
|
+
name: "embedding-fingerprint",
|
|
305
|
+
status: hasWarnings ? "warn" : "ok",
|
|
306
|
+
message,
|
|
307
|
+
details: details.length > 0 ? details : undefined,
|
|
308
|
+
embeddingFingerprint: health,
|
|
309
|
+
};
|
|
310
|
+
} finally {
|
|
311
|
+
await store.close();
|
|
312
|
+
}
|
|
313
|
+
}
|
|
314
|
+
|
|
140
315
|
async function checkNodeLlamaCpp(config: Config): Promise<DoctorCheck> {
|
|
141
316
|
const llm = new LlmAdapter(config);
|
|
142
317
|
try {
|
|
@@ -341,6 +516,9 @@ export async function doctor(
|
|
|
341
516
|
// Code chunking capability
|
|
342
517
|
checks.push(checkCodeChunking());
|
|
343
518
|
|
|
519
|
+
// Embedding fingerprint freshness
|
|
520
|
+
checks.push(await checkEmbeddingFingerprints(config));
|
|
521
|
+
|
|
344
522
|
// Determine overall health
|
|
345
523
|
const hasErrors = checks.some((c) => c.status === "error");
|
|
346
524
|
|