akm-cli 0.7.1 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +35 -0
- package/dist/cli.js +62 -16
- package/dist/commands/history.js +2 -7
- package/dist/commands/info.js +2 -2
- package/dist/commands/installed-stashes.js +45 -1
- package/dist/commands/search.js +2 -2
- package/dist/commands/show.js +4 -19
- package/dist/commands/source-add.js +1 -1
- package/dist/core/common.js +16 -1
- package/dist/core/config.js +18 -3
- package/dist/indexer/db-search.js +33 -39
- package/dist/indexer/db.js +51 -1
- package/dist/indexer/graph-extraction.js +5 -3
- package/dist/indexer/indexer.js +334 -121
- package/dist/indexer/manifest.js +18 -23
- package/dist/indexer/memory-inference.js +47 -58
- package/dist/indexer/metadata.js +253 -21
- package/dist/indexer/search-source.js +11 -5
- package/dist/llm/client.js +61 -1
- package/dist/llm/embedder.js +8 -5
- package/dist/llm/embedders/local.js +8 -2
- package/dist/llm/embedders/remote.js +4 -2
- package/dist/llm/graph-extract.js +4 -4
- package/dist/llm/memory-infer.js +61 -33
- package/dist/llm/metadata-enhance.js +2 -2
- package/dist/output/cli-hints.js +5 -2
- package/dist/output/renderers.js +22 -49
- package/dist/registry/build-index.js +13 -18
- package/dist/setup/setup.js +238 -96
- package/dist/sources/providers/git.js +14 -2
- package/dist/sources/providers/website.js +4 -460
- package/dist/sources/website-ingest.js +470 -0
- package/dist/wiki/wiki.js +11 -1
- package/dist/workflows/parser.js +19 -4
- package/dist/workflows/runs.js +3 -3
- package/docs/README.md +10 -3
- package/docs/migration/release-notes/0.7.0.md +22 -0
- package/package.json +5 -2
package/dist/indexer/indexer.js
CHANGED
|
@@ -5,25 +5,32 @@ import { getDbPath } from "../core/paths";
|
|
|
5
5
|
import { isVerbose, warn, warnVerbose } from "../core/warn";
|
|
6
6
|
import { resolveIndexPassLLM } from "../llm/index-passes";
|
|
7
7
|
import { takeWorkflowDocument } from "../workflows/document-cache";
|
|
8
|
-
import { closeDatabase, deleteEntriesByDir, deleteEntriesByStashDir, getEmbeddingCount, getEntriesByDir, getEntryCount, getMeta, isVecAvailable, openDatabase, rebuildFts, setMeta, upsertEmbedding, upsertEntry, upsertUtilityScore, warnIfVecMissing, } from "./db";
|
|
8
|
+
import { closeDatabase, deleteEntriesByDir, deleteEntriesByStashDir, deleteIndexDirStatesByStashDir, getEmbeddingCount, getEntriesByDir, getEntryCount, getIndexDirState, getMeta, isVecAvailable, openDatabase, openExistingDatabase, rebuildFts, setMeta, upsertEmbedding, upsertEntry, upsertIndexDirState, upsertUtilityScore, warnIfVecMissing, } from "./db";
|
|
9
9
|
import { runGraphExtractionPass } from "./graph-extraction";
|
|
10
10
|
import { runMemoryInferencePass } from "./memory-inference";
|
|
11
|
-
import { generateMetadataFlat, isWorkflowSkipWarning, loadStashFile, shouldIndexStashFile, } from "./metadata";
|
|
11
|
+
import { applyCuratedFrontmatter, applyWikiFrontmatter, generateMetadataFlat, isWorkflowSkipWarning, loadStashFile, shouldIndexStashFile, } from "./metadata";
|
|
12
12
|
import { buildSearchText } from "./search-fields";
|
|
13
13
|
import { classifySemanticFailure, clearSemanticStatus, deriveSemanticProviderFingerprint, writeSemanticStatus, } from "./semantic-status";
|
|
14
14
|
import { ensureUsageEventsSchema, purgeOldUsageEvents } from "./usage-events";
|
|
15
15
|
import { walkStashFlat } from "./walker";
|
|
16
|
+
/**
 * Throw if the given abort signal has already been aborted.
 *
 * Does nothing when `signal` is null/undefined or has not been aborted.
 *
 * @param {AbortSignal | undefined | null} signal - Optional cancellation signal.
 * @throws {Error} `signal.reason` itself when it is an Error instance; otherwise
 *   a generic "index interrupted" error that carries the original (non-Error)
 *   reason as its `cause`, so the caller-supplied reason is not lost.
 */
function throwIfAborted(signal) {
    if (!signal?.aborted) {
        return;
    }
    const { reason } = signal;
    if (reason instanceof Error) {
        throw reason;
    }
    // Keep the historical message, but preserve whatever reason was passed to abort().
    throw new Error("index interrupted", { cause: reason });
}
|
|
16
21
|
// ── Indexer ──────────────────────────────────────────────────────────────────
|
|
17
22
|
export async function akmIndex(options) {
|
|
18
23
|
const stashDir = options?.stashDir || resolveStashDir();
|
|
19
24
|
const onProgress = options?.onProgress ?? (() => { });
|
|
25
|
+
const signal = options?.signal;
|
|
26
|
+
const enrich = options?.enrich === true;
|
|
20
27
|
// Load config and resolve all stash sources
|
|
21
28
|
const { loadConfig } = await import("../core/config.js");
|
|
22
29
|
const config = loadConfig();
|
|
23
30
|
// Ensure git stash caches are extracted before resolving stash dirs,
|
|
24
31
|
// so their content directories exist on disk for the walker to discover.
|
|
25
32
|
const { ensureSourceCaches, resolveSourceEntries } = await import("./search-source.js");
|
|
26
|
-
await ensureSourceCaches(config);
|
|
33
|
+
await ensureSourceCaches(config, { force: options?.full === true });
|
|
27
34
|
const allSourceEntries = resolveSourceEntries(stashDir, config);
|
|
28
35
|
const allSourceDirs = allSourceEntries.map((s) => s.path);
|
|
29
36
|
const t0 = Date.now();
|
|
@@ -44,13 +51,11 @@ export async function akmIndex(options) {
|
|
|
44
51
|
sourcesCount: allSourceDirs.length,
|
|
45
52
|
semanticSearchMode: config.semanticSearchMode,
|
|
46
53
|
embeddingProvider: getEmbeddingProvider(config.embedding),
|
|
47
|
-
|
|
48
|
-
// run. Today that means the enrichment pass; future passes plug in
|
|
49
|
-
// via `resolveIndexPassLLM`.
|
|
50
|
-
llmEnabled: !!resolveIndexPassLLM("enrichment", config),
|
|
54
|
+
llmEnabled: enrich && !!resolveIndexPassLLM("enrichment", config),
|
|
51
55
|
vecAvailable: isVecAvailable(db),
|
|
52
56
|
}),
|
|
53
57
|
});
|
|
58
|
+
let hadRemovedSources = false;
|
|
54
59
|
if (options?.full || !isIncremental) {
|
|
55
60
|
// The delete is now merged into the insert transaction inside
|
|
56
61
|
// indexEntries() so that a reader never sees an empty database between
|
|
@@ -77,31 +82,39 @@ export async function akmIndex(options) {
|
|
|
77
82
|
const currentSet = new Set(allSourceDirs);
|
|
78
83
|
for (const dir of prevStashDirs) {
|
|
79
84
|
if (!currentSet.has(dir)) {
|
|
85
|
+
hadRemovedSources = true;
|
|
80
86
|
deleteEntriesByStashDir(db, dir);
|
|
87
|
+
deleteIndexDirStatesByStashDir(db, dir);
|
|
81
88
|
}
|
|
82
89
|
}
|
|
83
90
|
}
|
|
84
91
|
}
|
|
85
|
-
|
|
92
|
+
throwIfAborted(signal);
|
|
93
|
+
// Memory inference pass (#201). Runs before the walk so any derived-memory
|
|
86
94
|
// children that get written are picked up by the walker in this same run
|
|
87
95
|
// and don't have to wait for the next `akm index`. Gated entirely by
|
|
88
96
|
// `resolveIndexPassLLM("memory", config)` — when the user has no
|
|
89
97
|
// `akm.llm` block or has set `index.memory.llm = false`, this is a no-op
|
|
90
98
|
// and existing inferred children are left in place.
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
99
|
+
if (enrich) {
|
|
100
|
+
try {
|
|
101
|
+
const inferenceResult = await runMemoryInferencePass(config, allSourceEntries, signal);
|
|
102
|
+
if (inferenceResult.writtenFacts > 0) {
|
|
103
|
+
onProgress({
|
|
104
|
+
phase: "llm",
|
|
105
|
+
message: `Memory inference wrote ${inferenceResult.writtenFacts} derived memor${inferenceResult.writtenFacts === 1 ? "y" : "ies"} from ${inferenceResult.splitParents} parent memor${inferenceResult.splitParents === 1 ? "y" : "ies"}.`,
|
|
106
|
+
});
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
catch (err) {
|
|
110
|
+
warn(`Memory inference pass aborted: ${err instanceof Error ? err.message : String(err)}`);
|
|
98
111
|
}
|
|
99
112
|
}
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
113
|
+
else {
|
|
114
|
+
onProgress({
|
|
115
|
+
phase: "llm",
|
|
116
|
+
message: "LLM passes disabled; rerun with --enrich to enable inference and enrichment.",
|
|
117
|
+
});
|
|
105
118
|
}
|
|
106
119
|
// Graph extraction pass (#207). Runs after memory inference so any
|
|
107
120
|
// atomic-fact children that just got written are visible to the graph
|
|
@@ -113,24 +126,27 @@ export async function akmIndex(options) {
|
|
|
113
126
|
// `llm.features.graph_extraction` feature flag or the per-pass
|
|
114
127
|
// `index.graph.llm` toggle) is off; the existing graph file is
|
|
115
128
|
// preserved on disk in that case.
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
129
|
+
if (enrich) {
|
|
130
|
+
try {
|
|
131
|
+
const graphResult = await runGraphExtractionPass(config, allSourceEntries, signal);
|
|
132
|
+
if (graphResult.written) {
|
|
133
|
+
onProgress({
|
|
134
|
+
phase: "llm",
|
|
135
|
+
message: `Graph extraction wrote ${graphResult.totalEntities} entit${graphResult.totalEntities === 1 ? "y" : "ies"} and ${graphResult.totalRelations} relation${graphResult.totalRelations === 1 ? "" : "s"} from ${graphResult.extracted} file${graphResult.extracted === 1 ? "" : "s"}.`,
|
|
136
|
+
});
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
catch (err) {
|
|
140
|
+
warn(`Graph extraction pass aborted: ${err instanceof Error ? err.message : String(err)}`);
|
|
123
141
|
}
|
|
124
142
|
}
|
|
125
|
-
|
|
126
|
-
warn(`Graph extraction pass aborted: ${err instanceof Error ? err.message : String(err)}`);
|
|
127
|
-
}
|
|
143
|
+
throwIfAborted(signal);
|
|
128
144
|
const tWalkStart = Date.now();
|
|
129
145
|
// Walk stash dirs and index entries.
|
|
130
146
|
// doFullDelete=true merges the wipe into the same transaction as the
|
|
131
147
|
// inserts so readers never see an empty database mid-rebuild.
|
|
132
148
|
const doFullDelete = options?.full || !isIncremental;
|
|
133
|
-
const { scannedDirs, skippedDirs, generatedCount, dirsNeedingLlm, warnings } = await indexEntries(db, allSourceEntries, isIncremental, builtAtMs, doFullDelete);
|
|
149
|
+
const { scannedDirs, skippedDirs, generatedCount, dirsNeedingLlm, warnings } = await indexEntries(db, allSourceEntries, isIncremental, builtAtMs, hadRemovedSources, doFullDelete, onProgress);
|
|
134
150
|
onProgress({
|
|
135
151
|
phase: "scan",
|
|
136
152
|
message: `Scanned ${scannedDirs} ${scannedDirs === 1 ? "directory" : "directories"} and skipped ${skippedDirs}.`,
|
|
@@ -150,15 +166,17 @@ export async function akmIndex(options) {
|
|
|
150
166
|
}
|
|
151
167
|
}
|
|
152
168
|
const tWalkEnd = Date.now();
|
|
169
|
+
throwIfAborted(signal);
|
|
153
170
|
// Enhance entries with LLM if configured
|
|
154
|
-
await enhanceDirsWithLlm(db, config, dirsNeedingLlm);
|
|
171
|
+
await enhanceDirsWithLlm(db, config, dirsNeedingLlm, signal, enrich);
|
|
155
172
|
onProgress({
|
|
156
173
|
phase: "llm",
|
|
157
|
-
message: resolveIndexPassLLM("enrichment", config)
|
|
174
|
+
message: enrich && resolveIndexPassLLM("enrichment", config)
|
|
158
175
|
? `LLM enhancement reviewed ${dirsNeedingLlm.length} ${dirsNeedingLlm.length === 1 ? "directory" : "directories"}.`
|
|
159
176
|
: "LLM enhancement disabled.",
|
|
160
177
|
});
|
|
161
178
|
const tLlmEnd = Date.now();
|
|
179
|
+
throwIfAborted(signal);
|
|
162
180
|
// Rebuild FTS after all inserts. Use incremental mode when this whole
|
|
163
181
|
// index run is incremental — only entries touched by `upsertEntry`
|
|
164
182
|
// since the last rebuild are re-indexed, instead of re-scanning every
|
|
@@ -200,6 +218,7 @@ export async function akmIndex(options) {
|
|
|
200
218
|
catch {
|
|
201
219
|
/* best-effort */
|
|
202
220
|
}
|
|
221
|
+
throwIfAborted(signal);
|
|
203
222
|
// Generate embeddings if semantic search is enabled
|
|
204
223
|
const embeddingResult = await generateEmbeddingsForDb(db, config, onProgress);
|
|
205
224
|
const tEmbedEnd = Date.now();
|
|
@@ -252,7 +271,7 @@ export async function akmIndex(options) {
|
|
|
252
271
|
}
|
|
253
272
|
}
|
|
254
273
|
// ── Extracted helpers for indexing ────────────────────────────────────────────
|
|
255
|
-
async function indexEntries(db, allSourceEntries, isIncremental, builtAtMs, doFullDelete = false) {
|
|
274
|
+
async function indexEntries(db, allSourceEntries, isIncremental, builtAtMs, hadRemovedSources, doFullDelete = false, onProgress) {
|
|
256
275
|
let scannedDirs = 0;
|
|
257
276
|
let skippedDirs = 0;
|
|
258
277
|
let generatedCount = 0;
|
|
@@ -260,9 +279,29 @@ async function indexEntries(db, allSourceEntries, isIncremental, builtAtMs, doFu
|
|
|
260
279
|
const seenPaths = new Set();
|
|
261
280
|
const dirsNeedingLlm = [];
|
|
262
281
|
const dirRecords = [];
|
|
282
|
+
let processedDirs = 0;
|
|
283
|
+
let priorDirsChanged = hadRemovedSources;
|
|
284
|
+
const reportScanProgress = (message) => {
|
|
285
|
+
onProgress?.({
|
|
286
|
+
phase: "scan",
|
|
287
|
+
message,
|
|
288
|
+
processed: processedDirs,
|
|
289
|
+
total: allSourceEntries.length,
|
|
290
|
+
});
|
|
291
|
+
};
|
|
292
|
+
const reportDirDecision = (kind, dirPath, currentStashDir, reason, persistedRowCount) => {
|
|
293
|
+
if (!isVerbose())
|
|
294
|
+
return;
|
|
295
|
+
const detail = reason.detail ? ` (${reason.detail})` : "";
|
|
296
|
+
const rowInfo = persistedRowCount !== undefined ? `; previous rows=${persistedRowCount}` : "";
|
|
297
|
+
reportScanProgress(`${kind === "scan" ? "Rescanning" : "Skipping"} ${path.relative(currentStashDir, dirPath) || "."} ` +
|
|
298
|
+
`from ${currentStashDir}: ${reason.kind}${detail}${rowInfo}`);
|
|
299
|
+
};
|
|
263
300
|
for (const sourceAdded of allSourceEntries) {
|
|
264
301
|
const currentStashDir = sourceAdded.path;
|
|
265
302
|
const fileContexts = walkStashFlat(currentStashDir);
|
|
303
|
+
processedDirs++;
|
|
304
|
+
reportScanProgress(`Processed ${processedDirs}/${allSourceEntries.length} source${allSourceEntries.length === 1 ? "" : "s"}.`);
|
|
266
305
|
// Wiki-root stashes: all .md files are indexed as wiki pages under wikiName
|
|
267
306
|
if (sourceAdded.wikiName) {
|
|
268
307
|
const wikiName = sourceAdded.wikiName;
|
|
@@ -273,13 +312,17 @@ async function indexEntries(db, allSourceEntries, isIncremental, builtAtMs, doFu
|
|
|
273
312
|
if (!shouldIndexStashFile(currentStashDir, ctx.absPath, { treatStashRootAsWikiRoot: true }))
|
|
274
313
|
continue;
|
|
275
314
|
const relNoExt = ctx.relPath.replace(/\.md$/, "");
|
|
315
|
+
const frontmatter = ctx.frontmatter() ?? {};
|
|
276
316
|
const entry = {
|
|
277
317
|
name: `${wikiName}/${relNoExt}`,
|
|
278
318
|
type: "wiki",
|
|
279
319
|
filename: ctx.fileName,
|
|
280
|
-
|
|
281
|
-
|
|
320
|
+
quality: "generated",
|
|
321
|
+
confidence: 0.55,
|
|
322
|
+
source: "filename",
|
|
282
323
|
};
|
|
324
|
+
applyCuratedFrontmatter(entry, frontmatter);
|
|
325
|
+
applyWikiFrontmatter(entry, frontmatter);
|
|
283
326
|
const dir = ctx.parentDirAbs;
|
|
284
327
|
const group = wikiDirGroups.get(dir);
|
|
285
328
|
if (group) {
|
|
@@ -292,12 +335,32 @@ async function indexEntries(db, allSourceEntries, isIncremental, builtAtMs, doFu
|
|
|
292
335
|
}
|
|
293
336
|
for (const [dirPath, { files, entries }] of wikiDirGroups) {
|
|
294
337
|
if (seenPaths.has(path.resolve(dirPath))) {
|
|
295
|
-
|
|
338
|
+
const reason = { kind: "duplicate-dir" };
|
|
339
|
+
dirRecords.push({ dirPath, currentStashDir, files, stash: null, skip: true, reason });
|
|
340
|
+
reportDirDecision("skip", dirPath, currentStashDir, reason);
|
|
296
341
|
continue;
|
|
297
342
|
}
|
|
298
343
|
seenPaths.add(path.resolve(dirPath));
|
|
344
|
+
const previousState = getDirIndexState(db, dirPath, files, builtAtMs);
|
|
345
|
+
if (isIncremental && !previousState.stale && canUseIncrementalSkip(previousState, priorDirsChanged)) {
|
|
346
|
+
skippedDirs++;
|
|
347
|
+
dirRecords.push({ dirPath, currentStashDir, files, stash: null, skip: true, reason: previousState.reason });
|
|
348
|
+
reportDirDecision("skip", dirPath, currentStashDir, previousState.reason, previousState.persistedRowCount);
|
|
349
|
+
continue;
|
|
350
|
+
}
|
|
299
351
|
scannedDirs++;
|
|
300
|
-
|
|
352
|
+
priorDirsChanged = true;
|
|
353
|
+
const reason = isIncremental ? previousState.reason : { kind: "full-rebuild" };
|
|
354
|
+
dirRecords.push({
|
|
355
|
+
dirPath,
|
|
356
|
+
currentStashDir,
|
|
357
|
+
files,
|
|
358
|
+
stash: { entries },
|
|
359
|
+
skip: false,
|
|
360
|
+
reason,
|
|
361
|
+
persistedRowCount: previousState.persistedRowCount,
|
|
362
|
+
});
|
|
363
|
+
reportDirDecision("scan", dirPath, currentStashDir, reason, previousState.persistedRowCount);
|
|
301
364
|
}
|
|
302
365
|
continue;
|
|
303
366
|
}
|
|
@@ -311,46 +374,70 @@ async function indexEntries(db, allSourceEntries, isIncremental, builtAtMs, doFu
|
|
|
311
374
|
dirGroups.set(dir, [ctx.absPath]);
|
|
312
375
|
}
|
|
313
376
|
for (const [dirPath, files] of dirGroups) {
|
|
377
|
+
const indexableFiles = files.filter((file) => shouldIndexStashFile(currentStashDir, file));
|
|
314
378
|
if (seenPaths.has(path.resolve(dirPath))) {
|
|
315
|
-
|
|
379
|
+
const reason = { kind: "duplicate-dir" };
|
|
380
|
+
dirRecords.push({ dirPath, currentStashDir, files: indexableFiles, stash: null, skip: true, reason });
|
|
381
|
+
reportDirDecision("skip", dirPath, currentStashDir, reason);
|
|
316
382
|
continue;
|
|
317
383
|
}
|
|
318
384
|
seenPaths.add(path.resolve(dirPath));
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
const
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
continue;
|
|
326
|
-
}
|
|
385
|
+
if (indexableFiles.length === 0) {
|
|
386
|
+
skippedDirs++;
|
|
387
|
+
const reason = { kind: "no-indexable-files" };
|
|
388
|
+
dirRecords.push({ dirPath, currentStashDir, files: indexableFiles, stash: null, skip: true, reason });
|
|
389
|
+
reportDirDecision("skip", dirPath, currentStashDir, reason);
|
|
390
|
+
continue;
|
|
327
391
|
}
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
}
|
|
342
|
-
}
|
|
392
|
+
const cachedZeroRowState = isIncremental && getCachedZeroRowDirState(db, dirPath, indexableFiles, builtAtMs, priorDirsChanged);
|
|
393
|
+
if (cachedZeroRowState) {
|
|
394
|
+
skippedDirs++;
|
|
395
|
+
dirRecords.push({
|
|
396
|
+
dirPath,
|
|
397
|
+
currentStashDir,
|
|
398
|
+
files: indexableFiles,
|
|
399
|
+
stash: null,
|
|
400
|
+
skip: true,
|
|
401
|
+
reason: cachedZeroRowState.reason,
|
|
402
|
+
});
|
|
403
|
+
reportDirDecision("skip", dirPath, currentStashDir, cachedZeroRowState.reason, cachedZeroRowState.persistedRowCount);
|
|
404
|
+
continue;
|
|
343
405
|
}
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
}
|
|
406
|
+
const generated = await generateMetadataFlat(currentStashDir, indexableFiles);
|
|
407
|
+
if (generated.warnings?.length)
|
|
408
|
+
warnings.push(...generated.warnings);
|
|
409
|
+
const legacyOverrides = loadStashFile(dirPath, { requireFilename: true });
|
|
410
|
+
const { stash, staleFiles } = buildIndexedDirCandidate(dirPath, indexableFiles, generated, legacyOverrides);
|
|
411
|
+
if (generated.entries.length > 0) {
|
|
412
|
+
generatedCount += generated.entries.length;
|
|
352
413
|
}
|
|
353
|
-
|
|
414
|
+
const previousState = getDirIndexState(db, dirPath, staleFiles, builtAtMs);
|
|
415
|
+
if (isIncremental && !previousState.stale && canUseIncrementalSkip(previousState, priorDirsChanged)) {
|
|
416
|
+
skippedDirs++;
|
|
417
|
+
dirRecords.push({
|
|
418
|
+
dirPath,
|
|
419
|
+
currentStashDir,
|
|
420
|
+
files: staleFiles,
|
|
421
|
+
stash: null,
|
|
422
|
+
skip: true,
|
|
423
|
+
reason: previousState.reason,
|
|
424
|
+
});
|
|
425
|
+
reportDirDecision("skip", dirPath, currentStashDir, previousState.reason, previousState.persistedRowCount);
|
|
426
|
+
continue;
|
|
427
|
+
}
|
|
428
|
+
scannedDirs++;
|
|
429
|
+
priorDirsChanged = true;
|
|
430
|
+
const reason = isIncremental ? previousState.reason : { kind: "full-rebuild" };
|
|
431
|
+
dirRecords.push({
|
|
432
|
+
dirPath,
|
|
433
|
+
currentStashDir,
|
|
434
|
+
files: staleFiles,
|
|
435
|
+
stash,
|
|
436
|
+
skip: false,
|
|
437
|
+
reason,
|
|
438
|
+
persistedRowCount: previousState.persistedRowCount,
|
|
439
|
+
});
|
|
440
|
+
reportDirDecision("scan", dirPath, currentStashDir, reason, previousState.persistedRowCount);
|
|
354
441
|
}
|
|
355
442
|
}
|
|
356
443
|
// Phase 2 (sync): write all pre-generated metadata inside a single transaction.
|
|
@@ -382,6 +469,7 @@ async function indexEntries(db, allSourceEntries, isIncremental, builtAtMs, doFu
|
|
|
382
469
|
}
|
|
383
470
|
db.exec("DELETE FROM entries_fts");
|
|
384
471
|
db.exec("DELETE FROM utility_scores");
|
|
472
|
+
db.exec("DELETE FROM index_dir_state");
|
|
385
473
|
// Detach usage_events from entries about to be deleted — null out entry_id
|
|
386
474
|
// but keep entry_ref so events can be re-linked after entries are rebuilt.
|
|
387
475
|
try {
|
|
@@ -392,32 +480,42 @@ async function indexEntries(db, allSourceEntries, isIncremental, builtAtMs, doFu
|
|
|
392
480
|
}
|
|
393
481
|
db.exec("DELETE FROM entries");
|
|
394
482
|
}
|
|
395
|
-
for (const { dirPath, currentStashDir, files, stash, skip } of dirRecords) {
|
|
396
|
-
if (skip)
|
|
483
|
+
for (const { dirPath, currentStashDir, files, stash, skip, reason } of dirRecords) {
|
|
484
|
+
if (skip) {
|
|
485
|
+
if (reason?.kind === "unchanged") {
|
|
486
|
+
const fingerprint = computeDirFingerprint(dirPath, files);
|
|
487
|
+
upsertIndexDirState(db, {
|
|
488
|
+
dirPath,
|
|
489
|
+
fileSetHash: fingerprint.fileSetHash,
|
|
490
|
+
fileMtimeMaxMs: fingerprint.fileMtimeMaxMs,
|
|
491
|
+
reason: reason.kind,
|
|
492
|
+
});
|
|
493
|
+
}
|
|
397
494
|
continue;
|
|
495
|
+
}
|
|
398
496
|
// Delete old entries for this dir (will be re-inserted)
|
|
399
497
|
deleteEntriesByDir(db, dirPath);
|
|
498
|
+
let persistedRows = 0;
|
|
499
|
+
let dedupedRows = 0;
|
|
400
500
|
if (stash) {
|
|
401
|
-
// Build a lookup for matching filename-less entries to actual files
|
|
402
|
-
const fileBasenameMap = buildFileBasenameMap(files);
|
|
403
501
|
for (const entry of stash.entries) {
|
|
404
|
-
const entryPath = entry.filename
|
|
405
|
-
? path.join(dirPath, entry.filename)
|
|
406
|
-
: matchEntryToFile(entry.name, fileBasenameMap, files);
|
|
502
|
+
const entryPath = entry.filename ? path.join(dirPath, entry.filename) : null;
|
|
407
503
|
if (!entryPath)
|
|
408
504
|
continue; // skip unresolvable entries
|
|
409
505
|
if (!shouldIndexStashFile(currentStashDir, entryPath))
|
|
410
506
|
continue;
|
|
411
507
|
// Skip if a higher-priority stash root already indexed this asset
|
|
412
|
-
const
|
|
413
|
-
|
|
414
|
-
|
|
508
|
+
const identityKey = `${entry.type}\0${entry.name}`;
|
|
509
|
+
if (indexedAssetIdentities.has(identityKey)) {
|
|
510
|
+
dedupedRows++;
|
|
415
511
|
continue;
|
|
512
|
+
}
|
|
416
513
|
indexedAssetIdentities.add(identityKey);
|
|
417
514
|
const entryKey = `${currentStashDir}:${entry.type}:${entry.name}`;
|
|
418
515
|
const searchText = buildSearchText(entry);
|
|
419
516
|
const entryWithSize = attachFileSize(entry, entryPath);
|
|
420
517
|
const entryId = upsertEntry(db, entryKey, dirPath, entryPath, currentStashDir, entryWithSize, searchText);
|
|
518
|
+
persistedRows++;
|
|
421
519
|
if (entry.type === "workflow") {
|
|
422
520
|
const doc = takeWorkflowDocument(entry);
|
|
423
521
|
if (doc) {
|
|
@@ -430,12 +528,121 @@ async function indexEntries(db, allSourceEntries, isIncremental, builtAtMs, doFu
|
|
|
430
528
|
dirsNeedingLlm.push({ dirPath, files, currentStashDir, stash });
|
|
431
529
|
}
|
|
432
530
|
}
|
|
531
|
+
const fingerprint = computeDirFingerprint(dirPath, files);
|
|
532
|
+
const persistedReason = persistedRows === 0
|
|
533
|
+
? inferZeroRowReason(stash, reason, warnings, dirPath, dedupedRows)
|
|
534
|
+
: reason?.kind === "full-rebuild"
|
|
535
|
+
? "full-rebuild"
|
|
536
|
+
: (reason?.kind ?? "updated");
|
|
537
|
+
upsertIndexDirState(db, {
|
|
538
|
+
dirPath,
|
|
539
|
+
fileSetHash: fingerprint.fileSetHash,
|
|
540
|
+
fileMtimeMaxMs: fingerprint.fileMtimeMaxMs,
|
|
541
|
+
reason: persistedReason,
|
|
542
|
+
});
|
|
543
|
+
if (persistedRows === 0) {
|
|
544
|
+
warnVerbose(`[index] zero-row ${dirPath}: ${persistedReason}`);
|
|
545
|
+
}
|
|
433
546
|
}
|
|
434
547
|
});
|
|
435
548
|
insertTransaction();
|
|
436
549
|
return { scannedDirs, skippedDirs, generatedCount, warnings, dirsNeedingLlm };
|
|
437
550
|
}
|
|
438
|
-
|
|
551
|
+
/**
 * Classify a directory for the incremental indexer.
 *
 * When the database already holds rows for `dirPath`, staleness is decided by
 * `getDirStaleReason`. When it holds none, the cached per-directory state from
 * `getIndexDirState` is compared against a fresh fingerprint of `files`; a
 * match means the directory previously produced zero rows and nothing changed.
 *
 * @param db - Open index database handle.
 * @param dirPath - Directory being evaluated.
 * @param files - Files in that directory that take part in the decision.
 * @param builtAtMs - Timestamp (ms) that file mtimes are compared against.
 * @returns {{ stale: boolean, reason: { kind: string, detail?: string }, persistedRowCount: number }}
 */
function getDirIndexState(db, dirPath, files, builtAtMs) {
    const prevEntries = getEntriesByDir(db, dirPath);
    if (prevEntries.length > 0) {
        const staleReason = getDirStaleReason(dirPath, files, prevEntries, builtAtMs);
        if (!staleReason) {
            return { stale: false, reason: { kind: "unchanged" }, persistedRowCount: prevEntries.length };
        }
        return { stale: true, reason: staleReason, persistedRowCount: prevEntries.length };
    }
    const cachedState = getIndexDirState(db, dirPath);
    if (cachedState) {
        // The fingerprint stats every file, so compute it only on the one path
        // that actually compares it (no previous rows + a cached state to match).
        const fingerprint = computeDirFingerprint(dirPath, files);
        if (cachedState.fileSetHash === fingerprint.fileSetHash &&
            cachedState.fileMtimeMaxMs === fingerprint.fileMtimeMaxMs) {
            return {
                stale: false,
                reason: { kind: "cached-zero-row-state", detail: cachedState.reason },
                persistedRowCount: 0,
            };
        }
    }
    return {
        stale: true,
        reason: { kind: "no-previous-rows", detail: cachedState ? `cached=${cachedState.reason}` : undefined },
        persistedRowCount: 0,
    };
}
|
|
577
|
+
/**
 * Return the directory's index state only when it is a non-stale
 * "cached-zero-row-state" that the incremental pass is still allowed to skip;
 * otherwise return `undefined`.
 *
 * @param db - Open index database handle.
 * @param dirPath - Directory being evaluated.
 * @param files - Files considered for that directory.
 * @param builtAtMs - Timestamp (ms) forwarded to `getDirIndexState`.
 * @param priorDirsChanged - Whether an earlier directory/source changed in this run.
 */
function getCachedZeroRowDirState(db, dirPath, files, builtAtMs, priorDirsChanged) {
    const dirState = getDirIndexState(db, dirPath, files, builtAtMs);
    const isFreshZeroRow = !dirState.stale && dirState.reason.kind === "cached-zero-row-state";
    if (isFreshZeroRow && canUseIncrementalSkip(dirState, priorDirsChanged)) {
        return dirState;
    }
    return undefined;
}
|
|
585
|
+
/**
 * Tell whether a non-stale directory may be skipped in an incremental run.
 *
 * The only case that blocks skipping is a cached zero-row state recorded as
 * "deduped-zero-row" while an earlier directory or source has changed.
 *
 * @param state - Result object carrying `reason.kind` and `reason.detail`.
 * @param priorDirsChanged - Whether anything earlier in this run changed.
 * @returns {boolean} `true` when skipping is allowed.
 */
function canUseIncrementalSkip(state, priorDirsChanged) {
    if (!priorDirsChanged) {
        return true;
    }
    const { reason } = state;
    const isDedupedZeroRow = reason.kind === "cached-zero-row-state" && reason.detail === "deduped-zero-row";
    return !isDedupedZeroRow;
}
|
|
590
|
+
/**
 * Build a cheap fingerprint for a set of files.
 *
 * @param _dirPath - Unused; kept for signature parity with sibling helpers.
 * @param {string[]} files - File paths to fingerprint.
 * @returns {{ fileSetHash: string, fileMtimeMaxMs: number }} `fileSetHash` is
 *   the sorted, de-duplicated basenames joined with NUL; `fileMtimeMaxMs` is
 *   the newest mtime, or `Infinity` as soon as any file cannot be stat'ed.
 */
function computeDirFingerprint(_dirPath, files) {
    const uniqueBasenames = new Set();
    for (const filePath of files) {
        uniqueBasenames.add(path.basename(filePath));
    }
    const fileSetHash = Array.from(uniqueBasenames).sort().join("\0");
    const newestMtime = () => {
        let newest = 0;
        for (const filePath of files) {
            let mtimeMs;
            try {
                mtimeMs = fs.statSync(filePath).mtimeMs;
            }
            catch {
                // An unreadable file stops the scan and pins the value to Infinity.
                return Number.POSITIVE_INFINITY;
            }
            newest = Math.max(newest, mtimeMs);
        }
        return newest;
    };
    return { fileSetHash, fileMtimeMaxMs: newestMtime() };
}
|
|
607
|
+
/**
 * Explain why a previously indexed directory is stale, or return `undefined`
 * when it is not.
 *
 * Checks, in order: the set of basenames on disk versus the basenames recorded
 * for the previous entries, then each current file's mtime against `builtAtMs`.
 *
 * @param _dirPath - Unused.
 * @param {string[]} currentFiles - File paths currently considered for the directory.
 * @param previousEntries - Previously indexed rows (`filePath`, `entry.filename`).
 * @param {number} builtAtMs - Timestamp (ms) that file mtimes are compared against.
 * @returns {{ kind: string, detail: string } | undefined}
 */
function getDirStaleReason(_dirPath, currentFiles, previousEntries, builtAtMs) {
    const indexedNames = new Set();
    for (const indexed of previousEntries) {
        const recordedName = path.basename(indexed.filePath) || indexed.entry.filename;
        if (recordedName) {
            indexedNames.add(recordedName);
        }
    }
    const onDiskNames = new Set(currentFiles.map((filePath) => path.basename(filePath)));
    if (indexedNames.size !== onDiskNames.size) {
        return { kind: "file-set-changed", detail: `${indexedNames.size} -> ${onDiskNames.size} files` };
    }
    const unseenName = [...onDiskNames].find((name) => !indexedNames.has(name));
    if (unseenName !== undefined) {
        return { kind: "file-set-changed", detail: unseenName };
    }
    for (const filePath of currentFiles) {
        let mtimeMs;
        try {
            mtimeMs = fs.statSync(filePath).mtimeMs;
        }
        catch {
            return { kind: "missing-file", detail: path.basename(filePath) };
        }
        if (mtimeMs > builtAtMs) {
            return { kind: "mtime-changed", detail: path.basename(filePath) };
        }
    }
    return undefined;
}
|
|
633
|
+
/**
 * Pick the reason label stored for a directory that produced zero index rows.
 *
 * Precedence: de-duplicated rows, then a "Skipped workflow " warning that
 * mentions `dirPath`, then an absent/empty stash, and finally a generic
 * `zero-row:<prior kind>` label.
 *
 * @param stash - Candidate stash (`{ entries }`) or a falsy value.
 * @param priorReason - Reason object from the scan decision, if any.
 * @param {string[]} warnings - Warnings collected so far in this run.
 * @param {string} dirPath - Directory the label is for.
 * @param {number} dedupedRows - Rows dropped because an identical asset was already indexed.
 * @returns {string}
 */
function inferZeroRowReason(stash, priorReason, warnings, dirPath, dedupedRows) {
    if (dedupedRows > 0) {
        return "deduped-zero-row";
    }
    const mentionsSkippedWorkflow = (text) => text.startsWith("Skipped workflow ") && text.includes(dirPath);
    if (warnings.some((text) => mentionsSkippedWorkflow(text))) {
        return "workflow-noise";
    }
    const hasEntries = Boolean(stash) && stash.entries.length !== 0;
    if (!hasEntries) {
        return "empty-generated-set";
    }
    const priorKind = priorReason?.kind ?? "unknown";
    return `zero-row:${priorKind}`;
}
|
|
643
|
+
async function enhanceDirsWithLlm(db, config, dirsNeedingLlm, signal, enrich = false) {
|
|
644
|
+
if (!enrich)
|
|
645
|
+
return;
|
|
439
646
|
// Resolve per-pass LLM config via the unified shim. Returns undefined when
|
|
440
647
|
// either no `akm.llm` is configured or the user opted this pass out via
|
|
441
648
|
// `index.enrichment.llm = false`. (#208)
|
|
@@ -447,12 +654,13 @@ async function enhanceDirsWithLlm(db, config, dirsNeedingLlm) {
|
|
|
447
654
|
// and leaving the user wondering why nothing got enhanced.
|
|
448
655
|
const summary = { attempted: 0, succeeded: 0, failureSamples: [] };
|
|
449
656
|
for (const { dirPath, files, currentStashDir, stash: originalStash } of dirsNeedingLlm) {
|
|
657
|
+
throwIfAborted(signal);
|
|
450
658
|
// Only enhance generated entries; user-provided overrides should not be overwritten
|
|
451
659
|
const generatedEntries = originalStash.entries.filter((e) => e.quality === "generated");
|
|
452
660
|
if (generatedEntries.length === 0)
|
|
453
661
|
continue;
|
|
454
662
|
const generatedStash = { entries: generatedEntries };
|
|
455
|
-
const enhanced = await enhanceStashWithLlm(llmConfig, generatedStash, files, summary);
|
|
663
|
+
const enhanced = await enhanceStashWithLlm(llmConfig, generatedStash, files, summary, signal);
|
|
456
664
|
// Re-upsert the enhanced entries in a single transaction so a crash
|
|
457
665
|
// cannot leave half the entries updated and the rest stale.
|
|
458
666
|
db.transaction(() => {
|
|
@@ -475,7 +683,8 @@ async function enhanceDirsWithLlm(db, config, dirsNeedingLlm) {
|
|
|
475
683
|
warn(`LLM enhancement failed for ${failed}/${summary.attempted} entries — they were left un-enhanced.${sample}`);
|
|
476
684
|
}
|
|
477
685
|
}
|
|
478
|
-
async function generateEmbeddingsForDb(db, config, onProgress) {
|
|
686
|
+
async function generateEmbeddingsForDb(db, config, onProgress, signal) {
|
|
687
|
+
throwIfAborted(signal);
|
|
479
688
|
if (config.semanticSearchMode === "off") {
|
|
480
689
|
onProgress({ phase: "embeddings", message: "Semantic search disabled; skipping embeddings." });
|
|
481
690
|
return { success: false, reason: "index-missing", message: "Semantic search is disabled." };
|
|
@@ -504,6 +713,7 @@ async function generateEmbeddingsForDb(db, config, onProgress) {
|
|
|
504
713
|
try {
|
|
505
714
|
const { embedBatch } = await import("../llm/embedder.js");
|
|
506
715
|
const { estimateTokenCount } = await import("../llm/embedders/remote.js");
|
|
716
|
+
throwIfAborted(signal);
|
|
507
717
|
const allEntries = getAllEntriesForEmbedding(db);
|
|
508
718
|
if (allEntries.length === 0) {
|
|
509
719
|
onProgress({ phase: "embeddings", message: "Embeddings already up to date." });
|
|
@@ -528,7 +738,8 @@ async function generateEmbeddingsForDb(db, config, onProgress) {
|
|
|
528
738
|
warnVerbose(`[embed] ${ref} (${chars} chars, est. ${tokens} tokens) → batch ${batchNum}/${totalBatches}`);
|
|
529
739
|
}
|
|
530
740
|
}
|
|
531
|
-
const embeddings = await embedBatch(texts, config.embedding);
|
|
741
|
+
const embeddings = await embedBatch(texts, config.embedding, signal);
|
|
742
|
+
throwIfAborted(signal);
|
|
532
743
|
// Wrap all embedding upserts in a single transaction so partial
|
|
533
744
|
// state is rolled back on failure rather than leaving the table half-filled.
|
|
534
745
|
db.transaction(() => {
|
|
@@ -668,41 +879,31 @@ function verifyIndexState(db, config, totalEntries, embeddingResult) {
|
|
|
668
879
|
vecAvailable,
|
|
669
880
|
};
|
|
670
881
|
}
|
|
671
|
-
function
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
for (const
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
return true;
|
|
689
|
-
}
|
|
690
|
-
}
|
|
691
|
-
// Check .stash.json modification time
|
|
692
|
-
const stashPath = path.join(dirPath, ".stash.json");
|
|
693
|
-
try {
|
|
694
|
-
if (fs.statSync(stashPath).mtimeMs > builtAtMs)
|
|
695
|
-
return true;
|
|
696
|
-
}
|
|
697
|
-
catch {
|
|
698
|
-
// file doesn't exist, not stale
|
|
882
|
+
function buildIndexedDirCandidate(dirPath, indexableFiles, generated, legacyOverrides) {
|
|
883
|
+
const mergedEntries = legacyOverrides
|
|
884
|
+
? generated.entries.map((entry) => mergeLegacyEntry(entry, legacyOverrides.entries))
|
|
885
|
+
: generated.entries;
|
|
886
|
+
const stash = mergedEntries.length > 0 ? { entries: mergedEntries } : legacyOverrides;
|
|
887
|
+
const staleFiles = stash ? resolveIndexedFiles(dirPath, indexableFiles, stash) : indexableFiles;
|
|
888
|
+
return { stash, staleFiles };
|
|
889
|
+
}
|
|
890
|
+
function resolveIndexedFiles(dirPath, files, stash) {
|
|
891
|
+
const fileBasenameMap = buildFileBasenameMap(files);
|
|
892
|
+
const resolved = new Set();
|
|
893
|
+
for (const entry of stash.entries) {
|
|
894
|
+
const entryPath = entry.filename
|
|
895
|
+
? path.join(dirPath, entry.filename)
|
|
896
|
+
: matchEntryToFile(entry.name, fileBasenameMap, files);
|
|
897
|
+
if (entryPath)
|
|
898
|
+
resolved.add(entryPath);
|
|
699
899
|
}
|
|
700
|
-
return
|
|
900
|
+
return resolved.size > 0 ? [...resolved] : files;
|
|
701
901
|
}
|
|
702
|
-
async function enhanceStashWithLlm(llmConfig, stash, files, summary) {
|
|
902
|
+
async function enhanceStashWithLlm(llmConfig, stash, files, summary, signal) {
|
|
703
903
|
const { enhanceMetadata } = await import("../llm/metadata-enhance");
|
|
704
904
|
const enhanced = [];
|
|
705
905
|
for (const entry of stash.entries) {
|
|
906
|
+
throwIfAborted(signal);
|
|
706
907
|
summary.attempted++;
|
|
707
908
|
try {
|
|
708
909
|
const entryFile = entry.filename
|
|
@@ -717,7 +918,7 @@ async function enhanceStashWithLlm(llmConfig, stash, files, summary) {
|
|
|
717
918
|
/* ignore unreadable files */
|
|
718
919
|
}
|
|
719
920
|
}
|
|
720
|
-
const improvements = await enhanceMetadata(llmConfig, entry, fileContent);
|
|
921
|
+
const improvements = await enhanceMetadata(llmConfig, entry, fileContent, signal);
|
|
721
922
|
const updated = { ...entry };
|
|
722
923
|
if (improvements.description)
|
|
723
924
|
updated.description = improvements.description;
|
|
@@ -760,9 +961,9 @@ export function buildFileBasenameMap(files) {
|
|
|
760
961
|
* 1. Exact basename match: entry.name === filename without extension
|
|
761
962
|
* 2. Last path segment match: for entries with names like "dir/sub-entry",
|
|
762
963
|
* try matching the last segment
|
|
763
|
-
* 3.
|
|
964
|
+
* 3. No implicit file fallback: ambiguous legacy entries are skipped
|
|
764
965
|
*/
|
|
765
|
-
export function matchEntryToFile(entryName, fileMap,
|
|
966
|
+
export function matchEntryToFile(entryName, fileMap, _files) {
|
|
766
967
|
// Exact match on entry name
|
|
767
968
|
const exact = fileMap.get(entryName);
|
|
768
969
|
if (exact)
|
|
@@ -774,8 +975,20 @@ export function matchEntryToFile(entryName, fileMap, files) {
|
|
|
774
975
|
if (segmentMatch)
|
|
775
976
|
return segmentMatch;
|
|
776
977
|
}
|
|
777
|
-
|
|
778
|
-
|
|
978
|
+
return null;
|
|
979
|
+
}
|
|
980
|
+
function mergeLegacyEntry(entry, legacyEntries) {
|
|
981
|
+
const legacy = legacyEntries.find((candidate) => candidate.filename === entry.filename);
|
|
982
|
+
if (!legacy)
|
|
983
|
+
return entry;
|
|
984
|
+
return {
|
|
985
|
+
...entry,
|
|
986
|
+
...legacy,
|
|
987
|
+
filename: entry.filename,
|
|
988
|
+
source: legacy.source ?? entry.source,
|
|
989
|
+
quality: legacy.quality ?? entry.quality,
|
|
990
|
+
confidence: legacy.confidence ?? entry.confidence,
|
|
991
|
+
};
|
|
779
992
|
}
|
|
780
993
|
/**
|
|
781
994
|
* Look up a single asset by ref. Spec §6.2 — `akm show` queries this and
|
|
@@ -803,7 +1016,7 @@ export async function lookup(ref) {
|
|
|
803
1016
|
if (sources.length === 0)
|
|
804
1017
|
return null;
|
|
805
1018
|
const dbPath = getDbPath();
|
|
806
|
-
const db =
|
|
1019
|
+
const db = openExistingDatabase(dbPath);
|
|
807
1020
|
try {
|
|
808
1021
|
// entry_key shape: `${stashDir}:${type}:${name}`. Suffix-match on
|
|
809
1022
|
// `:type:name` so we can scope by source dir as a prefix when origin is
|