akm-cli 0.7.1 → 0.7.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/CHANGELOG.md +35 -0
  2. package/dist/cli.js +62 -16
  3. package/dist/commands/history.js +2 -7
  4. package/dist/commands/info.js +2 -2
  5. package/dist/commands/installed-stashes.js +45 -1
  6. package/dist/commands/search.js +2 -2
  7. package/dist/commands/show.js +4 -19
  8. package/dist/commands/source-add.js +1 -1
  9. package/dist/core/common.js +16 -1
  10. package/dist/core/config.js +18 -3
  11. package/dist/indexer/db-search.js +33 -39
  12. package/dist/indexer/db.js +51 -1
  13. package/dist/indexer/graph-extraction.js +5 -3
  14. package/dist/indexer/indexer.js +334 -121
  15. package/dist/indexer/manifest.js +18 -23
  16. package/dist/indexer/memory-inference.js +47 -58
  17. package/dist/indexer/metadata.js +253 -21
  18. package/dist/indexer/search-source.js +11 -5
  19. package/dist/llm/client.js +61 -1
  20. package/dist/llm/embedder.js +8 -5
  21. package/dist/llm/embedders/local.js +8 -2
  22. package/dist/llm/embedders/remote.js +4 -2
  23. package/dist/llm/graph-extract.js +4 -4
  24. package/dist/llm/memory-infer.js +61 -33
  25. package/dist/llm/metadata-enhance.js +2 -2
  26. package/dist/output/cli-hints.js +5 -2
  27. package/dist/output/renderers.js +22 -49
  28. package/dist/registry/build-index.js +13 -18
  29. package/dist/setup/setup.js +238 -96
  30. package/dist/sources/providers/git.js +14 -2
  31. package/dist/sources/providers/website.js +4 -460
  32. package/dist/sources/website-ingest.js +470 -0
  33. package/dist/wiki/wiki.js +11 -1
  34. package/dist/workflows/parser.js +19 -4
  35. package/dist/workflows/runs.js +3 -3
  36. package/docs/README.md +10 -3
  37. package/docs/migration/release-notes/0.7.0.md +22 -0
  38. package/package.json +5 -2
package/dist/indexer/indexer.js
@@ -5,25 +5,32 @@ import { getDbPath } from "../core/paths";
  import { isVerbose, warn, warnVerbose } from "../core/warn";
  import { resolveIndexPassLLM } from "../llm/index-passes";
  import { takeWorkflowDocument } from "../workflows/document-cache";
- import { closeDatabase, deleteEntriesByDir, deleteEntriesByStashDir, getEmbeddingCount, getEntriesByDir, getEntryCount, getMeta, isVecAvailable, openDatabase, rebuildFts, setMeta, upsertEmbedding, upsertEntry, upsertUtilityScore, warnIfVecMissing, } from "./db";
+ import { closeDatabase, deleteEntriesByDir, deleteEntriesByStashDir, deleteIndexDirStatesByStashDir, getEmbeddingCount, getEntriesByDir, getEntryCount, getIndexDirState, getMeta, isVecAvailable, openDatabase, openExistingDatabase, rebuildFts, setMeta, upsertEmbedding, upsertEntry, upsertIndexDirState, upsertUtilityScore, warnIfVecMissing, } from "./db";
  import { runGraphExtractionPass } from "./graph-extraction";
  import { runMemoryInferencePass } from "./memory-inference";
- import { generateMetadataFlat, isWorkflowSkipWarning, loadStashFile, shouldIndexStashFile, } from "./metadata";
+ import { applyCuratedFrontmatter, applyWikiFrontmatter, generateMetadataFlat, isWorkflowSkipWarning, loadStashFile, shouldIndexStashFile, } from "./metadata";
  import { buildSearchText } from "./search-fields";
  import { classifySemanticFailure, clearSemanticStatus, deriveSemanticProviderFingerprint, writeSemanticStatus, } from "./semantic-status";
  import { ensureUsageEventsSchema, purgeOldUsageEvents } from "./usage-events";
  import { walkStashFlat } from "./walker";
+ function throwIfAborted(signal) {
+ if (signal?.aborted) {
+ throw signal.reason instanceof Error ? signal.reason : new Error("index interrupted");
+ }
+ }
  // ── Indexer ──────────────────────────────────────────────────────────────────
  export async function akmIndex(options) {
  const stashDir = options?.stashDir || resolveStashDir();
  const onProgress = options?.onProgress ?? (() => { });
+ const signal = options?.signal;
+ const enrich = options?.enrich === true;
  // Load config and resolve all stash sources
  const { loadConfig } = await import("../core/config.js");
  const config = loadConfig();
  // Ensure git stash caches are extracted before resolving stash dirs,
  // so their content directories exist on disk for the walker to discover.
  const { ensureSourceCaches, resolveSourceEntries } = await import("./search-source.js");
- await ensureSourceCaches(config);
+ await ensureSourceCaches(config, { force: options?.full === true });
  const allSourceEntries = resolveSourceEntries(stashDir, config);
  const allSourceDirs = allSourceEntries.map((s) => s.path);
  const t0 = Date.now();
@@ -44,13 +51,11 @@ export async function akmIndex(options) {
  sourcesCount: allSourceDirs.length,
  semanticSearchMode: config.semanticSearchMode,
  embeddingProvider: getEmbeddingProvider(config.embedding),
- // Surface "llm enabled" only when at least one pass would actually
- // run. Today that means the enrichment pass; future passes plug in
- // via `resolveIndexPassLLM`.
- llmEnabled: !!resolveIndexPassLLM("enrichment", config),
+ llmEnabled: enrich && !!resolveIndexPassLLM("enrichment", config),
  vecAvailable: isVecAvailable(db),
  }),
  });
+ let hadRemovedSources = false;
  if (options?.full || !isIncremental) {
  // The delete is now merged into the insert transaction inside
  // indexEntries() so that a reader never sees an empty database between
@@ -77,31 +82,39 @@ export async function akmIndex(options) {
  const currentSet = new Set(allSourceDirs);
  for (const dir of prevStashDirs) {
  if (!currentSet.has(dir)) {
+ hadRemovedSources = true;
  deleteEntriesByStashDir(db, dir);
+ deleteIndexDirStatesByStashDir(db, dir);
  }
  }
  }
  }
- // Memory inference pass (#201). Runs before the walk so any atomic-fact
+ throwIfAborted(signal);
+ // Memory inference pass (#201). Runs before the walk so any derived-memory
  // children that get written are picked up by the walker in this same run
  // and don't have to wait for the next `akm index`. Gated entirely by
  // `resolveIndexPassLLM("memory", config)` — when the user has no
  // `akm.llm` block or has set `index.memory.llm = false`, this is a no-op
  // and existing inferred children are left in place.
- try {
- const inferenceResult = await runMemoryInferencePass(config, allSourceEntries);
- if (inferenceResult.writtenFacts > 0) {
- onProgress({
- phase: "llm",
- message: `Memory inference wrote ${inferenceResult.writtenFacts} atomic fact${inferenceResult.writtenFacts === 1 ? "" : "s"} from ${inferenceResult.splitParents} parent memor${inferenceResult.splitParents === 1 ? "y" : "ies"}.`,
- });
+ if (enrich) {
+ try {
+ const inferenceResult = await runMemoryInferencePass(config, allSourceEntries, signal);
+ if (inferenceResult.writtenFacts > 0) {
+ onProgress({
+ phase: "llm",
+ message: `Memory inference wrote ${inferenceResult.writtenFacts} derived memor${inferenceResult.writtenFacts === 1 ? "y" : "ies"} from ${inferenceResult.splitParents} parent memor${inferenceResult.splitParents === 1 ? "y" : "ies"}.`,
+ });
+ }
+ }
+ catch (err) {
+ warn(`Memory inference pass aborted: ${err instanceof Error ? err.message : String(err)}`);
  }
  }
- catch (err) {
- // Defensive — runMemoryInferencePass swallows per-memory failures.
- // A thrown error here would only come from an unexpected programming
- // bug; surface it as a warning rather than aborting the index run.
- warn(`Memory inference pass aborted: ${err instanceof Error ? err.message : String(err)}`);
+ else {
+ onProgress({
+ phase: "llm",
+ message: "LLM passes disabled; rerun with --enrich to enable inference and enrichment.",
+ });
  }
  // Graph extraction pass (#207). Runs after memory inference so any
  // atomic-fact children that just got written are visible to the graph
@@ -113,24 +126,27 @@ export async function akmIndex(options) {
  // `llm.features.graph_extraction` feature flag or the per-pass
  // `index.graph.llm` toggle) is off; the existing graph file is
  // preserved on disk in that case.
- try {
- const graphResult = await runGraphExtractionPass(config, allSourceEntries);
- if (graphResult.written) {
- onProgress({
- phase: "llm",
- message: `Graph extraction wrote ${graphResult.totalEntities} entit${graphResult.totalEntities === 1 ? "y" : "ies"} and ${graphResult.totalRelations} relation${graphResult.totalRelations === 1 ? "" : "s"} from ${graphResult.extracted} file${graphResult.extracted === 1 ? "" : "s"}.`,
- });
+ if (enrich) {
+ try {
+ const graphResult = await runGraphExtractionPass(config, allSourceEntries, signal);
+ if (graphResult.written) {
+ onProgress({
+ phase: "llm",
+ message: `Graph extraction wrote ${graphResult.totalEntities} entit${graphResult.totalEntities === 1 ? "y" : "ies"} and ${graphResult.totalRelations} relation${graphResult.totalRelations === 1 ? "" : "s"} from ${graphResult.extracted} file${graphResult.extracted === 1 ? "" : "s"}.`,
+ });
+ }
+ }
+ catch (err) {
+ warn(`Graph extraction pass aborted: ${err instanceof Error ? err.message : String(err)}`);
  }
  }
- catch (err) {
- warn(`Graph extraction pass aborted: ${err instanceof Error ? err.message : String(err)}`);
- }
+ throwIfAborted(signal);
  const tWalkStart = Date.now();
  // Walk stash dirs and index entries.
  // doFullDelete=true merges the wipe into the same transaction as the
  // inserts so readers never see an empty database mid-rebuild.
  const doFullDelete = options?.full || !isIncremental;
- const { scannedDirs, skippedDirs, generatedCount, dirsNeedingLlm, warnings } = await indexEntries(db, allSourceEntries, isIncremental, builtAtMs, doFullDelete);
+ const { scannedDirs, skippedDirs, generatedCount, dirsNeedingLlm, warnings } = await indexEntries(db, allSourceEntries, isIncremental, builtAtMs, hadRemovedSources, doFullDelete, onProgress);
  onProgress({
  phase: "scan",
  message: `Scanned ${scannedDirs} ${scannedDirs === 1 ? "directory" : "directories"} and skipped ${skippedDirs}.`,
@@ -150,15 +166,17 @@ export async function akmIndex(options) {
  }
  }
  const tWalkEnd = Date.now();
+ throwIfAborted(signal);
  // Enhance entries with LLM if configured
- await enhanceDirsWithLlm(db, config, dirsNeedingLlm);
+ await enhanceDirsWithLlm(db, config, dirsNeedingLlm, signal, enrich);
  onProgress({
  phase: "llm",
- message: resolveIndexPassLLM("enrichment", config)
+ message: enrich && resolveIndexPassLLM("enrichment", config)
  ? `LLM enhancement reviewed ${dirsNeedingLlm.length} ${dirsNeedingLlm.length === 1 ? "directory" : "directories"}.`
  : "LLM enhancement disabled.",
  });
  const tLlmEnd = Date.now();
+ throwIfAborted(signal);
  // Rebuild FTS after all inserts. Use incremental mode when this whole
  // index run is incremental — only entries touched by `upsertEntry`
  // since the last rebuild are re-indexed, instead of re-scanning every
@@ -200,6 +218,7 @@ export async function akmIndex(options) {
  catch {
  /* best-effort */
  }
+ throwIfAborted(signal);
  // Generate embeddings if semantic search is enabled
  const embeddingResult = await generateEmbeddingsForDb(db, config, onProgress);
  const tEmbedEnd = Date.now();
@@ -252,7 +271,7 @@ export async function akmIndex(options) {
  }
  }
  // ── Extracted helpers for indexing ────────────────────────────────────────────
- async function indexEntries(db, allSourceEntries, isIncremental, builtAtMs, doFullDelete = false) {
+ async function indexEntries(db, allSourceEntries, isIncremental, builtAtMs, hadRemovedSources, doFullDelete = false, onProgress) {
  let scannedDirs = 0;
  let skippedDirs = 0;
  let generatedCount = 0;
@@ -260,9 +279,29 @@ async function indexEntries(db, allSourceEntries, isIncremental, builtAtMs, doFu
  const seenPaths = new Set();
  const dirsNeedingLlm = [];
  const dirRecords = [];
+ let processedDirs = 0;
+ let priorDirsChanged = hadRemovedSources;
+ const reportScanProgress = (message) => {
+ onProgress?.({
+ phase: "scan",
+ message,
+ processed: processedDirs,
+ total: allSourceEntries.length,
+ });
+ };
+ const reportDirDecision = (kind, dirPath, currentStashDir, reason, persistedRowCount) => {
+ if (!isVerbose())
+ return;
+ const detail = reason.detail ? ` (${reason.detail})` : "";
+ const rowInfo = persistedRowCount !== undefined ? `; previous rows=${persistedRowCount}` : "";
+ reportScanProgress(`${kind === "scan" ? "Rescanning" : "Skipping"} ${path.relative(currentStashDir, dirPath) || "."} ` +
+ `from ${currentStashDir}: ${reason.kind}${detail}${rowInfo}`);
+ };
  for (const sourceAdded of allSourceEntries) {
  const currentStashDir = sourceAdded.path;
  const fileContexts = walkStashFlat(currentStashDir);
+ processedDirs++;
+ reportScanProgress(`Processed ${processedDirs}/${allSourceEntries.length} source${allSourceEntries.length === 1 ? "" : "s"}.`);
  // Wiki-root stashes: all .md files are indexed as wiki pages under wikiName
  if (sourceAdded.wikiName) {
  const wikiName = sourceAdded.wikiName;
@@ -273,13 +312,17 @@ async function indexEntries(db, allSourceEntries, isIncremental, builtAtMs, doFu
  if (!shouldIndexStashFile(currentStashDir, ctx.absPath, { treatStashRootAsWikiRoot: true }))
  continue;
  const relNoExt = ctx.relPath.replace(/\.md$/, "");
+ const frontmatter = ctx.frontmatter() ?? {};
  const entry = {
  name: `${wikiName}/${relNoExt}`,
  type: "wiki",
  filename: ctx.fileName,
- description: ctx.frontmatter()?.description,
- source: "frontmatter",
+ quality: "generated",
+ confidence: 0.55,
+ source: "filename",
  };
+ applyCuratedFrontmatter(entry, frontmatter);
+ applyWikiFrontmatter(entry, frontmatter);
  const dir = ctx.parentDirAbs;
  const group = wikiDirGroups.get(dir);
  if (group) {
@@ -292,12 +335,32 @@ async function indexEntries(db, allSourceEntries, isIncremental, builtAtMs, doFu
  }
  for (const [dirPath, { files, entries }] of wikiDirGroups) {
  if (seenPaths.has(path.resolve(dirPath))) {
- dirRecords.push({ dirPath, currentStashDir, files, stash: null, skip: true });
+ const reason = { kind: "duplicate-dir" };
+ dirRecords.push({ dirPath, currentStashDir, files, stash: null, skip: true, reason });
+ reportDirDecision("skip", dirPath, currentStashDir, reason);
  continue;
  }
  seenPaths.add(path.resolve(dirPath));
+ const previousState = getDirIndexState(db, dirPath, files, builtAtMs);
+ if (isIncremental && !previousState.stale && canUseIncrementalSkip(previousState, priorDirsChanged)) {
+ skippedDirs++;
+ dirRecords.push({ dirPath, currentStashDir, files, stash: null, skip: true, reason: previousState.reason });
+ reportDirDecision("skip", dirPath, currentStashDir, previousState.reason, previousState.persistedRowCount);
+ continue;
+ }
  scannedDirs++;
- dirRecords.push({ dirPath, currentStashDir, files, stash: { entries }, skip: false });
+ priorDirsChanged = true;
+ const reason = isIncremental ? previousState.reason : { kind: "full-rebuild" };
+ dirRecords.push({
+ dirPath,
+ currentStashDir,
+ files,
+ stash: { entries },
+ skip: false,
+ reason,
+ persistedRowCount: previousState.persistedRowCount,
+ });
+ reportDirDecision("scan", dirPath, currentStashDir, reason, previousState.persistedRowCount);
  }
  continue;
  }
@@ -311,46 +374,70 @@ async function indexEntries(db, allSourceEntries, isIncremental, builtAtMs, doFu
  dirGroups.set(dir, [ctx.absPath]);
  }
  for (const [dirPath, files] of dirGroups) {
+ const indexableFiles = files.filter((file) => shouldIndexStashFile(currentStashDir, file));
  if (seenPaths.has(path.resolve(dirPath))) {
- dirRecords.push({ dirPath, currentStashDir, files, stash: null, skip: true });
+ const reason = { kind: "duplicate-dir" };
+ dirRecords.push({ dirPath, currentStashDir, files: indexableFiles, stash: null, skip: true, reason });
+ reportDirDecision("skip", dirPath, currentStashDir, reason);
  continue;
  }
  seenPaths.add(path.resolve(dirPath));
- // Incremental: skip directories that haven't changed
- if (isIncremental) {
- const prevEntries = getEntriesByDir(db, dirPath);
- if (prevEntries.length > 0 && !isDirStale(dirPath, files, prevEntries, builtAtMs)) {
- skippedDirs++;
- dirRecords.push({ dirPath, currentStashDir, files, stash: null, skip: true });
- continue;
- }
+ if (indexableFiles.length === 0) {
+ skippedDirs++;
+ const reason = { kind: "no-indexable-files" };
+ dirRecords.push({ dirPath, currentStashDir, files: indexableFiles, stash: null, skip: true, reason });
+ reportDirDecision("skip", dirPath, currentStashDir, reason);
+ continue;
  }
- scannedDirs++;
- // Try loading existing .stash.json (user metadata overrides)
- let stash = loadStashFile(dirPath);
- if (stash) {
- const coveredFiles = new Set(stash.entries.map((e) => (e.filename ? path.basename(e.filename) : "")).filter((e) => !!e));
- const uncoveredFiles = files.filter((f) => !coveredFiles.has(path.basename(f)));
- if (uncoveredFiles.length > 0) {
- const generated = await generateMetadataFlat(currentStashDir, uncoveredFiles);
- if (generated.warnings?.length)
- warnings.push(...generated.warnings);
- if (generated.entries.length > 0) {
- stash = { entries: [...stash.entries, ...generated.entries] };
- generatedCount += generated.entries.length;
- }
- }
+ const cachedZeroRowState = isIncremental && getCachedZeroRowDirState(db, dirPath, indexableFiles, builtAtMs, priorDirsChanged);
+ if (cachedZeroRowState) {
+ skippedDirs++;
+ dirRecords.push({
+ dirPath,
+ currentStashDir,
+ files: indexableFiles,
+ stash: null,
+ skip: true,
+ reason: cachedZeroRowState.reason,
+ });
+ reportDirDecision("skip", dirPath, currentStashDir, cachedZeroRowState.reason, cachedZeroRowState.persistedRowCount);
+ continue;
  }
- if (!stash) {
- const generated = await generateMetadataFlat(currentStashDir, files);
- if (generated.warnings?.length)
- warnings.push(...generated.warnings);
- if (generated.entries.length > 0) {
- stash = { entries: generated.entries };
- generatedCount += generated.entries.length;
- }
+ const generated = await generateMetadataFlat(currentStashDir, indexableFiles);
+ if (generated.warnings?.length)
+ warnings.push(...generated.warnings);
+ const legacyOverrides = loadStashFile(dirPath, { requireFilename: true });
+ const { stash, staleFiles } = buildIndexedDirCandidate(dirPath, indexableFiles, generated, legacyOverrides);
+ if (generated.entries.length > 0) {
+ generatedCount += generated.entries.length;
  }
- dirRecords.push({ dirPath, currentStashDir, files, stash, skip: false });
+ const previousState = getDirIndexState(db, dirPath, staleFiles, builtAtMs);
+ if (isIncremental && !previousState.stale && canUseIncrementalSkip(previousState, priorDirsChanged)) {
+ skippedDirs++;
+ dirRecords.push({
+ dirPath,
+ currentStashDir,
+ files: staleFiles,
+ stash: null,
+ skip: true,
+ reason: previousState.reason,
+ });
+ reportDirDecision("skip", dirPath, currentStashDir, previousState.reason, previousState.persistedRowCount);
+ continue;
+ }
+ scannedDirs++;
+ priorDirsChanged = true;
+ const reason = isIncremental ? previousState.reason : { kind: "full-rebuild" };
+ dirRecords.push({
+ dirPath,
+ currentStashDir,
+ files: staleFiles,
+ stash,
+ skip: false,
+ reason,
+ persistedRowCount: previousState.persistedRowCount,
+ });
+ reportDirDecision("scan", dirPath, currentStashDir, reason, previousState.persistedRowCount);
  }
  }
  // Phase 2 (sync): write all pre-generated metadata inside a single transaction.
@@ -382,6 +469,7 @@ async function indexEntries(db, allSourceEntries, isIncremental, builtAtMs, doFu
  }
  db.exec("DELETE FROM entries_fts");
  db.exec("DELETE FROM utility_scores");
+ db.exec("DELETE FROM index_dir_state");
  // Detach usage_events from entries about to be deleted — null out entry_id
  // but keep entry_ref so events can be re-linked after entries are rebuilt.
  try {
@@ -392,32 +480,42 @@ async function indexEntries(db, allSourceEntries, isIncremental, builtAtMs, doFu
  }
  db.exec("DELETE FROM entries");
  }
- for (const { dirPath, currentStashDir, files, stash, skip } of dirRecords) {
- if (skip)
+ for (const { dirPath, currentStashDir, files, stash, skip, reason } of dirRecords) {
+ if (skip) {
+ if (reason?.kind === "unchanged") {
+ const fingerprint = computeDirFingerprint(dirPath, files);
+ upsertIndexDirState(db, {
+ dirPath,
+ fileSetHash: fingerprint.fileSetHash,
+ fileMtimeMaxMs: fingerprint.fileMtimeMaxMs,
+ reason: reason.kind,
+ });
+ }
  continue;
+ }
  // Delete old entries for this dir (will be re-inserted)
  deleteEntriesByDir(db, dirPath);
+ let persistedRows = 0;
+ let dedupedRows = 0;
  if (stash) {
- // Build a lookup for matching filename-less entries to actual files
- const fileBasenameMap = buildFileBasenameMap(files);
  for (const entry of stash.entries) {
- const entryPath = entry.filename
- ? path.join(dirPath, entry.filename)
- : matchEntryToFile(entry.name, fileBasenameMap, files);
+ const entryPath = entry.filename ? path.join(dirPath, entry.filename) : null;
  if (!entryPath)
  continue; // skip unresolvable entries
  if (!shouldIndexStashFile(currentStashDir, entryPath))
  continue;
  // Skip if a higher-priority stash root already indexed this asset
- const basename = path.basename(entryPath);
- const identityKey = `${entry.type}\0${basename}\0${entry.description ?? ""}`;
- if (indexedAssetIdentities.has(identityKey))
+ const identityKey = `${entry.type}\0${entry.name}`;
+ if (indexedAssetIdentities.has(identityKey)) {
+ dedupedRows++;
  continue;
+ }
  indexedAssetIdentities.add(identityKey);
  const entryKey = `${currentStashDir}:${entry.type}:${entry.name}`;
  const searchText = buildSearchText(entry);
  const entryWithSize = attachFileSize(entry, entryPath);
  const entryId = upsertEntry(db, entryKey, dirPath, entryPath, currentStashDir, entryWithSize, searchText);
+ persistedRows++;
  if (entry.type === "workflow") {
  const doc = takeWorkflowDocument(entry);
  if (doc) {
@@ -430,12 +528,121 @@ async function indexEntries(db, allSourceEntries, isIncremental, builtAtMs, doFu
  dirsNeedingLlm.push({ dirPath, files, currentStashDir, stash });
  }
  }
+ const fingerprint = computeDirFingerprint(dirPath, files);
+ const persistedReason = persistedRows === 0
+ ? inferZeroRowReason(stash, reason, warnings, dirPath, dedupedRows)
+ : reason?.kind === "full-rebuild"
+ ? "full-rebuild"
+ : (reason?.kind ?? "updated");
+ upsertIndexDirState(db, {
+ dirPath,
+ fileSetHash: fingerprint.fileSetHash,
+ fileMtimeMaxMs: fingerprint.fileMtimeMaxMs,
+ reason: persistedReason,
+ });
+ if (persistedRows === 0) {
+ warnVerbose(`[index] zero-row ${dirPath}: ${persistedReason}`);
+ }
  }
  });
  insertTransaction();
  return { scannedDirs, skippedDirs, generatedCount, warnings, dirsNeedingLlm };
  }
- async function enhanceDirsWithLlm(db, config, dirsNeedingLlm) {
+ function getDirIndexState(db, dirPath, files, builtAtMs) {
+ const prevEntries = getEntriesByDir(db, dirPath);
+ const fingerprint = computeDirFingerprint(dirPath, files);
+ if (prevEntries.length > 0) {
+ const staleReason = getDirStaleReason(dirPath, files, prevEntries, builtAtMs);
+ if (!staleReason) {
+ return { stale: false, reason: { kind: "unchanged" }, persistedRowCount: prevEntries.length };
+ }
+ return { stale: true, reason: staleReason, persistedRowCount: prevEntries.length };
+ }
+ const cachedState = getIndexDirState(db, dirPath);
+ if (cachedState &&
+ cachedState.fileSetHash === fingerprint.fileSetHash &&
+ cachedState.fileMtimeMaxMs === fingerprint.fileMtimeMaxMs) {
+ return {
+ stale: false,
+ reason: { kind: "cached-zero-row-state", detail: cachedState.reason },
+ persistedRowCount: 0,
+ };
+ }
+ return {
+ stale: true,
+ reason: { kind: "no-previous-rows", detail: cachedState ? `cached=${cachedState.reason}` : undefined },
+ persistedRowCount: 0,
+ };
+ }
+ function getCachedZeroRowDirState(db, dirPath, files, builtAtMs, priorDirsChanged) {
+ const state = getDirIndexState(db, dirPath, files, builtAtMs);
+ if (state.stale || state.reason.kind !== "cached-zero-row-state")
+ return undefined;
+ if (!canUseIncrementalSkip(state, priorDirsChanged))
+ return undefined;
+ return state;
+ }
+ function canUseIncrementalSkip(state, priorDirsChanged) {
+ return !(priorDirsChanged &&
+ state.reason.kind === "cached-zero-row-state" &&
+ state.reason.detail === "deduped-zero-row");
+ }
+ function computeDirFingerprint(_dirPath, files) {
+ const normalizedFiles = [...new Set(files.map((file) => path.basename(file)))].sort();
+ let fileMtimeMaxMs = 0;
+ for (const file of files) {
+ try {
+ fileMtimeMaxMs = Math.max(fileMtimeMaxMs, fs.statSync(file).mtimeMs);
+ }
+ catch {
+ fileMtimeMaxMs = Number.POSITIVE_INFINITY;
+ break;
+ }
+ }
+ return {
+ fileSetHash: normalizedFiles.join("\0"),
+ fileMtimeMaxMs,
+ };
+ }
+ function getDirStaleReason(_dirPath, currentFiles, previousEntries, builtAtMs) {
+ const prevFileNames = new Set(previousEntries
+ .map((ie) => {
+ const fromPath = path.basename(ie.filePath);
+ return fromPath || ie.entry.filename;
+ })
+ .filter((e) => !!e));
+ const currFileNames = new Set(currentFiles.map((f) => path.basename(f)));
+ if (prevFileNames.size !== currFileNames.size) {
+ return { kind: "file-set-changed", detail: `${prevFileNames.size} -> ${currFileNames.size} files` };
+ }
+ for (const name of currFileNames) {
+ if (!prevFileNames.has(name))
+ return { kind: "file-set-changed", detail: name };
+ }
+ for (const file of currentFiles) {
+ try {
+ if (fs.statSync(file).mtimeMs > builtAtMs)
+ return { kind: "mtime-changed", detail: path.basename(file) };
+ }
+ catch {
+ return { kind: "missing-file", detail: path.basename(file) };
+ }
+ }
+ return undefined;
+ }
+ function inferZeroRowReason(stash, priorReason, warnings, dirPath, dedupedRows) {
+ if (dedupedRows > 0)
+ return "deduped-zero-row";
+ const workflowNoise = warnings.some((warning) => warning.startsWith("Skipped workflow ") && warning.includes(dirPath));
+ if (workflowNoise)
+ return "workflow-noise";
+ if (!stash || stash.entries.length === 0)
+ return "empty-generated-set";
+ return `zero-row:${priorReason?.kind ?? "unknown"}`;
+ }
+ async function enhanceDirsWithLlm(db, config, dirsNeedingLlm, signal, enrich = false) {
+ if (!enrich)
+ return;
  // Resolve per-pass LLM config via the unified shim. Returns undefined when
  // either no `akm.llm` is configured or the user opted this pass out via
  // `index.enrichment.llm = false`. (#208)
@@ -447,12 +654,13 @@ async function enhanceDirsWithLlm(db, config, dirsNeedingLlm) {
  // and leaving the user wondering why nothing got enhanced.
  const summary = { attempted: 0, succeeded: 0, failureSamples: [] };
  for (const { dirPath, files, currentStashDir, stash: originalStash } of dirsNeedingLlm) {
+ throwIfAborted(signal);
  // Only enhance generated entries; user-provided overrides should not be overwritten
  const generatedEntries = originalStash.entries.filter((e) => e.quality === "generated");
  if (generatedEntries.length === 0)
  continue;
  const generatedStash = { entries: generatedEntries };
- const enhanced = await enhanceStashWithLlm(llmConfig, generatedStash, files, summary);
+ const enhanced = await enhanceStashWithLlm(llmConfig, generatedStash, files, summary, signal);
  // Re-upsert the enhanced entries in a single transaction so a crash
  // cannot leave half the entries updated and the rest stale.
  db.transaction(() => {
@@ -475,7 +683,8 @@ async function enhanceDirsWithLlm(db, config, dirsNeedingLlm) {
  warn(`LLM enhancement failed for ${failed}/${summary.attempted} entries — they were left un-enhanced.${sample}`);
  }
  }
- async function generateEmbeddingsForDb(db, config, onProgress) {
+ async function generateEmbeddingsForDb(db, config, onProgress, signal) {
+ throwIfAborted(signal);
  if (config.semanticSearchMode === "off") {
  onProgress({ phase: "embeddings", message: "Semantic search disabled; skipping embeddings." });
  return { success: false, reason: "index-missing", message: "Semantic search is disabled." };
@@ -504,6 +713,7 @@ async function generateEmbeddingsForDb(db, config, onProgress) {
  try {
  const { embedBatch } = await import("../llm/embedder.js");
  const { estimateTokenCount } = await import("../llm/embedders/remote.js");
+ throwIfAborted(signal);
  const allEntries = getAllEntriesForEmbedding(db);
  if (allEntries.length === 0) {
  onProgress({ phase: "embeddings", message: "Embeddings already up to date." });
@@ -528,7 +738,8 @@ async function generateEmbeddingsForDb(db, config, onProgress) {
  warnVerbose(`[embed] ${ref} (${chars} chars, est. ${tokens} tokens) → batch ${batchNum}/${totalBatches}`);
  }
  }
- const embeddings = await embedBatch(texts, config.embedding);
+ const embeddings = await embedBatch(texts, config.embedding, signal);
+ throwIfAborted(signal);
  // Wrap all embedding upserts in a single transaction so partial
  // state is rolled back on failure rather than leaving the table half-filled.
  db.transaction(() => {
@@ -668,41 +879,31 @@ function verifyIndexState(db, config, totalEntries, embeddingResult) {
  vecAvailable,
  };
  }
- function isDirStale(dirPath, currentFiles, previousEntries, builtAtMs) {
- // Check if file set changed (additions or deletions)
- const prevFileNames = new Set(previousEntries.map((ie) => ie.entry.filename).filter((e) => !!e));
- const currFileNames = new Set(currentFiles.map((f) => path.basename(f)));
- if (prevFileNames.size !== currFileNames.size)
- return true;
- for (const name of currFileNames) {
- if (!prevFileNames.has(name))
- return true;
- }
- // Check modification times of current files
- for (const file of currentFiles) {
- try {
- if (fs.statSync(file).mtimeMs > builtAtMs)
- return true;
- }
- catch {
- return true;
- }
- }
- // Check .stash.json modification time
- const stashPath = path.join(dirPath, ".stash.json");
- try {
- if (fs.statSync(stashPath).mtimeMs > builtAtMs)
- return true;
- }
- catch {
- // file doesn't exist, not stale
+ function buildIndexedDirCandidate(dirPath, indexableFiles, generated, legacyOverrides) {
+ const mergedEntries = legacyOverrides
+ ? generated.entries.map((entry) => mergeLegacyEntry(entry, legacyOverrides.entries))
+ : generated.entries;
+ const stash = mergedEntries.length > 0 ? { entries: mergedEntries } : legacyOverrides;
+ const staleFiles = stash ? resolveIndexedFiles(dirPath, indexableFiles, stash) : indexableFiles;
+ return { stash, staleFiles };
+ }
+ function resolveIndexedFiles(dirPath, files, stash) {
+ const fileBasenameMap = buildFileBasenameMap(files);
+ const resolved = new Set();
+ for (const entry of stash.entries) {
+ const entryPath = entry.filename
+ ? path.join(dirPath, entry.filename)
+ : matchEntryToFile(entry.name, fileBasenameMap, files);
+ if (entryPath)
+ resolved.add(entryPath);
  }
- return false;
+ return resolved.size > 0 ? [...resolved] : files;
  }
- async function enhanceStashWithLlm(llmConfig, stash, files, summary) {
+ async function enhanceStashWithLlm(llmConfig, stash, files, summary, signal) {
  const { enhanceMetadata } = await import("../llm/metadata-enhance");
  const enhanced = [];
  for (const entry of stash.entries) {
+ throwIfAborted(signal);
  summary.attempted++;
  try {
  const entryFile = entry.filename
@@ -717,7 +918,7 @@ async function enhanceStashWithLlm(llmConfig, stash, files, summary) {
  /* ignore unreadable files */
  }
  }
- const improvements = await enhanceMetadata(llmConfig, entry, fileContent);
+ const improvements = await enhanceMetadata(llmConfig, entry, fileContent, signal);
  const updated = { ...entry };
  if (improvements.description)
  updated.description = improvements.description;
@@ -760,9 +961,9 @@ export function buildFileBasenameMap(files) {
  * 1. Exact basename match: entry.name === filename without extension
  * 2. Last path segment match: for entries with names like "dir/sub-entry",
  * try matching the last segment
- * 3. Fallback: first file in the directory, or null if no files are available
+ * 3. No implicit file fallback: ambiguous legacy entries are skipped
  */
- export function matchEntryToFile(entryName, fileMap, files) {
+ export function matchEntryToFile(entryName, fileMap, _files) {
  // Exact match on entry name
  const exact = fileMap.get(entryName);
  if (exact)
@@ -774,8 +975,20 @@ export function matchEntryToFile(entryName, fileMap, files) {
  if (segmentMatch)
  return segmentMatch;
  }
- // Fallback to first file, or null if no files are available
- return files[0] || null;
+ return null;
+ }
+ function mergeLegacyEntry(entry, legacyEntries) {
+ const legacy = legacyEntries.find((candidate) => candidate.filename === entry.filename);
+ if (!legacy)
+ return entry;
+ return {
+ ...entry,
+ ...legacy,
+ filename: entry.filename,
+ source: legacy.source ?? entry.source,
+ quality: legacy.quality ?? entry.quality,
+ confidence: legacy.confidence ?? entry.confidence,
+ };
  }
  /**
  * Look up a single asset by ref. Spec §6.2 — `akm show` queries this and
@@ -803,7 +1016,7 @@ export async function lookup(ref) {
  if (sources.length === 0)
  return null;
  const dbPath = getDbPath();
- const db = openDatabase(dbPath);
+ const db = openExistingDatabase(dbPath);
  try {
  // entry_key shape: `${stashDir}:${type}:${name}`. Suffix-match on
  // `:type:name` so we can scope by source dir as a prefix when origin is
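The diff above shows `akmIndex(options)` growing two new options: `signal`, checked between phases via the new `throwIfAborted(signal)` helper, and `enrich`, which gates all LLM passes. A minimal caller sketch of that option shape follows; the import path and the SIGINT wiring are assumptions for illustration, not an official akm-cli API example.

```javascript
// Hypothetical caller sketch based on the options read inside akmIndex()
// in this diff (stashDir, full, enrich, signal, onProgress). The import
// path below is assumed and may differ in the published package.
import { akmIndex } from "akm-cli/dist/indexer/indexer.js";

const controller = new AbortController();
// On Ctrl+C, abort the run; throwIfAborted() rethrows signal.reason.
process.on("SIGINT", () => controller.abort(new Error("index interrupted")));

try {
  await akmIndex({
    full: false,              // incremental run; true forces a full rebuild
    enrich: true,             // without this, inference/enrichment passes are skipped
    signal: controller.signal,
    onProgress: ({ phase, message, processed, total }) => {
      // processed/total are only populated for "scan" progress events
      const counter = processed !== undefined ? ` [${processed}/${total}]` : "";
      console.log(`[${phase}]${counter} ${message}`);
    },
  });
} catch (err) {
  console.error("Index run stopped:", err instanceof Error ? err.message : err);
}
```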