akm-cli 0.9.0-beta.52 → 0.9.0-beta.54
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/assets/hints/cli-hints-full.md +6 -5
- package/dist/cli/clack.js +56 -0
- package/dist/cli/confirm.js +1 -1
- package/dist/cli.js +0 -7
- package/dist/commands/env/env-cli.js +3 -2
- package/dist/commands/env/env.js +14 -67
- package/dist/commands/health/checks.js +28 -15
- package/dist/commands/health/html-report.js +33 -10
- package/dist/commands/health.js +222 -22
- package/dist/commands/improve/collapse-detector.js +419 -0
- package/dist/commands/improve/consolidate.js +72 -54
- package/dist/commands/improve/distill.js +79 -13
- package/dist/commands/improve/extract.js +13 -6
- package/dist/commands/improve/homeostatic.js +109 -79
- package/dist/commands/improve/improve-cli.js +67 -1
- package/dist/commands/improve/improve.js +10 -0
- package/dist/commands/improve/loop-stages.js +39 -1
- package/dist/commands/improve/outcome-loop.js +33 -19
- package/dist/commands/improve/preparation.js +36 -11
- package/dist/commands/improve/salience.js +49 -32
- package/dist/commands/read/curate.js +9 -13
- package/dist/commands/read/knowledge.js +4 -0
- package/dist/commands/read/search-cli.js +6 -4
- package/dist/commands/read/search.js +12 -5
- package/dist/commands/read/show.js +6 -8
- package/dist/commands/sources/add-cli.js +1 -1
- package/dist/commands/sources/init.js +12 -0
- package/dist/commands/sources/stash-cli.js +1 -1
- package/dist/commands/tasks/default-tasks.js +12 -0
- package/dist/core/asset/asset-spec.js +3 -2
- package/dist/core/config/config-schema.js +39 -17
- package/dist/core/config/config.js +12 -0
- package/dist/core/eval/rank-metrics.js +113 -0
- package/dist/core/state/migrations.js +56 -0
- package/dist/core/state-db.js +146 -19
- package/dist/core/warn.js +21 -0
- package/dist/indexer/db/db.js +6 -0
- package/dist/indexer/ensure-index.js +36 -92
- package/dist/indexer/index-writer-lock.js +9 -11
- package/dist/indexer/index-written-assets.js +105 -0
- package/dist/indexer/indexer.js +16 -4
- package/dist/indexer/passes/metadata.js +20 -0
- package/dist/indexer/read-preflight.js +23 -0
- package/dist/indexer/search/db-search.js +29 -1
- package/dist/indexer/search/ranking-contributors.js +33 -1
- package/dist/indexer/search/ranking.js +66 -0
- package/dist/indexer/search/search-fields.js +6 -0
- package/dist/indexer/walk/walker.js +21 -13
- package/dist/integrations/agent/detect.js +9 -0
- package/dist/integrations/agent/index.js +1 -1
- package/dist/llm/client.js +12 -0
- package/dist/llm/embedder.js +26 -2
- package/dist/llm/embedders/local.js +7 -1
- package/dist/llm/feature-gate.js +6 -2
- package/dist/output/renderers.js +8 -13
- package/dist/output/shapes/helpers.js +0 -3
- package/dist/output/shapes/passthrough.js +1 -0
- package/dist/scripts/migrate-storage.js +178 -35
- package/dist/scripts/migrations/import-fs-improve-runs-to-db.js +46 -19
- package/dist/setup/detect.js +9 -0
- package/dist/setup/registry-stash-loader.js +12 -0
- package/dist/setup/setup.js +1 -1
- package/dist/storage/repositories/index-db.js +10 -1
- package/dist/tasks/backends/index.js +9 -0
- package/dist/tasks/runner.js +9 -0
- package/package.json +2 -4
|
@@ -2,23 +2,29 @@
|
|
|
2
2
|
// License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
3
3
|
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
|
4
4
|
/**
|
|
5
|
-
* Auto-index: silently
|
|
6
|
-
*
|
|
7
|
-
*
|
|
8
|
-
* `akm index
|
|
5
|
+
* Auto-index bootstrap: silently build the local index inline when it cannot
|
|
6
|
+
* serve the caller's stash at all (missing DB, no `entries` table, zero rows,
|
|
7
|
+
* or built for a different stash), so `search`, `show`, and `feedback` work
|
|
8
|
+
* on first use without a manual `akm index`.
|
|
9
9
|
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
10
|
+
* Content FRESHNESS is intentionally not this module's job on the read path.
|
|
11
|
+
* Writers maintain the index (`indexWrittenAssets` for `remember`/extract
|
|
12
|
+
* session assets; the mutation commands run `akmIndex()` themselves), and the
|
|
13
|
+
* improve cron / explicit `akm index` do full refreshes. Reads serve whatever
|
|
14
|
+
* populated index exists. The previous design — a staleness walk plus a
|
|
15
|
+
* detached background reindex per read — made every read on an actively
|
|
16
|
+
* written stash spawn a writer that the read's own telemetry then queued
|
|
17
|
+
* behind (see docs/design/read-path-reindex-contention-findings.md).
|
|
18
|
+
*
|
|
19
|
+
* `mode: "blocking"` (improve) still checks staleness and rebuilds inline,
|
|
20
|
+
* because its planning logic needs a current `entries` table in-process.
|
|
13
21
|
*/
|
|
14
|
-
import { spawn } from "node:child_process";
|
|
15
22
|
import fs from "node:fs";
|
|
16
23
|
import path from "node:path";
|
|
17
24
|
import { ASSET_SPECS, TYPE_DIRS } from "../core/asset/asset-spec.js";
|
|
18
|
-
import {
|
|
25
|
+
import { getDbPath } from "../core/paths.js";
|
|
19
26
|
import { warn } from "../core/warn.js";
|
|
20
27
|
import { closeDatabase, getEntryCount, getIndexedFilePaths, getMeta, openExistingDatabase } from "./db/db.js";
|
|
21
|
-
import { acquireIndexWriterLease, handoffIndexWriterLeaseToPid } from "./index-writer-lock.js";
|
|
22
28
|
function getIndexableFiles(root, spec) {
|
|
23
29
|
if (!fs.existsSync(root))
|
|
24
30
|
return [];
|
|
@@ -138,12 +144,9 @@ export function isIndexStale(stashDir) {
|
|
|
138
144
|
* i.e. the DB file exists, the `entries` table holds rows, and those rows were
|
|
139
145
|
* built for this stash (it is the stored primary stash or appears in the
|
|
140
146
|
* stored `stashDirs` set). When this is true the index is at worst
|
|
141
|
-
* content-stale, so
|
|
142
|
-
*
|
|
143
|
-
*
|
|
144
|
-
* table, zero rows, or built for a different stash), so a background reindex
|
|
145
|
-
* would leave the caller empty until the next read — those cases must rebuild
|
|
146
|
-
* inline.
|
|
147
|
+
* content-stale, so read paths serve it as-is. When it is false the existing
|
|
148
|
+
* index has nothing relevant to return (no DB, no `entries` table, zero rows,
|
|
149
|
+
* or built for a different stash), so those cases must rebuild inline.
|
|
147
150
|
*/
|
|
148
151
|
function indexCanServeStash(stashDir) {
|
|
149
152
|
const dbPath = getDbPath();
|
|
@@ -174,43 +177,6 @@ function indexCanServeStash(stashDir) {
|
|
|
174
177
|
closeDatabase(db);
|
|
175
178
|
}
|
|
176
179
|
}
|
|
177
|
-
/**
|
|
178
|
-
* Spawn a background `akm index` process. Non-blocking — returns immediately.
|
|
179
|
-
* Background callers share the same global index-writer lease as foreground
|
|
180
|
-
* writers, so stale-read-triggered auto-index attempts coalesce safely.
|
|
181
|
-
*/
|
|
182
|
-
async function spawnBackgroundReindex(_stashDir) {
|
|
183
|
-
const dataDir = getDataDir();
|
|
184
|
-
const logFile = path.join(dataDir, "logs", "index-background.log");
|
|
185
|
-
fs.mkdirSync(path.dirname(logFile), { recursive: true });
|
|
186
|
-
const lease = await acquireIndexWriterLease({ mode: "try", purpose: "background-reindex-spawn" });
|
|
187
|
-
if (!lease)
|
|
188
|
-
return;
|
|
189
|
-
const akmBin = process.argv[0];
|
|
190
|
-
const akmScript = process.argv[1];
|
|
191
|
-
try {
|
|
192
|
-
const child = spawn(akmBin, [akmScript, "index", "--background"], {
|
|
193
|
-
detached: true,
|
|
194
|
-
stdio: ["ignore", fs.openSync(logFile, "a"), fs.openSync(logFile, "a")],
|
|
195
|
-
env: { ...process.env },
|
|
196
|
-
});
|
|
197
|
-
if (!child.pid) {
|
|
198
|
-
lease.release();
|
|
199
|
-
return;
|
|
200
|
-
}
|
|
201
|
-
handoffIndexWriterLeaseToPid(lease, child.pid, "background-reindex");
|
|
202
|
-
try {
|
|
203
|
-
child.unref();
|
|
204
|
-
}
|
|
205
|
-
catch {
|
|
206
|
-
// ignore
|
|
207
|
-
}
|
|
208
|
-
}
|
|
209
|
-
catch (error) {
|
|
210
|
-
lease.release();
|
|
211
|
-
throw error;
|
|
212
|
-
}
|
|
213
|
-
}
|
|
214
180
|
async function runInlineReindex(stashDir) {
|
|
215
181
|
try {
|
|
216
182
|
const { akmIndex } = await import("./indexer.js");
|
|
@@ -219,53 +185,31 @@ async function runInlineReindex(stashDir) {
|
|
|
219
185
|
}
|
|
220
186
|
catch (error) {
|
|
221
187
|
warn("Auto-index failed, proceeding with existing index:", error instanceof Error ? error.message : String(error));
|
|
222
|
-
return
|
|
188
|
+
return false;
|
|
223
189
|
}
|
|
224
190
|
}
|
|
225
191
|
/**
|
|
226
|
-
* Ensure the local index exists and
|
|
192
|
+
* Ensure the local index exists and can serve the caller.
|
|
227
193
|
*
|
|
228
|
-
* Default mode is `background
|
|
229
|
-
*
|
|
230
|
-
*
|
|
231
|
-
*
|
|
232
|
-
* rows) the rebuild runs inline regardless of mode, since there is nothing to
|
|
233
|
-
* proceed against.
|
|
194
|
+
* Default mode is `background` — the read-path contract (`search`, `show`,
|
|
195
|
+
* `feedback`): a populated index built for this stash is served as-is (its
|
|
196
|
+
* freshness is the writers' job, see module doc); an unusable index rebuilds
|
|
197
|
+
* inline, since there is nothing to proceed against.
|
|
234
198
|
*
|
|
235
|
-
* `mode: "blocking"`
|
|
236
|
-
* this for callers like `improve` whose
|
|
237
|
-
* `entries` table in the same process.
|
|
199
|
+
* `mode: "blocking"` additionally treats content-staleness as a rebuild
|
|
200
|
+
* trigger and waits for it. Use this for callers like `improve` whose
|
|
201
|
+
* planning logic depends on a current `entries` table in the same process.
|
|
238
202
|
*
|
|
239
|
-
* Returns `true`
|
|
203
|
+
* Returns `true` only when an inline index run succeeds.
|
|
204
|
+
* A rebuild attempt that fails (throws) resolves to `false`.
|
|
240
205
|
*/
|
|
241
206
|
export async function ensureIndex(stashDir, options = {}) {
|
|
242
|
-
if (
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
// serve this stash (absent DB, no `entries` table, zero rows, or built for a
|
|
246
|
-
// different stash): a background reindex returns immediately and would leave
|
|
247
|
-
// a first-time caller (search, curate, wiki, show, feedback) with empty
|
|
248
|
-
// results. Building inline is a one-off cost; a populated index for this
|
|
249
|
-
// stash that is merely content-stale still refreshes in the background.
|
|
250
|
-
if (options.mode === "blocking" || !indexCanServeStash(stashDir)) {
|
|
251
|
-
return runInlineReindex(stashDir);
|
|
252
|
-
}
|
|
253
|
-
// The background path re-invokes the akm CLI as a detached child via
|
|
254
|
-
// `process.argv[1]`. That is only the akm entrypoint when THIS process is the
|
|
255
|
-
// akm CLI itself — which the CLI startup block signals with AKM_CLI_ENTRY=1.
|
|
256
|
-
// In any other host (the in-process test runner, a library embedding akm),
|
|
257
|
-
// argv[1] points at the host (e.g. the test runner), so spawning it would
|
|
258
|
-
// launch the wrong program and orphan it. Build inline there instead — same
|
|
259
|
-
// resulting index, no detached process.
|
|
260
|
-
if (process.env.AKM_CLI_ENTRY !== "1") {
|
|
207
|
+
if (options.mode === "blocking") {
|
|
208
|
+
if (!isIndexStale(stashDir))
|
|
209
|
+
return false;
|
|
261
210
|
return runInlineReindex(stashDir);
|
|
262
211
|
}
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
}
|
|
267
|
-
catch (error) {
|
|
268
|
-
warn("Background reindex spawn failed, proceeding with existing index:", error instanceof Error ? error.message : String(error));
|
|
269
|
-
return true;
|
|
270
|
-
}
|
|
212
|
+
if (indexCanServeStash(stashDir))
|
|
213
|
+
return false;
|
|
214
|
+
return runInlineReindex(stashDir);
|
|
271
215
|
}
|
|
@@ -7,6 +7,7 @@ import { probeLock, releaseLock, releaseLockIfOwned, tryAcquireLockSync } from "
|
|
|
7
7
|
import { getDbPath, getIndexWriterLockPath } from "../core/paths.js";
|
|
8
8
|
const INDEX_WRITER_LOCK_STALE_AFTER_MS = 12 * 60 * 60 * 1000;
|
|
9
9
|
const INDEX_WRITER_WAIT_MS = 100;
|
|
10
|
+
const DEFAULT_INDEX_WRITER_MAX_WAIT_MS = 10 * 60 * 1000;
|
|
10
11
|
const heldLocks = new Map();
|
|
11
12
|
function buildPayload(purpose, pid = process.pid) {
|
|
12
13
|
return JSON.stringify({
|
|
@@ -46,16 +47,11 @@ function retainHeldLock(lockPath) {
|
|
|
46
47
|
heldLocks.set(lockPath, { depth: 1, exitHandler });
|
|
47
48
|
return { lockPath, release: () => releaseHeldLock(lockPath) };
|
|
48
49
|
}
|
|
49
|
-
function detachHeldLock(lockPath) {
|
|
50
|
-
const held = heldLocks.get(lockPath);
|
|
51
|
-
if (!held)
|
|
52
|
-
return;
|
|
53
|
-
heldLocks.delete(lockPath);
|
|
54
|
-
process.off("exit", held.exitHandler);
|
|
55
|
-
}
|
|
56
50
|
export async function acquireIndexWriterLease(options) {
|
|
57
51
|
const mode = options.mode ?? "wait";
|
|
58
52
|
const lockPath = getIndexWriterLockPath();
|
|
53
|
+
const startedAt = Date.now();
|
|
54
|
+
const maxWaitMs = options.maxWaitMs ?? DEFAULT_INDEX_WRITER_MAX_WAIT_MS;
|
|
59
55
|
fs.mkdirSync(path.dirname(lockPath), { recursive: true });
|
|
60
56
|
if (heldLocks.has(lockPath)) {
|
|
61
57
|
return retainHeldLock(lockPath);
|
|
@@ -75,6 +71,12 @@ export async function acquireIndexWriterLease(options) {
|
|
|
75
71
|
}
|
|
76
72
|
if (mode === "try")
|
|
77
73
|
return undefined;
|
|
74
|
+
// Held by another live process. Time out only *after* a real acquisition
|
|
75
|
+
// attempt, so a caller with maxWaitMs:0 still gets one chance at a free lock
|
|
76
|
+
// instead of throwing before it ever tries.
|
|
77
|
+
if (maxWaitMs >= 0 && Date.now() - startedAt >= maxWaitMs) {
|
|
78
|
+
throw new Error(`timed out waiting for index writer lease for ${options.purpose}`);
|
|
79
|
+
}
|
|
78
80
|
await delay(INDEX_WRITER_WAIT_MS);
|
|
79
81
|
}
|
|
80
82
|
}
|
|
@@ -90,10 +92,6 @@ export async function withIndexWriterLease(options, run) {
|
|
|
90
92
|
lease.release();
|
|
91
93
|
}
|
|
92
94
|
}
|
|
93
|
-
export function handoffIndexWriterLeaseToPid(lease, pid, purpose) {
|
|
94
|
-
fs.writeFileSync(lease.lockPath, buildPayload(purpose, pid), "utf8");
|
|
95
|
-
detachHeldLock(lease.lockPath);
|
|
96
|
-
}
|
|
97
95
|
export function probeIndexWriterLease() {
|
|
98
96
|
return probeLock(getIndexWriterLockPath(), { staleAfterMs: INDEX_WRITER_LOCK_STALE_AFTER_MS });
|
|
99
97
|
}
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
// This Source Code Form is subject to the terms of the Mozilla Public
|
|
2
|
+
// License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
3
|
+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
|
4
|
+
/**
|
|
5
|
+
* Write-path indexing: targeted single-file index updates for asset writers.
|
|
6
|
+
*
|
|
7
|
+
* The index is maintained eagerly by every first-class mutation command
|
|
8
|
+
* (`source add`, `wiki`, `workflow`, `setup` all run `akmIndex()` after
|
|
9
|
+
* writing). The memory write paths — `akm remember` / `writeMarkdownAsset`
|
|
10
|
+
* and extract's session assets — historically did not, which is why reads
|
|
11
|
+
* used to compensate with stale-triggered background reindexes (the
|
|
12
|
+
* lock-contention footgun removed alongside this module's introduction; see
|
|
13
|
+
* docs/design/read-path-reindex-contention-findings.md §7).
|
|
14
|
+
*
|
|
15
|
+
* This is NOT a general reindex. It upserts exactly the files the caller just
|
|
16
|
+
* wrote: frontmatter/metadata via the shared matcher pipeline, the `entries`
|
|
17
|
+
* row, and an incremental FTS refresh. Embeddings, index-time LLM passes,
|
|
18
|
+
* graph extraction, `builtAt`, and the per-dir walk cache are all deliberately
|
|
19
|
+
* untouched — the next full run heals them (the opportunistic-recovery
|
|
20
|
+
* strategy of docs/technical/index-consistency-adr.md).
|
|
21
|
+
*/
|
|
22
|
+
import fs from "node:fs";
|
|
23
|
+
import path from "node:path";
|
|
24
|
+
import { getDbPath } from "../core/paths.js";
|
|
25
|
+
import { warnVerbose } from "../core/warn.js";
|
|
26
|
+
import { closeDatabase, getEntryCount, openExistingDatabase, rebuildFts, upsertEntry } from "./db/db.js";
|
|
27
|
+
import { generateMetadataFlat } from "./passes/metadata.js";
|
|
28
|
+
import { buildSearchText } from "./search/search-fields.js";
|
|
29
|
+
/**
|
|
30
|
+
* Busy-timeout (ms) for write-path index upserts. A real write — unlike the
|
|
31
|
+
* 250ms telemetry inserts — but it must not hang `akm remember` for the full
|
|
32
|
+
* default 30s behind a running full reindex. When it times out, the upsert is
|
|
33
|
+
* skipped and the asset becomes searchable after that reindex instead.
|
|
34
|
+
*/
|
|
35
|
+
export const WRITE_PATH_INDEX_BUSY_TIMEOUT_MS = 5_000;
|
|
36
|
+
/**
|
|
37
|
+
* Index the given just-written asset files into the existing local index.
|
|
38
|
+
*
|
|
39
|
+
* FAIL-OPEN at every step: any error (index.db absent, empty, locked past the
|
|
40
|
+
* busy timeout, unparseable file) is reduced to a verbose-only warning and the
|
|
41
|
+
* write command succeeds untouched. The degraded outcome is exactly the
|
|
42
|
+
* pre-write-path-indexing behavior: the asset appears after the next full
|
|
43
|
+
* `akm index` / improve-cron run.
|
|
44
|
+
*
|
|
45
|
+
* An absent or empty index is skipped on purpose — bootstrap belongs to the
|
|
46
|
+
* first read (`ensureIndex`) or an explicit `akm index`, which also cover
|
|
47
|
+
* embeddings and the other passes this fast path skips.
|
|
48
|
+
*/
|
|
49
|
+
export async function indexWrittenAssets(stashDir, filePaths) {
|
|
50
|
+
try {
|
|
51
|
+
const dbPath = getDbPath();
|
|
52
|
+
if (!fs.existsSync(dbPath))
|
|
53
|
+
return;
|
|
54
|
+
// The full walk never descends into dot-directories (they hold state like
|
|
55
|
+
// `.meta/`, `.stash.json`), and `shouldIndexStashFile` relies on the walker
|
|
56
|
+
// for that — mirror it here so this fast path indexes exactly what a full
|
|
57
|
+
// run would.
|
|
58
|
+
const files = filePaths.filter((f) => {
|
|
59
|
+
if (!fs.existsSync(f))
|
|
60
|
+
return false;
|
|
61
|
+
const rel = path.relative(stashDir, f);
|
|
62
|
+
return !rel.split(/[\\/]+/).some((segment) => segment.startsWith("."));
|
|
63
|
+
});
|
|
64
|
+
if (files.length === 0)
|
|
65
|
+
return;
|
|
66
|
+
// Generate metadata BEFORE opening the DB so the write window stays
|
|
67
|
+
// short. One call per file keeps the entry↔path pairing exact.
|
|
68
|
+
const pairs = [];
|
|
69
|
+
for (const file of files) {
|
|
70
|
+
const generated = await generateMetadataFlat(stashDir, [file]);
|
|
71
|
+
const entry = generated.entries[0];
|
|
72
|
+
// Workflows carry a side-table document upsert this fast path doesn't
|
|
73
|
+
// do; no current caller writes them, but guard so one never lands
|
|
74
|
+
// half-indexed.
|
|
75
|
+
if (entry && entry.type !== "workflow")
|
|
76
|
+
pairs.push({ file, entry });
|
|
77
|
+
}
|
|
78
|
+
if (pairs.length === 0)
|
|
79
|
+
return;
|
|
80
|
+
const db = openExistingDatabase(dbPath);
|
|
81
|
+
try {
|
|
82
|
+
db.exec(`PRAGMA busy_timeout = ${WRITE_PATH_INDEX_BUSY_TIMEOUT_MS}`);
|
|
83
|
+
if (getEntryCount(db) === 0)
|
|
84
|
+
return;
|
|
85
|
+
for (const { file, entry } of pairs) {
|
|
86
|
+
const entryKey = `${stashDir}:${entry.type}:${entry.name}`;
|
|
87
|
+
let entryWithSize = entry;
|
|
88
|
+
try {
|
|
89
|
+
entryWithSize = { ...entry, fileSize: fs.statSync(file).size };
|
|
90
|
+
}
|
|
91
|
+
catch {
|
|
92
|
+
// stat raced a delete — index without the size, like the full walk does.
|
|
93
|
+
}
|
|
94
|
+
upsertEntry(db, entryKey, path.dirname(file), file, stashDir, entryWithSize, buildSearchText(entry));
|
|
95
|
+
}
|
|
96
|
+
rebuildFts(db, { incremental: true });
|
|
97
|
+
}
|
|
98
|
+
finally {
|
|
99
|
+
closeDatabase(db);
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
catch (error) {
|
|
103
|
+
warnVerbose("Write-path index update skipped (asset appears after the next full index):", error instanceof Error ? error.message : String(error));
|
|
104
|
+
}
|
|
105
|
+
}
|
package/dist/indexer/indexer.js
CHANGED
|
@@ -118,7 +118,7 @@ async function runWalkPhase(ctx) {
|
|
|
118
118
|
ctx.timing.tWalkEnd = Date.now();
|
|
119
119
|
throwIfAborted(signal);
|
|
120
120
|
// LLM enrichment for directories that need it
|
|
121
|
-
await enhanceDirsWithLlm(db, config, dirsNeedingLlm, onProgress, signal,
|
|
121
|
+
await enhanceDirsWithLlm(db, config, dirsNeedingLlm, onProgress, signal, reEnrich);
|
|
122
122
|
onProgress({
|
|
123
123
|
phase: "llm",
|
|
124
124
|
message: resolveIndexPassLLM("enrichment", config)
|
|
@@ -226,7 +226,19 @@ function runCleanPass(db, dryRun) {
|
|
|
226
226
|
};
|
|
227
227
|
}
|
|
228
228
|
// ── Indexer ──────────────────────────────────────────────────────────────────
|
|
229
|
+
// ── Test seam ────────────────────────────────────────────────────────────────
|
|
230
|
+
// Swap-and-restore override. Inert in production; only tests call the setter.
|
|
231
|
+
let akmIndexOverride;
|
|
232
|
+
/** TEST-ONLY. Swap the implementation of `akmIndex`; pass undefined to restore. */
|
|
233
|
+
export function _setAkmIndexForTests(fake) {
|
|
234
|
+
akmIndexOverride = fake;
|
|
235
|
+
}
|
|
229
236
|
export async function akmIndex(options) {
|
|
237
|
+
if (akmIndexOverride)
|
|
238
|
+
return akmIndexOverride(options);
|
|
239
|
+
return akmIndexReal(options);
|
|
240
|
+
}
|
|
241
|
+
async function akmIndexReal(options) {
|
|
230
242
|
return withIndexWriterLease({ purpose: "akm-index", signal: options?.signal }, async () => {
|
|
231
243
|
const stashDir = options?.stashDir || resolveStashDir();
|
|
232
244
|
const onProgress = options?.onProgress ?? (() => { });
|
|
@@ -640,7 +652,7 @@ async function indexEntries(db, allSourceEntries, isIncremental, builtAtMs, hadR
|
|
|
640
652
|
insertTransaction();
|
|
641
653
|
return { scannedDirs, skippedDirs, generatedCount, warnings, dirsNeedingLlm };
|
|
642
654
|
}
|
|
643
|
-
async function enhanceDirsWithLlm(db, config, dirsNeedingLlm, onProgress, signal,
|
|
655
|
+
async function enhanceDirsWithLlm(db, config, dirsNeedingLlm, onProgress, signal, reEnrich = false) {
|
|
644
656
|
// Resolve per-pass LLM config via the unified shim. Returns undefined when
|
|
645
657
|
// either no `akm.llm` is configured or the user opted this pass out via
|
|
646
658
|
// `index.enrichment.llm = false`. (#208)
|
|
@@ -977,7 +989,7 @@ function resolveIndexedFiles(dirPath, files, stash) {
|
|
|
977
989
|
for (const entry of stash.entries) {
|
|
978
990
|
const entryPath = entry.filename
|
|
979
991
|
? path.join(dirPath, entry.filename)
|
|
980
|
-
: matchEntryToFile(entry.name, fileBasenameMap
|
|
992
|
+
: matchEntryToFile(entry.name, fileBasenameMap);
|
|
981
993
|
if (entryPath)
|
|
982
994
|
resolved.add(entryPath);
|
|
983
995
|
}
|
|
@@ -1096,7 +1108,7 @@ export function buildFileBasenameMap(files) {
|
|
|
1096
1108
|
* try matching the last segment
|
|
1097
1109
|
* 3. No implicit file fallback: ambiguous legacy entries are skipped
|
|
1098
1110
|
*/
|
|
1099
|
-
export function matchEntryToFile(entryName, fileMap
|
|
1111
|
+
export function matchEntryToFile(entryName, fileMap) {
|
|
1100
1112
|
// Exact match on entry name
|
|
1101
1113
|
const exact = fileMap.get(entryName);
|
|
1102
1114
|
if (exact)
|
|
@@ -191,6 +191,15 @@ export function validateStashEntry(entry) {
|
|
|
191
191
|
const contradictedBy = normalizeNonEmptyStringList(e.contradictedBy);
|
|
192
192
|
if (contradictedBy)
|
|
193
193
|
result.contradictedBy = contradictedBy;
|
|
194
|
+
// R5 — consolidation provenance fields must survive the whitelist too, or
|
|
195
|
+
// stash.json-overridden merge products lose merge-following + generation
|
|
196
|
+
// counting in the collapse detector.
|
|
197
|
+
if (typeof e.generation === "number" && Number.isFinite(e.generation) && e.generation > 0) {
|
|
198
|
+
result.generation = Math.floor(e.generation);
|
|
199
|
+
}
|
|
200
|
+
const sourceRefs = normalizeNonEmptyStringList(e.sourceRefs);
|
|
201
|
+
if (sourceRefs)
|
|
202
|
+
result.sourceRefs = sourceRefs;
|
|
194
203
|
const currentBeliefRefs = normalizeNonEmptyStringList(e.currentBeliefRefs);
|
|
195
204
|
if (currentBeliefRefs)
|
|
196
205
|
result.currentBeliefRefs = currentBeliefRefs;
|
|
@@ -342,6 +351,17 @@ export function applyCuratedFrontmatter(entry, fmData) {
|
|
|
342
351
|
const contradictedBy = normalizeStringListOrUndefined(fmData.contradictedBy);
|
|
343
352
|
if (contradictedBy)
|
|
344
353
|
entry.contradictedBy = contradictedBy;
|
|
354
|
+
// R5 — consolidation provenance. `generation` (merge depth counter) and
|
|
355
|
+
// `source_refs` (merge/distill provenance pointers) are written by the
|
|
356
|
+
// improve pipeline; captured into the index so the collapse detector can
|
|
357
|
+
// count over-generation assets and follow merges without filesystem reads.
|
|
358
|
+
const generation = fmData.generation;
|
|
359
|
+
if (typeof generation === "number" && Number.isFinite(generation) && generation > 0) {
|
|
360
|
+
entry.generation = Math.floor(generation);
|
|
361
|
+
}
|
|
362
|
+
const sourceRefs = normalizeStringListOrUndefined(fmData.source_refs);
|
|
363
|
+
if (sourceRefs)
|
|
364
|
+
entry.sourceRefs = sourceRefs;
|
|
345
365
|
const currentBeliefRefs = normalizeStringListOrUndefined(fmData.currentBeliefRefs);
|
|
346
366
|
if (currentBeliefRefs)
|
|
347
367
|
entry.currentBeliefRefs = currentBeliefRefs;
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
// This Source Code Form is subject to the terms of the Mozilla Public
|
|
2
|
+
// License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
3
|
+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
|
4
|
+
import { ensureIndex } from "./ensure-index.js";
|
|
5
|
+
import { resolveSourceEntries } from "./search/search-source.js";
|
|
6
|
+
/** Resolve the active read sources using the same resolution rules as search/show. */
|
|
7
|
+
export function resolveReadSources(overrideStashDir, existingConfig) {
|
|
8
|
+
const sources = resolveSourceEntries(overrideStashDir, existingConfig);
|
|
9
|
+
return { sources, primarySource: sources[0] };
|
|
10
|
+
}
|
|
11
|
+
/** Ensure the primary source index is readable for reads, when a primary exists. */
|
|
12
|
+
export async function ensurePrimaryIndexForRead(primarySource) {
|
|
13
|
+
if (!primarySource?.path)
|
|
14
|
+
return false;
|
|
15
|
+
return ensureIndex(primarySource.path);
|
|
16
|
+
}
|
|
17
|
+
/**
|
|
18
|
+
* Convenience helper for callers that only need to ensure a read index from a
|
|
19
|
+
* configured stash path and default config.
|
|
20
|
+
*/
|
|
21
|
+
export async function ensurePrimaryIndexFromConfig(overrideStashDir, existingConfig) {
|
|
22
|
+
return ensurePrimaryIndexForRead(resolveReadSources(overrideStashDir, existingConfig).primarySource);
|
|
23
|
+
}
|
|
@@ -29,6 +29,29 @@ import { applyRankingRules, combineSearchScores, normalizeFtsScores } from "./ra
|
|
|
29
29
|
import { enrichSearchHit } from "./search-hit-enrichers.js";
|
|
30
30
|
import { buildEditHint, findSourceForPath, isEditable } from "./search-source.js";
|
|
31
31
|
import { deriveSemanticProviderFingerprint, getEffectiveSemanticStatus, isSemanticRuntimeReady, readSemanticStatus, } from "./semantic-status.js";
|
|
32
|
+
/**
|
|
33
|
+
* Age past which search surfaces a "run akm index" hint. Reads serve the
|
|
34
|
+
* existing index as-is (freshness is the writers' job — `indexWrittenAssets`
|
|
35
|
+
* plus full runs), so on installs with no improve cron a hand-edited or
|
|
36
|
+
* git-pulled file stays invisible until someone reindexes. The hint makes that
|
|
37
|
+
* actionable without re-introducing read-triggered reindexing.
|
|
38
|
+
*/
|
|
39
|
+
const STALE_INDEX_HINT_MS = 7 * 24 * 60 * 60 * 1000;
|
|
40
|
+
function buildStaleIndexHint(db) {
|
|
41
|
+
try {
|
|
42
|
+
const builtAt = getMeta(db, "builtAt");
|
|
43
|
+
if (!builtAt)
|
|
44
|
+
return undefined;
|
|
45
|
+
const ageMs = Date.now() - new Date(builtAt).getTime();
|
|
46
|
+
if (!Number.isFinite(ageMs) || ageMs < STALE_INDEX_HINT_MS)
|
|
47
|
+
return undefined;
|
|
48
|
+
const days = Math.floor(ageMs / (24 * 60 * 60 * 1000));
|
|
49
|
+
return `Search index was last built ${days} day(s) ago. Files added or edited outside akm since then are not searchable — run 'akm index' to refresh.`;
|
|
50
|
+
}
|
|
51
|
+
catch {
|
|
52
|
+
return undefined;
|
|
53
|
+
}
|
|
54
|
+
}
|
|
32
55
|
export function buildLocalAction(type, ref, registry = defaultRendererRegistry) {
|
|
33
56
|
return buildActionFromContributors({ type, ref }, defaultActionContributors(registry)) ?? `akm show ${ref}`;
|
|
34
57
|
}
|
|
@@ -95,7 +118,9 @@ export async function searchLocal(input) {
|
|
|
95
118
|
if (config.semanticSearchMode === "auto" && semanticStatus === "blocked") {
|
|
96
119
|
warnings.push("Semantic search is currently blocked. Using keyword search until the semantic backend is healthy again.");
|
|
97
120
|
}
|
|
98
|
-
//
|
|
121
|
+
// Bootstrap-only: builds the index inline when it cannot serve this stash.
|
|
122
|
+
// Content freshness is the writers' job (indexWrittenAssets + full runs);
|
|
123
|
+
// reads serve the existing index as-is.
|
|
99
124
|
await ensureIndex(stashDir);
|
|
100
125
|
const dbPath = getDbPath();
|
|
101
126
|
if (!fs.existsSync(dbPath)) {
|
|
@@ -117,6 +142,9 @@ export async function searchLocal(input) {
|
|
|
117
142
|
mode: "keyword",
|
|
118
143
|
};
|
|
119
144
|
}
|
|
145
|
+
const staleHint = buildStaleIndexHint(db);
|
|
146
|
+
if (staleHint)
|
|
147
|
+
warnings.push(staleHint);
|
|
120
148
|
const { hits, embedMs, rankMs } = await searchDatabase(db, query, searchType, limit, stashDir, allSourceDirs, config, sources, rendererRegistry, filters, includeProposed, beliefFilter, restrictToSources, includeExcludedTypes, disableProjectContext, disableScopedUtility);
|
|
121
149
|
return {
|
|
122
150
|
hits,
|
|
@@ -17,6 +17,14 @@ const TYPE_BOOST = {
|
|
|
17
17
|
const MAX_BOOST_SUM = 3.0;
|
|
18
18
|
const UTILITY_WEIGHT = 0.5;
|
|
19
19
|
const UTILITY_MAX_BOOST = 1.5;
|
|
20
|
+
/**
|
|
21
|
+
* R2 (docs/design/improve-self-learning-analysis.md) — weight of the improve
|
|
22
|
+
* loop's `asset_salience.rank_score` in user-facing ranking. Bounded well
|
|
23
|
+
* below the utility boost so the composed signal refines, never dominates,
|
|
24
|
+
* lexical/semantic relevance. rank_score ∈ [0,1] → boost ∈ [1, 1.2].
|
|
25
|
+
*/
|
|
26
|
+
const SALIENCE_WEIGHT = 0.2;
|
|
27
|
+
const SALIENCE_MAX_BOOST = 1.2;
|
|
20
28
|
/**
|
|
21
29
|
* Phase 2A / Rec 5: default recency half-life (days) used when no
|
|
22
30
|
* `utilityDecayConfig` is supplied to the ranking pipeline. Matches the
|
|
@@ -334,7 +342,31 @@ export const defaultRankingContributors = [
|
|
|
334
342
|
pinnedFactRankingContributor,
|
|
335
343
|
projectContextRankingContributor,
|
|
336
344
|
];
|
|
337
|
-
|
|
345
|
+
/**
 * R2 — feeds the improve loop's salience core into user-facing ranking.
 *
 * `asset_salience.rank_score` (encoding + outcome + retrieval projection,
 * refreshed on every improve run) used to steer only improve's INTERNAL
 * maintenance selection, so "better assets surface more" flowed solely
 * through the utility EMA. This contributor closes the outer loop with a
 * bounded multiplicative boost: usage/outcome-reinforced assets rank higher
 * in `search`/`curate`.
 *
 * Scores are read from `ctx.salienceRankScores`, a Map keyed by `item.id`.
 * A missing map, a missing entry, or a non-positive score leaves the item
 * untouched.
 */
const salienceRankingContributor = {
    name: "salience-ranking",
    /** True only when a strictly positive rank score is recorded for the item. */
    appliesTo(item, ctx) {
        const recorded = ctx.salienceRankScores?.get(item.id);
        if (recorded === undefined) {
            return false;
        }
        return recorded > 0;
    },
    /** Multiplies `item.score` in place by a boost in [1, SALIENCE_MAX_BOOST]. */
    apply(item, ctx) {
        const recorded = ctx.salienceRankScores?.get(item.id) ?? 0;
        // Clamp to [0, 1] first so an out-of-range score cannot exceed the cap.
        const clamped = Math.min(1, Math.max(0, recorded));
        const multiplier = Math.min(1 + clamped * SALIENCE_WEIGHT, SALIENCE_MAX_BOOST);
        item.score = item.score * multiplier;
    },
};
|
|
366
|
+
/**
 * Default contributor list for the utility stage of ranking, in application
 * order: the utility boost first, then the salience boost.
 * NOTE(review): presumably the default list for `applyUtilityContributors` —
 * confirm at the call site, which is outside this hunk.
 */
export const defaultUtilityRankingContributors = [utilityRankingContributor, salienceRankingContributor];
|
|
338
370
|
export function applyScoreContributors(item, ctx, contributors = defaultRankingContributors) {
|
|
339
371
|
let boostSum = 0;
|
|
340
372
|
for (const contributor of contributors) {
|
|
@@ -1,8 +1,68 @@
|
|
|
1
1
|
// This Source Code Form is subject to the terms of the Mozilla Public
|
|
2
2
|
// License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
3
3
|
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
|
4
|
+
import fs from "node:fs";
|
|
5
|
+
import { makeAssetRef } from "../../core/asset/asset-ref.js";
|
|
6
|
+
import { getStateDbPath } from "../../core/state-db.js";
|
|
7
|
+
import { openDatabase } from "../../storage/database.js";
|
|
4
8
|
import { getUtilityScoresByIds } from "../db/db.js";
|
|
5
9
|
import { applyScoreContributors, applyUtilityContributors } from "./ranking-contributors.js";
|
|
10
|
+
/**
 * R2 — best-effort load of `asset_salience.rank_score` from state.db for the
 * ranked items. Fail-open: any error (state.db locked by a concurrent improve
 * run, missing table, unreadable path) returns whatever was collected so far —
 * normally an empty map — which makes the salience contributor a no-op,
 * i.e. byte-identical to pre-R2 ranking.
 *
 * Deliberately NOT `openStateDatabase()`: that helper runs migrations and sets
 * a 30 s busy timeout — too heavy for a search hot path. This opens read-only,
 * never creates or migrates state.db (missing file / missing table = empty
 * map), and caps lock waits at 250 ms so a concurrent improve run can only
 * ever cost the search a quarter second, not a stall.
 *
 * @param {Array<{id: *, entry: {type: string, name: string}}>} items
 *   Ranked items; each is looked up by the ref built from `entry.type` and
 *   `entry.name`.
 * @returns {Map<*, number>} rank_score keyed by `item.id`; items with no
 *   `asset_salience` row are absent.
 */
export function loadSalienceRankScores(items) {
    const result = new Map();
    if (items.length === 0)
        return result;
    try {
        const dbPath = getStateDbPath();
        if (!fs.existsSync(dbPath))
            return result; // improve loop has never run here
        // Several ranked items can resolve to the same asset ref (same type and
        // name). Keep every id per ref so each of them receives the score,
        // instead of the last item silently overwriting the earlier ones.
        const idsByRef = new Map();
        for (const item of items) {
            const ref = makeAssetRef(item.entry.type, item.entry.name);
            const ids = idsByRef.get(ref);
            if (ids === undefined) {
                idsByRef.set(ref, [item.id]);
            }
            else {
                ids.push(item.id);
            }
        }
        const stateDb = openDatabase(dbPath, { readonly: true });
        try {
            try {
                stateDb.exec("PRAGMA busy_timeout = 250");
            }
            catch {
                // pragma failure on a readonly handle is fine — default timeout applies
            }
            const refs = [...idsByRef.keys()];
            // Query in batches to keep the number of bound parameters per
            // statement small.
            const CHUNK = 500;
            for (let i = 0; i < refs.length; i += CHUNK) {
                const chunk = refs.slice(i, i + CHUNK);
                const placeholders = chunk.map(() => "?").join(",");
                const rows = stateDb
                    .prepare(`SELECT asset_ref, rank_score FROM asset_salience WHERE asset_ref IN (${placeholders})`)
                    .all(...chunk);
                for (const row of rows) {
                    for (const id of idsByRef.get(row.asset_ref) ?? []) {
                        if (id !== undefined)
                            result.set(id, row.rank_score);
                    }
                }
            }
        }
        finally {
            stateDb.close();
        }
    }
    catch {
        // Fail open — search must never break because state.db is unavailable.
    }
    return result;
}
|
|
6
66
|
export function normalizeFtsScores(results) {
|
|
7
67
|
const ftsScoreMap = new Map();
|
|
8
68
|
if (results.length === 0)
|
|
@@ -71,12 +131,18 @@ export function applyRankingRules(options) {
|
|
|
71
131
|
applyScoreContributors(item, rankingContext);
|
|
72
132
|
}
|
|
73
133
|
const { global: utilScoresMap, scoped: scopedUtilScoresMap } = getUtilityScoresByIds(options.db, options.items.map((item) => item.id), options.scopeKey);
|
|
134
|
+
// R2 — compose the improve loop's salience into user-facing ranking.
|
|
135
|
+
// undefined = load from state.db (default); null = explicitly disabled.
|
|
136
|
+
const salienceRankScores = options.salienceRankScores === null
|
|
137
|
+
? new Map()
|
|
138
|
+
: (options.salienceRankScores ?? loadSalienceRankScores(options.items));
|
|
74
139
|
const utilityContext = {
|
|
75
140
|
...rankingContext,
|
|
76
141
|
utilityScores: utilScoresMap,
|
|
77
142
|
scopedUtilityScores: scopedUtilScoresMap,
|
|
78
143
|
utilityDecayConfig: options.utilityDecayConfig,
|
|
79
144
|
positiveFeedbackCounts: options.positiveFeedbackCounts,
|
|
145
|
+
salienceRankScores,
|
|
80
146
|
};
|
|
81
147
|
for (const item of options.items) {
|
|
82
148
|
applyUtilityContributors(item, utilityContext);
|
|
@@ -11,6 +11,12 @@
|
|
|
11
11
|
* - hints: searchHints + examples + usage + intent fields
|
|
12
12
|
* - content: TOC headings (lowest-weight catch-all)
|
|
13
13
|
*/
|
|
14
|
+
// NOTE (R5): the collapse detector's frozen canary queries are built from the
|
|
15
|
+
// same surface this function indexes (name tokens / tags / description) and
|
|
16
|
+
// scored via FTS against it. Changing what buildSearchFields includes shifts
|
|
17
|
+
// the detector's recall baseline for ALL existing canary sets — coordinate
|
|
18
|
+
// with src/commands/improve/collapse-detector.ts (buildCanaryQuery) and expect
|
|
19
|
+
// operators to re-mint via `akm improve canary --refresh` after such a change.
|
|
14
20
|
export function buildSearchFields(entry) {
|
|
15
21
|
const name = entry.name.replace(/[-_]/g, " ").toLowerCase();
|
|
16
22
|
const description = (entry.description ?? "").toLowerCase();
|