akm-cli 0.7.4 → 0.8.0-rc.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (300) hide show
  1. package/CHANGELOG.md +224 -1
  2. package/README.md +22 -6
  3. package/SECURITY.md +93 -0
  4. package/dist/cli/config-migrate.js +144 -0
  5. package/dist/cli/config-validate.js +39 -0
  6. package/dist/cli/confirm.js +73 -0
  7. package/dist/cli/parse-args.js +133 -0
  8. package/dist/cli/shared.js +129 -0
  9. package/dist/cli.js +2631 -1440
  10. package/dist/commands/add-cli.js +279 -0
  11. package/dist/commands/agent-dispatch.js +110 -0
  12. package/dist/commands/agent-support.js +68 -0
  13. package/dist/commands/completions.js +3 -0
  14. package/dist/commands/config-cli.js +130 -534
  15. package/dist/commands/consolidate.js +2122 -0
  16. package/dist/commands/curate.js +45 -3
  17. package/dist/commands/db-cli.js +23 -0
  18. package/dist/commands/distill-promotion-policy.js +660 -0
  19. package/dist/commands/distill.js +1081 -73
  20. package/dist/commands/env.js +213 -0
  21. package/dist/commands/eval-cases.js +43 -0
  22. package/dist/commands/events.js +15 -24
  23. package/dist/commands/extract-cli.js +127 -0
  24. package/dist/commands/extract-prompt.js +204 -0
  25. package/dist/commands/extract.js +477 -0
  26. package/dist/commands/feedback-cli.js +331 -0
  27. package/dist/commands/graph.js +477 -0
  28. package/dist/commands/health.js +1302 -0
  29. package/dist/commands/help/help-accept.md +12 -0
  30. package/dist/commands/help/help-improve.md +69 -0
  31. package/dist/commands/help/help-proposals.md +18 -0
  32. package/dist/commands/help/help-propose.md +17 -0
  33. package/dist/commands/help/help-reject.md +11 -0
  34. package/dist/commands/history.js +54 -46
  35. package/dist/commands/improve-auto-accept.js +97 -0
  36. package/dist/commands/improve-cli.js +217 -0
  37. package/dist/commands/improve-profiles.js +166 -0
  38. package/dist/commands/improve-result-file.js +167 -0
  39. package/dist/commands/improve.js +2373 -0
  40. package/dist/commands/info.js +5 -2
  41. package/dist/commands/init.js +50 -2
  42. package/dist/commands/installed-stashes.js +102 -139
  43. package/dist/commands/knowledge.js +136 -0
  44. package/dist/commands/lint/agent-linter.js +49 -0
  45. package/dist/commands/lint/base-linter.js +479 -0
  46. package/dist/commands/lint/command-linter.js +49 -0
  47. package/dist/commands/lint/default-linter.js +16 -0
  48. package/dist/commands/lint/env-key-rules.js +154 -0
  49. package/dist/commands/lint/index.js +196 -0
  50. package/dist/commands/lint/knowledge-linter.js +16 -0
  51. package/dist/commands/lint/markdown-insertion.js +343 -0
  52. package/dist/commands/lint/memory-linter.js +61 -0
  53. package/dist/commands/lint/registry.js +36 -0
  54. package/dist/commands/lint/skill-linter.js +45 -0
  55. package/dist/commands/lint/task-linter.js +50 -0
  56. package/dist/commands/lint/types.js +4 -0
  57. package/dist/commands/lint/workflow-linter.js +56 -0
  58. package/dist/commands/lint.js +4 -0
  59. package/dist/commands/migration-help.js +3 -0
  60. package/dist/commands/proposal.js +67 -12
  61. package/dist/commands/propose.js +120 -45
  62. package/dist/commands/reflect.js +1104 -60
  63. package/dist/commands/registry-cli.js +150 -0
  64. package/dist/commands/registry-search.js +5 -2
  65. package/dist/commands/remember-cli.js +257 -0
  66. package/dist/commands/remember.js +70 -7
  67. package/dist/commands/schema-repair.js +203 -0
  68. package/dist/commands/search.js +115 -14
  69. package/dist/commands/secret.js +173 -0
  70. package/dist/commands/self-update.js +3 -0
  71. package/dist/commands/show.js +158 -60
  72. package/dist/commands/source-add.js +17 -45
  73. package/dist/commands/source-clone.js +3 -0
  74. package/dist/commands/source-manage.js +14 -19
  75. package/dist/commands/tasks.js +437 -0
  76. package/dist/commands/url-checker.js +42 -0
  77. package/dist/core/action-contributors.js +28 -0
  78. package/dist/core/asset-ref.js +17 -2
  79. package/dist/core/asset-registry.js +12 -17
  80. package/dist/core/asset-serialize.js +88 -0
  81. package/dist/core/asset-spec.js +67 -1
  82. package/dist/core/common.js +182 -0
  83. package/dist/core/concurrent.js +25 -0
  84. package/dist/core/config-io.js +347 -0
  85. package/dist/core/config-migration.js +622 -0
  86. package/dist/core/config-schema.js +534 -0
  87. package/dist/core/config-sources.js +108 -0
  88. package/dist/core/config-types.js +4 -0
  89. package/dist/core/config-walker.js +337 -0
  90. package/dist/core/config.js +364 -968
  91. package/dist/core/errors.js +42 -20
  92. package/dist/core/events.js +105 -135
  93. package/dist/core/file-lock.js +104 -0
  94. package/dist/core/frontmatter.js +75 -8
  95. package/dist/core/lesson-lint.js +3 -0
  96. package/dist/core/markdown.js +20 -0
  97. package/dist/core/memory-belief.js +62 -0
  98. package/dist/core/memory-contradiction-detect.js +274 -0
  99. package/dist/core/memory-improve.js +806 -0
  100. package/dist/core/parse.js +158 -0
  101. package/dist/core/paths.js +280 -14
  102. package/dist/core/proposal-quality-validators.js +380 -0
  103. package/dist/core/proposal-validators.js +69 -0
  104. package/dist/core/proposals.js +512 -42
  105. package/dist/core/state-db.js +1068 -0
  106. package/dist/core/text-truncation.js +107 -0
  107. package/dist/core/time.js +54 -0
  108. package/dist/core/tty.js +59 -0
  109. package/dist/core/warn.js +64 -1
  110. package/dist/core/write-source.js +3 -0
  111. package/dist/indexer/db-backup.js +391 -0
  112. package/dist/indexer/db-search.js +198 -489
  113. package/dist/indexer/db.js +990 -108
  114. package/dist/indexer/ensure-index.js +136 -0
  115. package/dist/indexer/file-context.js +3 -0
  116. package/dist/indexer/graph-boost.js +376 -101
  117. package/dist/indexer/graph-db.js +391 -0
  118. package/dist/indexer/graph-dedup.js +95 -0
  119. package/dist/indexer/graph-extraction.js +550 -114
  120. package/dist/indexer/index-context.js +4 -0
  121. package/dist/indexer/indexer.js +547 -309
  122. package/dist/indexer/llm-cache.js +52 -0
  123. package/dist/indexer/manifest.js +3 -0
  124. package/dist/indexer/matchers.js +167 -160
  125. package/dist/indexer/memory-inference.js +152 -74
  126. package/dist/indexer/metadata-contributors.js +29 -0
  127. package/dist/indexer/metadata.js +275 -196
  128. package/dist/indexer/path-resolver.js +92 -0
  129. package/dist/indexer/project-context.js +192 -0
  130. package/dist/indexer/ranking-contributors.js +331 -0
  131. package/dist/indexer/ranking.js +81 -0
  132. package/dist/indexer/search-fields.js +5 -9
  133. package/dist/indexer/search-hit-enrichers.js +111 -0
  134. package/dist/indexer/search-source.js +44 -10
  135. package/dist/indexer/semantic-status.js +6 -17
  136. package/dist/indexer/staleness-detect.js +447 -0
  137. package/dist/indexer/usage-events.js +12 -9
  138. package/dist/indexer/walker.js +28 -0
  139. package/dist/integrations/agent/builders.js +135 -0
  140. package/dist/integrations/agent/config.js +122 -230
  141. package/dist/integrations/agent/detect.js +3 -0
  142. package/dist/integrations/agent/index.js +7 -13
  143. package/dist/integrations/agent/model-aliases.js +55 -0
  144. package/dist/integrations/agent/profiles.js +70 -5
  145. package/dist/integrations/agent/prompts.js +250 -36
  146. package/dist/integrations/agent/runner.js +151 -0
  147. package/dist/integrations/agent/sdk-runner.js +126 -0
  148. package/dist/integrations/agent/spawn.js +183 -35
  149. package/dist/integrations/github.js +3 -0
  150. package/dist/integrations/lockfile.js +32 -69
  151. package/dist/integrations/session-logs/index.js +69 -0
  152. package/dist/integrations/session-logs/inline-refs.js +35 -0
  153. package/dist/integrations/session-logs/pre-filter.js +152 -0
  154. package/dist/integrations/session-logs/providers/claude-code.js +282 -0
  155. package/dist/integrations/session-logs/providers/opencode.js +258 -0
  156. package/dist/integrations/session-logs/types.js +4 -0
  157. package/dist/llm/call-ai.js +62 -0
  158. package/dist/llm/client.js +79 -88
  159. package/dist/llm/embedder.js +20 -29
  160. package/dist/llm/embedders/cache.js +3 -7
  161. package/dist/llm/embedders/local.js +42 -1
  162. package/dist/llm/embedders/remote.js +20 -8
  163. package/dist/llm/embedders/types.js +3 -7
  164. package/dist/llm/feature-gate.js +95 -48
  165. package/dist/llm/graph-extract.js +676 -72
  166. package/dist/llm/index-passes.js +44 -29
  167. package/dist/llm/memory-infer.js +80 -71
  168. package/dist/llm/metadata-enhance.js +42 -29
  169. package/dist/llm/prompts/extract-session.md +80 -0
  170. package/dist/llm/prompts/graph-extract-user-prompt.md +35 -0
  171. package/dist/output/cli-hints-full.md +292 -0
  172. package/dist/output/cli-hints-short.md +66 -0
  173. package/dist/output/cli-hints.js +7 -311
  174. package/dist/output/context.js +60 -8
  175. package/dist/output/renderers.js +306 -258
  176. package/dist/output/shapes/curate.js +56 -0
  177. package/dist/output/shapes/distill.js +10 -0
  178. package/dist/output/shapes/env-list.js +19 -0
  179. package/dist/output/shapes/events.js +11 -0
  180. package/dist/output/shapes/helpers.js +424 -0
  181. package/dist/output/shapes/history.js +7 -0
  182. package/dist/output/shapes/passthrough.js +102 -0
  183. package/dist/output/shapes/proposal-accept.js +7 -0
  184. package/dist/output/shapes/proposal-diff.js +7 -0
  185. package/dist/output/shapes/proposal-list.js +7 -0
  186. package/dist/output/shapes/proposal-producer.js +11 -0
  187. package/dist/output/shapes/proposal-reject.js +7 -0
  188. package/dist/output/shapes/proposal-show.js +7 -0
  189. package/dist/output/shapes/registry-search.js +6 -0
  190. package/dist/output/shapes/registry.js +30 -0
  191. package/dist/output/shapes/search.js +6 -0
  192. package/dist/output/shapes/secret-list.js +19 -0
  193. package/dist/output/shapes/show.js +6 -0
  194. package/dist/output/shapes/vault-list.js +19 -0
  195. package/dist/output/shapes.js +51 -511
  196. package/dist/output/text/add.js +6 -0
  197. package/dist/output/text/clone.js +6 -0
  198. package/dist/output/text/config.js +6 -0
  199. package/dist/output/text/curate.js +6 -0
  200. package/dist/output/text/distill.js +7 -0
  201. package/dist/output/text/enable-disable.js +7 -0
  202. package/dist/output/text/events.js +10 -0
  203. package/dist/output/text/feedback.js +6 -0
  204. package/dist/output/text/helpers.js +1039 -0
  205. package/dist/output/text/history.js +7 -0
  206. package/dist/output/text/import.js +6 -0
  207. package/dist/output/text/index.js +6 -0
  208. package/dist/output/text/info.js +6 -0
  209. package/dist/output/text/init.js +6 -0
  210. package/dist/output/text/list.js +6 -0
  211. package/dist/output/text/proposal-producer.js +8 -0
  212. package/dist/output/text/proposal.js +11 -0
  213. package/dist/output/text/registry-commands.js +11 -0
  214. package/dist/output/text/registry.js +30 -0
  215. package/dist/output/text/remember.js +6 -0
  216. package/dist/output/text/remove.js +6 -0
  217. package/dist/output/text/save.js +6 -0
  218. package/dist/output/text/search.js +6 -0
  219. package/dist/output/text/show.js +6 -0
  220. package/dist/output/text/update.js +6 -0
  221. package/dist/output/text/upgrade.js +6 -0
  222. package/dist/output/text/vault.js +16 -0
  223. package/dist/output/text/wiki.js +15 -0
  224. package/dist/output/text/workflow.js +14 -0
  225. package/dist/output/text.js +44 -1093
  226. package/dist/registry/build-index.js +3 -0
  227. package/dist/registry/create-provider-registry.js +3 -0
  228. package/dist/registry/factory.js +4 -1
  229. package/dist/registry/origin-resolve.js +3 -0
  230. package/dist/registry/providers/index.js +3 -0
  231. package/dist/registry/providers/skills-sh.js +71 -50
  232. package/dist/registry/providers/static-index.js +53 -48
  233. package/dist/registry/providers/types.js +3 -24
  234. package/dist/registry/resolve.js +11 -16
  235. package/dist/registry/types.js +3 -0
  236. package/dist/scripts/migrate-storage.js +17750 -0
  237. package/dist/scripts/migrations/import-fs-improve-runs-to-db.js +9031 -0
  238. package/dist/scripts/migrations/v16-to-v17.js +141 -0
  239. package/dist/setup/detect.js +3 -0
  240. package/dist/setup/ripgrep-install.js +3 -0
  241. package/dist/setup/ripgrep-resolve.js +3 -0
  242. package/dist/setup/setup.js +775 -37
  243. package/dist/setup/steps.js +3 -15
  244. package/dist/sources/include.js +3 -0
  245. package/dist/sources/provider-factory.js +5 -12
  246. package/dist/sources/provider.js +3 -20
  247. package/dist/sources/providers/filesystem.js +19 -23
  248. package/dist/sources/providers/git.js +179 -20
  249. package/dist/sources/providers/index.js +3 -0
  250. package/dist/sources/providers/install-types.js +3 -13
  251. package/dist/sources/providers/npm.js +3 -4
  252. package/dist/sources/providers/provider-utils.js +3 -0
  253. package/dist/sources/providers/sync-from-ref.js +3 -11
  254. package/dist/sources/providers/tar-utils.js +3 -0
  255. package/dist/sources/providers/website.js +18 -22
  256. package/dist/sources/resolve.js +3 -0
  257. package/dist/sources/types.js +3 -0
  258. package/dist/sources/website-ingest.js +7 -0
  259. package/dist/tasks/backends/cron.js +203 -0
  260. package/dist/tasks/backends/exec-utils.js +28 -0
  261. package/dist/tasks/backends/index.js +24 -0
  262. package/dist/tasks/backends/launchd-template.xml +19 -0
  263. package/dist/tasks/backends/launchd.js +187 -0
  264. package/dist/tasks/backends/schtasks-template.xml +29 -0
  265. package/dist/tasks/backends/schtasks.js +215 -0
  266. package/dist/tasks/parser.js +211 -0
  267. package/dist/tasks/resolveAkmBin.js +87 -0
  268. package/dist/tasks/runner.js +458 -0
  269. package/dist/tasks/schedule.js +227 -0
  270. package/dist/tasks/schema.js +15 -0
  271. package/dist/tasks/validator.js +62 -0
  272. package/dist/version.js +3 -0
  273. package/dist/wiki/index-template.md +12 -0
  274. package/dist/wiki/ingest-workflow-template.md +54 -0
  275. package/dist/wiki/log-template.md +8 -0
  276. package/dist/wiki/schema-template.md +61 -0
  277. package/dist/wiki/wiki-templates.js +15 -0
  278. package/dist/wiki/wiki.js +13 -61
  279. package/dist/workflows/authoring.js +8 -25
  280. package/dist/workflows/cli.js +3 -0
  281. package/dist/workflows/db.js +141 -2
  282. package/dist/workflows/document-cache.js +3 -10
  283. package/dist/workflows/parser.js +3 -0
  284. package/dist/workflows/renderer.js +11 -3
  285. package/dist/workflows/runs.js +91 -89
  286. package/dist/workflows/schema.js +3 -0
  287. package/dist/workflows/scope-key.js +79 -0
  288. package/dist/workflows/validator.js +4 -8
  289. package/dist/workflows/workflow-template.md +24 -0
  290. package/docs/README.md +10 -2
  291. package/docs/data-and-telemetry.md +225 -0
  292. package/docs/migration/release-notes/0.7.0.md +1 -1
  293. package/docs/migration/release-notes/0.7.4.md +1 -1
  294. package/docs/migration/release-notes/0.7.5.md +20 -0
  295. package/docs/migration/release-notes/0.8.0.md +48 -0
  296. package/docs/migration/v0.7-to-v0.8.md +1307 -0
  297. package/package.json +29 -11
  298. package/dist/commands/install-audit.js +0 -381
  299. package/dist/commands/vault.js +0 -333
  300. package/dist/templates/wiki-templates.js +0 -100
@@ -1,16 +1,22 @@
1
+ // This Source Code Form is subject to the terms of the Mozilla Public
2
+ // License, v. 2.0. If a copy of the MPL was not distributed with this
3
+ // file, You can obtain one at https://mozilla.org/MPL/2.0/.
1
4
  import { Database } from "bun:sqlite";
2
5
  import fs from "node:fs";
3
6
  import { createRequire } from "node:module";
4
7
  import path from "node:path";
5
8
  import { parseAssetRef } from "../core/asset-ref";
6
9
  import { getDbPath } from "../core/paths";
10
+ import { REGISTRY_INDEX_CACHE_DDL } from "../core/state-db";
7
11
  import { warn } from "../core/warn";
8
12
  import { cosineSimilarity } from "../llm/embedders/types";
13
+ import { backupDataDir, EMBEDDING_DIM_CHANGE_REASON } from "./db-backup";
9
14
  import { buildSearchFields } from "./search-fields";
10
15
  import { ensureUsageEventsSchema } from "./usage-events";
11
16
  // ── Constants ───────────────────────────────────────────────────────────────
12
- export const DB_VERSION = 10;
17
+ export const DB_VERSION = 17;
13
18
  export const EMBEDDING_DIM = 384;
19
+ export const GRAPH_SCHEMA_VERSION = 3;
14
20
  // ── Database lifecycle ──────────────────────────────────────────────────────
15
21
  export function openDatabase(dbPath, options) {
16
22
  const resolvedPath = dbPath ?? getDbPath();
@@ -24,11 +30,39 @@ export function openDatabase(dbPath, options) {
24
30
  db.exec("PRAGMA foreign_keys = ON");
25
31
  // Try to load sqlite-vec extension
26
32
  loadVecExtension(db);
27
- ensureSchema(db, options?.embeddingDim ?? EMBEDDING_DIM);
33
+ // Dim resolution: explicit option wins; otherwise consult the on-disk
34
+ // config so unparameterised opens (registry providers, graph helpers,
35
+ // ad-hoc CLI subcommands) honour the operator-declared dimension. Only if
36
+ // both are absent do we fall through to the no-clobber path, which keeps
37
+ // ensureSchema from touching `index_meta.embeddingDim` at all.
38
+ const resolvedDim = options?.embeddingDim ?? resolveConfiguredEmbeddingDim();
39
+ ensureSchema(db, resolvedDim, { dataDir: dir });
28
40
  // Warn once at init if using JS fallback with many entries
29
41
  warnIfVecMissing(db, { once: true });
30
42
  return db;
31
43
  }
44
+ /**
45
+ * Read the operator-configured embedding dimension from the on-disk config.
46
+ * Returns `undefined` when no config file is present, when the config has
47
+ * no `embedding.dimension` set, or when reading the config throws (e.g.
48
+ * inside isolated test fixtures with no XDG home). Failure is silent on
49
+ * purpose — every openDatabase() call would otherwise have to handle a
50
+ * config-not-found error path, and the fallback (no-clobber semantics) is
51
+ * already correct.
52
+ */
53
+ function resolveConfiguredEmbeddingDim() {
54
+ try {
55
+ const { loadConfig } = require("../core/config");
56
+ const dim = loadConfig().embedding?.dimension;
57
+ if (typeof dim === "number" && Number.isInteger(dim) && dim > 0 && dim <= 4096) {
58
+ return dim;
59
+ }
60
+ return undefined;
61
+ }
62
+ catch {
63
+ return undefined;
64
+ }
65
+ }
32
66
  export function openExistingDatabase(dbPath) {
33
67
  const resolvedPath = dbPath ?? getDbPath();
34
68
  const db = new Database(resolvedPath);
@@ -86,7 +120,7 @@ export function warnIfVecMissing(db, { once } = { once: false }) {
86
120
  /* embeddings table may not exist yet during init */
87
121
  }
88
122
  }
89
- function ensureSchema(db, embeddingDim) {
123
+ function ensureSchema(db, embeddingDim, options) {
90
124
  // Create meta table first so we can check version
91
125
  db.exec(`
92
126
  CREATE TABLE IF NOT EXISTS index_meta (
@@ -94,6 +128,39 @@ function ensureSchema(db, embeddingDim) {
94
128
  value TEXT NOT NULL
95
129
  );
96
130
  `);
131
+ // MVP DB-backup hook (0.8.x): when the stored DB version differs from the
132
+ // running binary's DB_VERSION, snapshot the data directory BEFORE
133
+ // `handleVersionUpgrade()` drops tables. This is best-effort —
134
+ // `backupDataDir` returns null on opt-out, missing data dir, low free
135
+ // space, or copy errors, and we proceed with the upgrade in all cases.
136
+ // The proper migration framework lands in 0.9.0; until then this lets
137
+ // operators recover with `scripts/migrations/restore-data-dir.sh`.
138
+ if (options?.dataDir) {
139
+ const storedVersionRaw = getMeta(db, "version");
140
+ const storedVersion = storedVersionRaw !== undefined && storedVersionRaw !== "" ? Number.parseInt(storedVersionRaw, 10) : null;
141
+ const willUpgrade = storedVersionRaw !== undefined && storedVersionRaw !== "" && storedVersionRaw !== String(DB_VERSION);
142
+ if (willUpgrade) {
143
+ try {
144
+ // Pass env explicitly so tests can override AKM_DB_BACKUP / AKM_DB_BACKUP_RETAIN
145
+ // without mutating process.env. Production callers default to process.env.
146
+ const result = backupDataDir({
147
+ dataDir: options.dataDir,
148
+ sourceVersion: storedVersion !== null && !Number.isNaN(storedVersion) ? storedVersion : null,
149
+ targetVersion: DB_VERSION,
150
+ env: process.env,
151
+ });
152
+ if (result) {
153
+ warn("[akm] data directory backed up to %s before v%s→v%d upgrade", result.path, storedVersionRaw, DB_VERSION);
154
+ }
155
+ }
156
+ catch (err) {
157
+ // Defensive — backupDataDir already swallows most errors, but if it
158
+ // throws for an unexpected reason we must still proceed with the
159
+ // upgrade so the user isn't locked out of their binary.
160
+ warn("[akm] pre-upgrade data dir backup raised an unexpected error — %s; upgrade will proceed without a snapshot", err instanceof Error ? err.message : String(err));
161
+ }
162
+ }
163
+ }
97
164
  // Check stored version — if it differs from DB_VERSION, drop and recreate all tables.
98
165
  // Usage events are preserved across version upgrades so that utility score
99
166
  // history is not silently lost. The backup is captured here and threaded
@@ -110,12 +177,24 @@ function ensureSchema(db, embeddingDim) {
110
177
  stash_dir TEXT NOT NULL,
111
178
  entry_json TEXT NOT NULL,
112
179
  search_text TEXT NOT NULL,
113
- entry_type TEXT NOT NULL
180
+ entry_type TEXT NOT NULL,
181
+ derived_from TEXT
114
182
  );
115
183
 
116
184
  CREATE INDEX IF NOT EXISTS idx_entries_dir ON entries(dir_path);
117
185
  CREATE INDEX IF NOT EXISTS idx_entries_type ON entries(entry_type);
186
+ CREATE INDEX IF NOT EXISTS idx_entries_file_path ON entries(file_path);
118
187
  `);
188
+ // Phase 5A / DB v17: backfill `derived_from` column + index on databases
189
+ // that were created at v17 fresh OR carry a partial v17 schema (a DB whose
190
+ // `index_meta.version` was bumped to 17 but whose `entries` table still
191
+ // lacks the column — this happens when a previous v17 binary opened a
192
+ // pre-v17 DB without taking the upgrade path because no version mismatch
193
+ // was seen at boot). The PRAGMA-then-ALTER guard runs unconditionally so
194
+ // both fresh and partial schemas converge. The CREATE INDEX for
195
+ // `derived_from` MUST run after this helper so we never reference a
196
+ // column that has not yet been added on partial schemas.
197
+ ensureDerivedFromColumn(db);
119
198
  // Validated WorkflowDocument JSON, one row per indexed workflow entry.
120
199
  // Pure index data — fully rebuilt on each `akm index`. ON DELETE CASCADE
121
200
  // means clearing entries (full rebuild or per-dir delete) drops these too.
@@ -174,6 +253,20 @@ function ensureSchema(db, embeddingDim) {
174
253
  updated_at TEXT NOT NULL DEFAULT (datetime('now')),
175
254
  FOREIGN KEY (entry_id) REFERENCES entries(id) ON DELETE CASCADE
176
255
  );
256
+ `);
257
+ // Per-project scoped utility scores — tracks usage per (entry, cwd-anchor)
258
+ // so assets useful in project A don't pollute rankings in project B.
259
+ // The global utility_scores table is preserved as a fallback / cold-start aid.
260
+ db.exec(`
261
+ CREATE TABLE IF NOT EXISTS utility_scores_scoped (
262
+ entry_id INTEGER NOT NULL,
263
+ scope_key TEXT NOT NULL,
264
+ utility REAL NOT NULL DEFAULT 0,
265
+ last_used_at INTEGER NOT NULL,
266
+ PRIMARY KEY (entry_id, scope_key)
267
+ );
268
+ CREATE INDEX IF NOT EXISTS idx_utility_scores_scoped_entry_id
269
+ ON utility_scores_scoped(entry_id);
177
270
  `);
178
271
  db.exec(`
179
272
  CREATE TABLE IF NOT EXISTS index_dir_state (
@@ -183,6 +276,97 @@ function ensureSchema(db, embeddingDim) {
183
276
  reason TEXT NOT NULL,
184
277
  updated_at TEXT NOT NULL
185
278
  );
279
+ `);
280
+ // LLM enrichment result cache. Stores a SHA-256 body hash and the JSON
281
+ // result for each asset so that subsequent `akm index --enrich` runs can
282
+ // skip the LLM call when the body hasn't changed. The cache is keyed by
283
+ // a stable asset_ref string (e.g. the absolute file path for graph/memory
284
+ // passes, or `entryKey:passId` for the metadata-enhance pass).
285
+ // Entries are cleaned up when assets are removed or --re-enrich is used.
286
+ db.exec(`
287
+ CREATE TABLE IF NOT EXISTS llm_enrichment_cache (
288
+ asset_ref TEXT NOT NULL,
289
+ cache_variant TEXT NOT NULL,
290
+ body_hash TEXT NOT NULL,
291
+ result_json TEXT NOT NULL,
292
+ updated_at INTEGER NOT NULL,
293
+ PRIMARY KEY (asset_ref, cache_variant)
294
+ );
295
+
296
+ CREATE INDEX IF NOT EXISTS idx_llm_cache_updated
297
+ ON llm_enrichment_cache(updated_at);
298
+ `);
299
+ // Graph extraction tables — schema v2 (entry_id PK).
300
+ //
301
+ // graph_files is keyed on entries.id so child tables cascade-delete cleanly
302
+ // when an entry is removed, and so JOINs from graph rows to entries are a
303
+ // direct PK lookup. (stash_root, file_path) is retained as UNIQUE so the
304
+ // extractor's path-based upsert still works.
305
+ //
306
+ // graph_file_entities and graph_file_relations no longer duplicate file_path;
307
+ // they reference entry_id and inherit stash scoping via graph_files.
308
+ db.exec(`
309
+ CREATE TABLE IF NOT EXISTS graph_meta (
310
+ stash_root TEXT PRIMARY KEY,
311
+ schema_version INTEGER NOT NULL,
312
+ generated_at TEXT NOT NULL,
313
+ considered_files INTEGER NOT NULL DEFAULT 0,
314
+ extracted_files INTEGER NOT NULL DEFAULT 0,
315
+ entity_count INTEGER NOT NULL DEFAULT 0,
316
+ relation_count INTEGER NOT NULL DEFAULT 0,
317
+ extraction_coverage REAL NOT NULL DEFAULT 0,
318
+ density REAL NOT NULL DEFAULT 0,
319
+ extractor_id TEXT,
320
+ extraction_run_id TEXT,
321
+ model TEXT,
322
+ prompt_version TEXT,
323
+ batch_size INTEGER,
324
+ cache_hits INTEGER NOT NULL DEFAULT 0,
325
+ cache_misses INTEGER NOT NULL DEFAULT 0,
326
+ truncation_count INTEGER NOT NULL DEFAULT 0,
327
+ failure_count INTEGER NOT NULL DEFAULT 0
328
+ );
329
+
330
+ CREATE TABLE IF NOT EXISTS graph_files (
331
+ entry_id INTEGER PRIMARY KEY REFERENCES entries(id) ON DELETE CASCADE,
332
+ stash_root TEXT NOT NULL,
333
+ file_path TEXT NOT NULL,
334
+ file_order INTEGER NOT NULL,
335
+ file_type TEXT NOT NULL,
336
+ body_hash TEXT NOT NULL,
337
+ confidence REAL,
338
+ status TEXT NOT NULL DEFAULT 'extracted',
339
+ reason TEXT,
340
+ extraction_run_id TEXT,
341
+ UNIQUE(stash_root, file_path)
342
+ );
343
+
344
+ CREATE INDEX IF NOT EXISTS idx_graph_files_stash_order
345
+ ON graph_files(stash_root, file_order);
346
+
347
+ CREATE TABLE IF NOT EXISTS graph_file_entities (
348
+ entry_id INTEGER NOT NULL REFERENCES graph_files(entry_id) ON DELETE CASCADE,
349
+ entity_order INTEGER NOT NULL,
350
+ stash_root TEXT NOT NULL,
351
+ entity_norm TEXT NOT NULL,
352
+ entity TEXT NOT NULL,
353
+ PRIMARY KEY (entry_id, entity_order)
354
+ );
355
+
356
+ CREATE INDEX IF NOT EXISTS idx_graph_file_entities_entity_norm
357
+ ON graph_file_entities(stash_root, entity_norm);
358
+
359
+ CREATE TABLE IF NOT EXISTS graph_file_relations (
360
+ entry_id INTEGER NOT NULL REFERENCES graph_files(entry_id) ON DELETE CASCADE,
361
+ relation_order INTEGER NOT NULL,
362
+ from_entity_norm TEXT NOT NULL,
363
+ from_entity TEXT NOT NULL,
364
+ to_entity_norm TEXT NOT NULL,
365
+ to_entity TEXT NOT NULL,
366
+ relation_type TEXT,
367
+ confidence REAL,
368
+ PRIMARY KEY (entry_id, relation_order)
369
+ );
186
370
  `);
187
371
  // FTS-dirty queue. Created here (not lazily on first upsert) so the
188
372
  // per-entry write path doesn't issue a CREATE TABLE IF NOT EXISTS on
@@ -194,59 +378,89 @@ function ensureSchema(db, embeddingDim) {
194
378
  );
195
379
  `);
196
380
  // sqlite-vec table
381
+ //
382
+ // Dimension contract:
383
+ // - When `embeddingDim` is `undefined`, the caller did NOT request a
384
+ // specific dim. Do not touch `index_meta.embeddingDim` and do not run
385
+ // the dim-change wipe — fall back to the stored dim (or the static
386
+ // default) only when we have to materialise the vec table for the
387
+ // first time. Without this guard, registry-side and other dim-unaware
388
+ // `openDatabase()` callers would silently overwrite the dim-aware
389
+ // improve/index value and oscillate the stored dim.
390
+ // - When `embeddingDim` is a number, the caller explicitly asked for
391
+ // that dim and owns the dim-change/backup/wipe semantics.
392
+ const dimExplicit = embeddingDim !== undefined;
393
+ const effectiveDim = embeddingDim ?? (Number(getMeta(db, "embeddingDim")) || EMBEDDING_DIM);
197
394
  if (isVecAvailable(db)) {
198
395
  // Check if stored embedding dimension differs from configured one
199
- const storedDim = getMeta(db, "embeddingDim");
200
- if (storedDim && storedDim !== String(embeddingDim)) {
201
- try {
202
- db.exec("DROP TABLE IF EXISTS entries_vec");
203
- }
204
- catch {
205
- /* ignore */
206
- }
207
- // Delete stale BLOB embeddings so they don't produce silently wrong
208
- // similarity scores against the new-dimension vec table.
209
- try {
210
- db.exec("DELETE FROM embeddings");
211
- }
212
- catch {
213
- /* ignore */
396
+ if (dimExplicit) {
397
+ const storedDim = getMeta(db, "embeddingDim");
398
+ if (storedDim && storedDim !== String(embeddingDim)) {
399
+ // Re-embedding the whole stash is expensive (LLM API calls + cache
400
+ // misses), so snapshot the data dir before we drop the vec table and
401
+ // wipe `embeddings`. This is the SAME hook the version-upgrade path
402
+ // uses earlier in this function, just gated on embedding-dim mismatch
403
+ // and tagged so operators can tell the two backup kinds apart.
404
+ backupBeforeEmbeddingDimChange(options?.dataDir, storedDim, String(embeddingDim));
405
+ try {
406
+ db.exec("DROP TABLE IF EXISTS entries_vec");
407
+ }
408
+ catch {
409
+ /* ignore */
410
+ }
411
+ // Delete stale BLOB embeddings so they don't produce silently wrong
412
+ // similarity scores against the new-dimension vec table.
413
+ try {
414
+ db.exec("DELETE FROM embeddings");
415
+ }
416
+ catch {
417
+ /* ignore */
418
+ }
419
+ setMeta(db, "hasEmbeddings", "0");
214
420
  }
215
- setMeta(db, "hasEmbeddings", "0");
216
421
  }
217
422
  const vecExists = db.prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='entries_vec'").get();
218
423
  if (!vecExists) {
219
- if (!Number.isInteger(embeddingDim) || embeddingDim <= 0 || embeddingDim > 4096) {
220
- throw new Error(`Invalid embedding dimension: ${embeddingDim}`);
424
+ if (!Number.isInteger(effectiveDim) || effectiveDim <= 0 || effectiveDim > 4096) {
425
+ throw new Error(`Invalid embedding dimension: ${effectiveDim}`);
221
426
  }
222
427
  db.exec(`
223
428
  CREATE VIRTUAL TABLE entries_vec USING vec0(
224
429
  id INTEGER PRIMARY KEY,
225
- embedding FLOAT[${embeddingDim}]
430
+ embedding FLOAT[${effectiveDim}]
226
431
  );
227
432
  `);
228
433
  }
229
- setMeta(db, "embeddingDim", String(embeddingDim));
434
+ if (dimExplicit) {
435
+ setMeta(db, "embeddingDim", String(embeddingDim));
436
+ }
230
437
  }
231
438
  else {
232
439
  // Also purge BLOB embeddings on dimension change (JS fallback path).
233
440
  // When sqlite-vec is unavailable, entries_vec doesn't exist but the BLOB
234
441
  // embeddings table still stores vectors. If the configured dimension
235
442
  // changes, those stored BLOBs become silently incompatible.
236
- const storedDim = getMeta(db, "embeddingDim");
237
- if (storedDim && storedDim !== String(embeddingDim)) {
238
- try {
239
- db.exec("DELETE FROM embeddings");
240
- }
241
- catch {
242
- /* ignore */
443
+ if (dimExplicit) {
444
+ const storedDim = getMeta(db, "embeddingDim");
445
+ if (storedDim && storedDim !== String(embeddingDim)) {
446
+ backupBeforeEmbeddingDimChange(options?.dataDir, storedDim, String(embeddingDim));
447
+ try {
448
+ db.exec("DELETE FROM embeddings");
449
+ }
450
+ catch {
451
+ /* ignore */
452
+ }
453
+ setMeta(db, "hasEmbeddings", "0");
243
454
  }
244
- setMeta(db, "hasEmbeddings", "0");
455
+ setMeta(db, "embeddingDim", String(embeddingDim));
245
456
  }
246
- setMeta(db, "embeddingDim", String(embeddingDim));
247
457
  }
248
458
  // Usage telemetry table
249
459
  ensureUsageEventsSchema(db);
460
+ // Registry index cache table — caches remote registry index documents so
461
+ // `akm search` does not hit the network on every invocation. The DDL is
462
+ // defined in state-db.ts and shared here to avoid duplication.
463
+ db.exec(REGISTRY_INDEX_CACHE_DDL);
250
464
  // Restore usage_events backed up by the version-upgrade path above.
251
465
  restoreUsageEventsBackup(db, usageBackup);
252
466
  }
@@ -277,11 +491,23 @@ function handleVersionUpgrade(db) {
277
491
  /* table may not exist in older versions */
278
492
  }
279
493
  db.exec("DROP TABLE IF EXISTS utility_scores");
494
+ db.exec("DROP TABLE IF EXISTS utility_scores_scoped");
495
+ db.exec("DROP INDEX IF EXISTS idx_utility_scores_scoped_entry_id");
280
496
  db.exec("DROP TABLE IF EXISTS usage_events");
281
497
  db.exec("DROP TABLE IF EXISTS embeddings");
282
498
  db.exec("DROP TABLE IF EXISTS entries_vec");
283
499
  db.exec("DROP TABLE IF EXISTS entries_fts");
284
500
  db.exec("DROP TABLE IF EXISTS index_dir_state");
501
+ db.exec("DROP TABLE IF EXISTS llm_enrichment_cache");
502
+ db.exec("DROP INDEX IF EXISTS idx_llm_cache_updated");
503
+ db.exec("DROP TABLE IF EXISTS graph_file_relations");
504
+ db.exec("DROP TABLE IF EXISTS graph_file_entities");
505
+ db.exec("DROP TABLE IF EXISTS graph_files");
506
+ db.exec("DROP TABLE IF EXISTS graph_meta");
507
+ db.exec("DROP TABLE IF EXISTS graph_relations");
508
+ db.exec("DROP TABLE IF EXISTS graph_entities");
509
+ db.exec("DROP TABLE IF EXISTS graph_nodes");
510
+ db.exec("DROP TABLE IF EXISTS graph_stashes");
285
511
  db.exec("DROP INDEX IF EXISTS idx_entries_dir");
286
512
  db.exec("DROP INDEX IF EXISTS idx_entries_type");
287
513
  db.exec("DROP TABLE IF EXISTS entries");
@@ -289,6 +515,48 @@ function handleVersionUpgrade(db) {
289
515
  warn("[akm] Index rebuilt due to version upgrade. Run 'akm index' to repopulate.");
290
516
  return usageBackup;
291
517
  }
/**
 * Best-effort snapshot of the data directory, taken right before the
 * embedding-dimension change path deletes the stored embeddings.
 *
 * The snapshot is requested from `backupDataDir` and tagged with
 * `EMBEDDING_DIM_CHANGE_REASON`. `DB_VERSION` is passed as both the source and
 * the target version because the schema version itself is not changing; the
 * value is only recorded for context.
 *
 * A backup failure never blocks the caller: an exception raised while backing
 * up is caught and reported through `warn`, after which the caller proceeds
 * with the destructive step. When `dataDir` is falsy the function does nothing.
 *
 * @param dataDir Data directory to snapshot; a falsy value skips the backup.
 * @param fromDim Previously stored embedding dimension (log output only).
 * @param toDim Newly configured embedding dimension (log output only).
 */
function backupBeforeEmbeddingDimChange(dataDir, fromDim, toDim) {
    if (dataDir) {
        try {
            const snapshot = backupDataDir({
                dataDir,
                sourceVersion: DB_VERSION,
                targetVersion: DB_VERSION,
                reason: EMBEDDING_DIM_CHANGE_REASON,
                env: process.env,
            });
            // A falsy result means no snapshot was produced, so there is no path to report.
            if (!snapshot) {
                return;
            }
            warn("[akm] embedding dimension changed %s→%s; data directory backed up to %s; embeddings will be regenerated", fromDim, toDim, snapshot.path);
        }
        catch (err) {
            // Keep going without a snapshot rather than blocking the dimension change.
            const detail = err instanceof Error ? err.message : String(err);
            warn("[akm] pre-embedding-dim-change data dir backup raised an unexpected error — %s; embeddings will be regenerated without a snapshot", detail);
        }
    }
}
292
560
  /**
293
561
  * Re-insert backed-up `usage_events` rows into the freshly-created table.
294
562
  *
@@ -383,6 +651,12 @@ export function deleteIndexDirStatesByStashDir(db, stashDir) {
383
651
  db.prepare("DELETE FROM index_dir_state WHERE dir_path = ? OR dir_path LIKE ?").run(stashDir, `${stashDir}${path.sep}%`);
384
652
  }
385
653
  // ── Entry operations ────────────────────────────────────────────────────────
654
+ /**
655
+ * SQLite parameter chunk size — chosen well below SQLITE_MAX_VARIABLE_NUMBER
656
+ * (default 999 before SQLite 3.32.0, 32766 since) so multi-row `IN (?, ?, ...)` queries stay
657
+ * within bounds. Shared by helpers below.
658
+ */
659
+ const SQLITE_CHUNK_SIZE = 500;
386
660
  /**
387
661
  * Insert or update an entry in the `entries` table. Returns the row id.
388
662
  *
@@ -396,7 +670,11 @@ export function upsertEntry(db, entryKey, dirPath, filePath, stashDir, entry, se
396
670
  // every call. The dirty-mark INSERT and the upsert-with-RETURNING
397
671
  // share the same WeakMap so they live and die with the connection.
398
672
  const stmts = getUpsertStmts(db);
399
- const result = stmts.upsert.get(entryKey, dirPath, filePath, stashDir, JSON.stringify(entry), searchText, entry.type);
673
+ // Phase 5A / Advantage D5: surface derived memory parent ref into the
674
+ // dedicated `derived_from` column so retrieval-time lookup (parent→child)
675
+ // does not have to scan + JSON-decode every memory row.
676
+ const derivedFrom = typeof entry.derivedFrom === "string" && entry.derivedFrom.trim() ? entry.derivedFrom.trim() : null;
677
+ const result = stmts.upsert.get(entryKey, dirPath, filePath, stashDir, JSON.stringify(entry), searchText, entry.type, derivedFrom);
400
678
  if (!result)
401
679
  throw new Error("upsertEntry: entry_key not found after upsert");
402
680
  // Mark this entry as FTS-dirty so `rebuildFts({ incremental: true })`
@@ -415,15 +693,16 @@ function getUpsertStmts(db) {
415
693
  // SELECT round-trip needed (last_insert_rowid() is unreliable for
416
694
  // ON CONFLICT). Use `.get()` so a single row comes back.
417
695
  upsert: db.prepare(`
418
- INSERT INTO entries (entry_key, dir_path, file_path, stash_dir, entry_json, search_text, entry_type)
419
- VALUES (?, ?, ?, ?, ?, ?, ?)
696
+ INSERT INTO entries (entry_key, dir_path, file_path, stash_dir, entry_json, search_text, entry_type, derived_from)
697
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?)
420
698
  ON CONFLICT(entry_key) DO UPDATE SET
421
699
  dir_path = excluded.dir_path,
422
700
  file_path = excluded.file_path,
423
701
  stash_dir = excluded.stash_dir,
424
702
  entry_json = excluded.entry_json,
425
703
  search_text = excluded.search_text,
426
- entry_type = excluded.entry_type
704
+ entry_type = excluded.entry_type,
705
+ derived_from = excluded.derived_from
427
706
  RETURNING id
428
707
  `),
429
708
  markDirty: db.prepare("INSERT OR IGNORE INTO entries_fts_dirty (entry_id) VALUES (?)"),
@@ -431,21 +710,128 @@ function getUpsertStmts(db) {
431
710
  upsertStmtsByDb.set(db, stmts);
432
711
  return stmts;
433
712
  }
434
- export function deleteEntriesByDir(db, dirPath) {
/**
 * Schema guard for the `entries.derived_from` column and its index.
 *
 * Reads the column list of `entries` via `PRAGMA table_info`, adds the
 * `derived_from` TEXT column when it is absent, and then creates the
 * `idx_entries_derived_from` index with `IF NOT EXISTS`. Calling it repeatedly
 * is harmless because both steps are gated.
 *
 * Any error raised along the way is swallowed on purpose (for example the
 * table not existing yet); the caller is responsible for creating `entries`.
 */
function ensureDerivedFromColumn(db) {
    try {
        const columnNames = db
            .prepare("PRAGMA table_info(entries)")
            .all()
            .map((col) => col.name);
        if (!columnNames.includes("derived_from")) {
            db.exec("ALTER TABLE entries ADD COLUMN derived_from TEXT");
        }
        // `IF NOT EXISTS` makes the index statement safe to run on every open.
        db.exec("CREATE INDEX IF NOT EXISTS idx_entries_derived_from ON entries(derived_from)");
    }
    catch {
        /* table may not exist on a brand-new DB before CREATE — caller is responsible */
    }
}
/**
 * Find the derived child entry recorded for a parent ref.
 *
 * Selects the row of `entries` whose `derived_from` column equals `parentRef`
 * (e.g. `"memory:claude-prefs"`). If several rows match, the one with the
 * highest `id` is returned, which keeps the result deterministic.
 *
 * Returns `null` when `parentRef` is falsy, when no row matches, when the
 * row's `entry_json` cannot be parsed (a warning is logged), or when the query
 * itself fails — e.g. on a legacy database without the `derived_from` column.
 *
 * @param db Open database handle.
 * @param parentRef Ref of the parent entry.
 * @returns The decoded entry record, or `null`.
 */
export function getDerivedForParent(db, parentRef) {
    if (!parentRef) {
        return null;
    }
    try {
        const lookup = db.prepare(`SELECT id, entry_key, dir_path, file_path, stash_dir, entry_json, search_text
           FROM entries
           WHERE derived_from = ?
           ORDER BY id DESC
           LIMIT 1`);
        const hit = lookup.get(parentRef);
        if (!hit) {
            return null;
        }
        let decoded;
        try {
            decoded = JSON.parse(hit.entry_json);
        }
        catch {
            warn(`[db] getDerivedForParent: skipping entry id=${hit.id} — corrupt entry_json`);
            return null;
        }
        const { id, entry_key: entryKey, dir_path: dirPath, file_path: filePath, stash_dir: stashDir, search_text: searchText, } = hit;
        return { id, entryKey, dirPath, filePath, stashDir, entry: decoded, searchText };
    }
    catch {
        /* `derived_from` column may not exist on legacy DBs that haven't been
           rebuilt; treat as "no derived child". */
        return null;
    }
}
/**
 * Bulk-load the number of positive feedback events per entry id.
 *
 * Counts rows of `usage_events` with `event_type = 'feedback'` and
 * `signal = 'positive'`, grouped by `entry_id`, for the supplied ids. The ids
 * are queried in slices of `SQLITE_CHUNK_SIZE` so each `IN (...)` list stays
 * within SQLite's bound-parameter limit.
 *
 * Only ids with at least one matching event appear in the result; callers
 * should treat a missing key as a count of zero. A slice whose query fails
 * (for instance because `usage_events` does not exist) contributes nothing.
 *
 * @param db Open database handle.
 * @param ids Entry ids to look up.
 * @returns Map from entry id to positive-feedback count.
 */
export function getPositiveFeedbackCountsByIds(db, ids) {
    const counts = new Map();
    let offset = 0;
    while (offset < ids.length) {
        const batch = ids.slice(offset, offset + SQLITE_CHUNK_SIZE);
        offset += SQLITE_CHUNK_SIZE;
        const marks = batch.map(() => "?").join(",");
        try {
            const query = db.prepare(`SELECT entry_id, COUNT(*) AS cnt
               FROM usage_events
               WHERE event_type = 'feedback'
                 AND signal = 'positive'
                 AND entry_id IN (${marks})
               GROUP BY entry_id`);
            for (const { entry_id: entryId, cnt } of query.all(...batch)) {
                if (entryId === null || !(cnt > 0)) {
                    continue;
                }
                counts.set(entryId, cnt);
            }
        }
        catch {
            /* usage_events table may be missing on legacy DBs — treat as zero counts */
        }
    }
    return counts;
}
/**
 * Columns of `entries` that {@link deleteEntriesWhere} is allowed to filter on.
 * The column name is spliced into SQL text (identifiers cannot be bound as
 * parameters), so it is restricted to this fixed list.
 */
const DELETABLE_ENTRY_COLUMNS = new Set(["dir_path", "stash_dir"]);
/**
 * Delete every entry whose `column` equals `value`, together with its related
 * rows, inside a single transaction.
 *
 * The matching ids are collected first and handed to `deleteRelatedRows`
 * before the `entries` rows themselves are removed.
 *
 * @param db Open database handle.
 * @param column Column to match on; must be one of `DELETABLE_ENTRY_COLUMNS`.
 * @param value Value bound to the `= ?` comparison.
 * @throws {Error} When `column` is not an allowed column name.
 */
function deleteEntriesWhere(db, column, value) {
    // Reject anything but the known column names before building SQL from it.
    if (!DELETABLE_ENTRY_COLUMNS.has(column)) {
        throw new Error(`deleteEntriesWhere: unsupported column "${column}"`);
    }
    db.transaction(() => {
        const ids = db.prepare(`SELECT id FROM entries WHERE ${column} = ?`).all(value);
        deleteRelatedRows(db, ids);
        db.prepare(`DELETE FROM entries WHERE ${column} = ?`).run(value);
    })();
}
/** Delete all entries (and their related rows) stored under `dirPath`. */
export function deleteEntriesByDir(db, dirPath) {
    deleteEntriesWhere(db, "dir_path", dirPath);
}
/** Delete all entries (and their related rows) belonging to `stashDir`. */
export function deleteEntriesByStashDir(db, stashDir) {
    deleteEntriesWhere(db, "stash_dir", stashDir);
}
448
- const SQLITE_CHUNK_SIZE = 500;
449
835
  function deleteRelatedRows(db, ids) {
450
836
  if (ids.length === 0)
451
837
  return;
@@ -480,13 +866,6 @@ function deleteRelatedRows(db, ids) {
480
866
  catch {
481
867
  /* ignore */
482
868
  }
483
- // Also delete from FTS table so orphaned FTS rows don't remain
484
- try {
485
- db.prepare(`DELETE FROM entries_fts WHERE entry_id IN (${placeholders})`).run(...chunk);
486
- }
487
- catch {
488
- /* ignore */
489
- }
490
869
  if (vecAvail) {
491
870
  try {
492
871
  db.prepare(`DELETE FROM entries_vec WHERE id IN (${placeholders})`).run(...chunk);
@@ -502,6 +881,12 @@ function deleteRelatedRows(db, ids) {
502
881
  catch {
503
882
  /* ignore */
504
883
  }
884
+ try {
885
+ db.prepare(`DELETE FROM utility_scores_scoped WHERE entry_id IN (${placeholders})`).run(...chunk);
886
+ }
887
+ catch {
888
+ /* ignore */
889
+ }
505
890
  // Clean up usage events before deleting entries
506
891
  try {
507
892
  db.prepare(`DELETE FROM usage_events WHERE entry_id IN (${placeholders})`).run(...chunk);
@@ -511,6 +896,26 @@ function deleteRelatedRows(db, ids) {
511
896
  }
512
897
  }
513
898
  }
/**
 * Remove entries by primary key, along with the rows that depend on them.
 *
 * Runs inside one transaction: the ids are first passed to
 * `deleteRelatedRows` (wrapped as `{ id }` objects, the shape that helper is
 * given elsewhere in this file), then the `entries` rows are deleted in slices
 * of `SQLITE_CHUNK_SIZE` so the `IN (...)` list stays within SQLite's
 * bound-parameter limit. An empty id list is a no-op.
 *
 * Used by the `--clean` post-pass to drop stale entries whose source files no
 * longer exist on disk.
 *
 * @param db Open database handle.
 * @param ids Primary-key ids of the entries to delete.
 */
export function deleteEntriesByIds(db, ids) {
    if (ids.length === 0) {
        return;
    }
    const purge = db.transaction(() => {
        deleteRelatedRows(db, ids.map((id) => ({ id })));
        let offset = 0;
        while (offset < ids.length) {
            const batch = ids.slice(offset, offset + SQLITE_CHUNK_SIZE);
            const marks = batch.map(() => "?").join(",");
            db.prepare(`DELETE FROM entries WHERE id IN (${marks})`).run(...batch);
            offset += SQLITE_CHUNK_SIZE;
        }
    });
    purge();
}
514
919
  /**
515
920
  * Rebuild the FTS5 search index.
516
921
  *
@@ -585,19 +990,32 @@ export function rebuildFts(db, options) {
585
990
  }
586
991
  // ── Vector operations ───────────────────────────────────────────────────────
/**
 * Store (or replace) the embedding vector for an entry.
 *
 * The vector is always written to the BLOB-backed `embeddings` table. When
 * sqlite-vec is available it is also written to `entries_vec` as a
 * delete-then-insert pair wrapped in a transaction, so the two statements are
 * applied together. A failure of the `entries_vec` write is ignored; the BLOB
 * row is still in place.
 *
 * Before writing, the entry's existence is checked. If the entry was deleted
 * after its id was queued for embedding, the insert would otherwise fail with
 * a foreign-key error and roll back the caller's whole batch; the check turns
 * that race into a clean skip.
 *
 * @param db Open database handle.
 * @param entryId Id of the entry the embedding belongs to.
 * @param embedding Numeric vector to persist.
 * @returns `false` when the entry no longer exists (nothing written), else `true`.
 */
export function upsertEmbedding(db, entryId, embedding) {
    const entryRow = db.prepare("SELECT 1 FROM entries WHERE id = ?").get(entryId);
    if (!entryRow) {
        return false;
    }
    const blob = float32Buffer(embedding);
    // The BLOB table is the baseline store and works without sqlite-vec.
    db.prepare("INSERT OR REPLACE INTO embeddings (id, embedding) VALUES (?, ?)").run(entryId, blob);
    if (!isVecAvailable(db)) {
        return true;
    }
    try {
        const replaceVecRow = db.transaction(() => {
            db.prepare("DELETE FROM entries_vec WHERE id = ?").run(entryId);
            db.prepare("INSERT INTO entries_vec (id, embedding) VALUES (?, ?)").run(entryId, blob);
        });
        replaceVecRow();
    }
    catch {
        /* ignore — vec table unavailable or constraint failure */
    }
    return true;
}
602
1020
  export function searchVec(db, queryEmbedding, k) {
603
1021
  // Fast path: use sqlite-vec when available
@@ -617,6 +1035,23 @@ export function searchVec(db, queryEmbedding, k) {
617
1035
  // Fallback: JS-based cosine similarity over BLOB table
618
1036
  return searchBlobVec(db, queryEmbedding, k);
619
1037
  }
/**
 * Return the k nearest neighbours of an already-indexed entry, using the
 * embedding persisted in the `embeddings` table — nothing is re-embedded.
 *
 * The stored BLOB is decoded as float32 values (dimension = byte length / 4)
 * and passed to `searchVec`. The query entry itself is typically part of the
 * result with a distance of about 0, so callers should filter it out by id.
 *
 * Returns `[]` when the entry has no stored embedding or the BLOB is unusable:
 * missing, empty, not a whole number of 4-byte floats, or rejected by
 * `bufferToFloat32`.
 *
 * @param db Open database handle.
 * @param id Id of the entry whose neighbours are wanted.
 * @param k Number of neighbours to request from `searchVec`.
 * @returns The result of `searchVec`, or `[]` when no usable embedding exists.
 */
export function getNeighborsByEntryId(db, id, k) {
    const row = db.prepare("SELECT embedding FROM embeddings WHERE id = ?").get(id);
    const blob = row?.embedding;
    // Validate the BLOB up front instead of dereferencing a NULL column or
    // silently truncating a partial trailing float.
    if (!blob || !(blob.byteLength > 0) || blob.byteLength % 4 !== 0) {
        return [];
    }
    const queryEmbedding = bufferToFloat32(blob, blob.byteLength / 4);
    if (!queryEmbedding) {
        return [];
    }
    return searchVec(db, queryEmbedding, k);
}
620
1055
  function float32Buffer(vec) {
621
1056
  const f32 = new Float32Array(vec);
622
1057
  return Buffer.from(f32.buffer);
@@ -723,7 +1158,7 @@ function runFtsQuery(db, ftsQuery, limit, entryType) {
723
1158
  JOIN entries e ON e.id = f.entry_id
724
1159
  WHERE entries_fts MATCH ?
725
1160
  AND e.entry_type = ?
726
- ORDER BY bm25Score
1161
+ ORDER BY bm25Score, e.id ASC
727
1162
  LIMIT ?
728
1163
  `;
729
1164
  params = [ftsQuery, entryType, limit];
@@ -735,7 +1170,7 @@ function runFtsQuery(db, ftsQuery, limit, entryType) {
735
1170
  FROM entries_fts f
736
1171
  JOIN entries e ON e.id = f.entry_id
737
1172
  WHERE entries_fts MATCH ?
738
- ORDER BY bm25Score
1173
+ ORDER BY bm25Score, e.id ASC
739
1174
  LIMIT ?
740
1175
  `;
741
1176
  params = [ftsQuery, limit];
@@ -784,21 +1219,7 @@ export function sanitizeFtsQuery(query) {
784
1219
  // contain ALL terms.
785
1220
  return tokens.join(" ");
786
1221
  }
787
- // ── All entries ─────────────────────────────────────────────────────────────
788
- export function getAllEntries(db, entryType) {
789
- let sql;
790
- let params;
791
- if (entryType && entryType !== "any") {
792
- sql =
793
- "SELECT id, entry_key, dir_path, file_path, stash_dir, entry_json, search_text FROM entries WHERE entry_type = ?";
794
- params = [entryType];
795
- }
796
- else {
797
- sql = "SELECT id, entry_key, dir_path, file_path, stash_dir, entry_json, search_text FROM entries";
798
- params = [];
799
- }
800
- const rows = db.prepare(sql).all(...params);
801
- // Guard against corrupt JSON — skip the row rather than crashing
1222
+ function parseEntryRows(rows, context) {
802
1223
  const entries = [];
803
1224
  for (const row of rows) {
804
1225
  let entry;
@@ -806,7 +1227,7 @@ export function getAllEntries(db, entryType) {
806
1227
  entry = JSON.parse(row.entry_json);
807
1228
  }
808
1229
  catch {
809
- warn(`[db] getAllEntries: skipping entry id=${row.id} — corrupt entry_json`);
1230
+ warn(`[db] ${context}: skipping entry id=${row.id} — corrupt entry_json`);
810
1231
  continue;
811
1232
  }
812
1233
  entries.push({
@@ -821,13 +1242,38 @@ export function getAllEntries(db, entryType) {
821
1242
  }
822
1243
  return entries;
823
1244
  }
/**
 * Load every row of `entries`, optionally restricted to one entry type.
 *
 * A missing `entryType`, or the value `"any"`, selects all rows; any other
 * value is bound to an `entry_type = ?` filter. The raw rows are handed to
 * `parseEntryRows` with the context label `"getAllEntries"`.
 *
 * @param db Open database handle.
 * @param entryType Optional entry type to filter on.
 * @returns Whatever `parseEntryRows` produces for the selected rows.
 */
export function getAllEntries(db, entryType) {
    const selectAll = "SELECT id, entry_key, dir_path, file_path, stash_dir, entry_json, search_text FROM entries";
    const filtered = Boolean(entryType) && entryType !== "any";
    const rows = filtered
        ? db.prepare(`${selectAll} WHERE entry_type = ?`).all(entryType)
        : db.prepare(selectAll).all();
    return parseEntryRows(rows, "getAllEntries");
}
/**
 * Resolve an asset ref to the id of a matching row in `entries`.
 *
 * The ref is split by `parseAssetRef` into a type and a name. A row matches
 * when its `entry_type` equals the type and its `entry_key` ends with
 * `<type>:<name>`. The name is tried as given first; if that finds nothing,
 * the counterpart spelling is tried — with a trailing `.md` removed when the
 * name has one, or appended when it does not.
 *
 * @param db Open database handle.
 * @param ref Asset ref understood by `parseAssetRef`.
 * @returns The id of the first matching entry, or `undefined` when none matches.
 */
export function findEntryIdByRef(db, ref) {
    const { type, name } = parseAssetRef(ref);
    const counterpart = name.endsWith(".md") ? name.slice(0, -3) : `${name}.md`;
    const lookup = db.prepare("SELECT id FROM entries WHERE entry_type = ? AND substr(entry_key, length(entry_key) - length(?) + 1) = ? LIMIT 1");
    for (const candidate of [name, counterpart]) {
        const suffix = `${type}:${candidate}`;
        const match = lookup.get(type, suffix, suffix);
        if (match) {
            return match.id;
        }
    }
    return undefined;
}
832
1278
  export function getEntryCount(db) {
833
1279
  const row = db.prepare("SELECT COUNT(*) AS cnt FROM entries").get();
@@ -856,28 +1302,7 @@ export function getEntriesByDir(db, dirPath) {
856
1302
  const rows = db
857
1303
  .prepare("SELECT id, entry_key, dir_path, file_path, stash_dir, entry_json, search_text FROM entries WHERE dir_path = ?")
858
1304
  .all(dirPath);
859
- // Guard against corrupt JSON — skip the row rather than crashing
860
- const entries = [];
861
- for (const row of rows) {
862
- let entry;
863
- try {
864
- entry = JSON.parse(row.entry_json);
865
- }
866
- catch {
867
- warn(`[db] getEntriesByDir: skipping entry id=${row.id} — corrupt entry_json`);
868
- continue;
869
- }
870
- entries.push({
871
- id: row.id,
872
- entryKey: row.entry_key,
873
- dirPath: row.dir_path,
874
- filePath: row.file_path,
875
- stashDir: row.stash_dir,
876
- entry,
877
- searchText: row.search_text,
878
- });
879
- }
880
- return entries;
1305
+ return parseEntryRows(rows, "getEntriesByDir");
881
1306
  }
882
1307
  /**
883
1308
  * Get the utility score for an entry, or undefined if none exists.
@@ -900,12 +1325,17 @@ export function getUtilityScore(db, entryId) {
900
1325
  }
901
1326
  /**
902
1327
  * Batch-load utility scores for multiple entry IDs in a single query.
903
- * Returns a Map keyed by entry_id for O(1) lookup.
1328
+ * Returns a `{ global, scoped }` pair, both Maps keyed by entry_id.
1329
+ *
1330
+ * When `scopeKey` is provided a second query runs against
1331
+ * `utility_scores_scoped` and the result is returned as `scoped`.
1332
+ * Both maps are always present; `scoped` is empty when `scopeKey` is absent.
904
1333
  */
905
- export function getUtilityScoresByIds(db, ids) {
1334
+ export function getUtilityScoresByIds(db, ids, scopeKey) {
1335
+ const global = new Map();
1336
+ const scoped = new Map();
906
1337
  if (ids.length === 0)
907
- return new Map();
908
- const result = new Map();
1338
+ return { global, scoped };
909
1339
  // Process in chunks to stay within SQLITE_MAX_VARIABLE_NUMBER
910
1340
  for (let i = 0; i < ids.length; i += SQLITE_CHUNK_SIZE) {
911
1341
  const chunk = ids.slice(i, i + SQLITE_CHUNK_SIZE);
@@ -914,7 +1344,7 @@ export function getUtilityScoresByIds(db, ids) {
914
1344
  .prepare(`SELECT entry_id, utility, show_count, search_count, select_rate, last_used_at, updated_at FROM utility_scores WHERE entry_id IN (${placeholders})`)
915
1345
  .all(...chunk);
916
1346
  for (const row of rows) {
917
- result.set(row.entry_id, {
1347
+ global.set(row.entry_id, {
918
1348
  entryId: row.entry_id,
919
1349
  utility: row.utility,
920
1350
  showCount: row.show_count,
@@ -924,13 +1354,34 @@ export function getUtilityScoresByIds(db, ids) {
924
1354
  updatedAt: row.updated_at,
925
1355
  });
926
1356
  }
1357
+ if (scopeKey) {
1358
+ const scopedRows = db
1359
+ .prepare(`SELECT entry_id, scope_key, utility, last_used_at FROM utility_scores_scoped WHERE scope_key = ? AND entry_id IN (${placeholders})`)
1360
+ .all(scopeKey, ...chunk);
1361
+ for (const row of scopedRows) {
1362
+ scoped.set(row.entry_id, {
1363
+ entryId: row.entry_id,
1364
+ scopeKey: row.scope_key,
1365
+ utility: row.utility,
1366
+ lastUsedAt: row.last_used_at,
1367
+ });
1368
+ }
1369
+ }
927
1370
  }
928
- return result;
1371
+ return { global, scoped };
929
1372
  }
930
1373
  /**
931
1374
  * Insert or update a utility score for an entry.
932
1375
  */
933
1376
  export function upsertUtilityScore(db, entryId, data) {
1377
+ // Pre-flight FK guard (mirrors `upsertEmbedding`): when an entry is
1378
+ // deleted between when its id is aggregated from usage_events and when
1379
+ // this INSERT runs, the FK constraint fails and rolls back the entire
1380
+ // finalize transaction. A cheap SELECT here turns the race into a
1381
+ // clean skip. Returns false when the entry no longer exists.
1382
+ const exists = db.prepare("SELECT 1 FROM entries WHERE id = ?").get(entryId);
1383
+ if (!exists)
1384
+ return false;
934
1385
  db.prepare(`
935
1386
  INSERT INTO utility_scores (entry_id, utility, show_count, search_count, select_rate, last_used_at, updated_at)
936
1387
  VALUES (?, ?, ?, ?, ?, ?, datetime('now'))
@@ -942,4 +1393,435 @@ export function upsertUtilityScore(db, entryId, data) {
942
1393
  last_used_at = excluded.last_used_at,
943
1394
  updated_at = datetime('now')
944
1395
  `).run(entryId, data.utility, data.showCount, data.searchCount, data.selectRate, data.lastUsedAt ?? null);
1396
+ return true;
1397
+ }
/**
 * Fetch the cached LLM result for an asset, provided it is still current.
 *
 * The row is looked up in `llm_enrichment_cache` by `asset_ref` and
 * `cache_variant`. It counts as a hit only when its stored `body_hash` equals
 * `currentBodyHash`; a differing hash means the body changed after the result
 * was cached.
 *
 * @param db Open database handle.
 * @param assetRef Cache key identifying the asset.
 * @param currentBodyHash Hash of the asset body as it is now.
 * @param cacheVariant Variant discriminator; defaults to the empty string.
 * @returns The cache entry, or `undefined` on a miss (no row, or hash mismatch);
 *          on a miss the caller should invoke the LLM and write a new entry.
 */
export function getLlmCacheEntry(db, assetRef, currentBodyHash, cacheVariant = "") {
    const cached = db
        .prepare("SELECT asset_ref, cache_variant, body_hash, result_json, updated_at FROM llm_enrichment_cache WHERE asset_ref = ? AND cache_variant = ?")
        .get(assetRef, cacheVariant);
    const isFresh = Boolean(cached) && cached.body_hash === currentBodyHash;
    if (!isFresh) {
        return undefined;
    }
    const { asset_ref: ref, cache_variant: variant, body_hash: bodyHash, result_json: resultJson, updated_at: updatedAt, } = cached;
    return { assetRef: ref, cacheVariant: variant, bodyHash, resultJson, updatedAt };
}
/**
 * Batched counterpart of {@link getLlmCacheEntry}.
 *
 * Loads the `llm_enrichment_cache` rows for all `refs` of one cache variant,
 * querying in slices of `SQLITE_CHUNK_SIZE` so each `IN (...)` list stays
 * within SQLite's bound-parameter limit.
 *
 * Unlike the single-entry lookup, no body-hash filtering is done here: callers
 * must compare each entry's `bodyHash` with the current body hash themselves.
 *
 * @param db Open database handle.
 * @param refs Asset refs to look up.
 * @param cacheVariant Variant discriminator; defaults to the empty string.
 * @returns Map from asset ref to its cache entry (refs without a row are absent).
 */
export function getLlmCacheEntriesByRefs(db, refs, cacheVariant = "") {
    const found = new Map();
    let offset = 0;
    while (offset < refs.length) {
        const batch = refs.slice(offset, offset + SQLITE_CHUNK_SIZE);
        offset += SQLITE_CHUNK_SIZE;
        const marks = batch.map(() => "?").join(", ");
        const query = db.prepare(`SELECT asset_ref, cache_variant, body_hash, result_json, updated_at FROM llm_enrichment_cache
           WHERE cache_variant = ? AND asset_ref IN (${marks})`);
        for (const hit of query.all(cacheVariant, ...batch)) {
            const { asset_ref: assetRef, cache_variant: variant, body_hash: bodyHash, result_json: resultJson, updated_at: updatedAt, } = hit;
            found.set(assetRef, { assetRef, cacheVariant: variant, bodyHash, resultJson, updatedAt });
        }
    }
    return found;
}
/**
 * Write a cached LLM result for an asset, replacing any earlier one.
 *
 * Inserts a row into `llm_enrichment_cache`; when a row with the same
 * `(asset_ref, cache_variant)` already exists, its `body_hash`, `result_json`
 * and `updated_at` are overwritten. `updated_at` is set to `Date.now()`.
 *
 * @param db Open database handle.
 * @param assetRef Cache key identifying the asset.
 * @param bodyHash Hash of the body the result was computed from.
 * @param resultJson Serialized result to store.
 * @param cacheVariant Variant discriminator; defaults to the empty string.
 */
export function upsertLlmCacheEntry(db, assetRef, bodyHash, resultJson, cacheVariant = "") {
    const upsert = db.prepare(`INSERT INTO llm_enrichment_cache (asset_ref, cache_variant, body_hash, result_json, updated_at)
       VALUES (?, ?, ?, ?, ?)
       ON CONFLICT(asset_ref, cache_variant) DO UPDATE SET
         body_hash = excluded.body_hash,
         result_json = excluded.result_json,
         updated_at = excluded.updated_at`);
    const writtenAt = Date.now();
    upsert.run(assetRef, cacheVariant, bodyHash, resultJson, writtenAt);
}
/**
 * Prune `llm_enrichment_cache` rows that no longer correspond to an entry.
 *
 * A cache row is kept when its `asset_ref` exactly equals either the
 * `file_path` or the `entry_key` of some row in `entries`; both forms are
 * checked because cache refs may be file paths or entry keys. Every other
 * cache row is deleted.
 *
 * Intended for the cleanup phase of an index run so the cache does not grow
 * without bound as assets are removed. Errors are ignored (for example when
 * the cache table does not exist).
 *
 * @param db Open database handle.
 */
export function clearStaleCacheEntries(db) {
    const pruneSql = `
      DELETE FROM llm_enrichment_cache
      WHERE asset_ref NOT IN (SELECT file_path FROM entries)
        AND asset_ref NOT IN (SELECT entry_key FROM entries)
    `;
    try {
        db.exec(pruneSql);
    }
    catch {
        /* ignore — table may not exist in very old DBs opened without ensureSchema */
    }
}
/**
 * Hash a string with SHA-256 and return the digest as lowercase hex.
 *
 * Uses the synchronous `Bun.CryptoHasher`, so it can be called inside
 * per-asset loops without awaiting. The result serves as the `body_hash`
 * value stored in `llm_enrichment_cache`.
 *
 * @param body Text to hash.
 * @returns Hex-encoded SHA-256 digest of `body`.
 */
export function computeBodyHash(body) {
    const sha256 = new Bun.CryptoHasher("sha256");
    sha256.update(body);
    const hexDigest = sha256.digest("hex");
    return hexDigest;
}
/**
 * Count how often each ref was retrieved.
 *
 * Tallies `usage_events` rows whose `event_type` is `'search'` or `'show'`,
 * grouped by `entry_ref`, for the supplied refs. Refs are queried in slices of
 * `SQLITE_CHUNK_SIZE` so each `IN (...)` list stays within SQLite's
 * bound-parameter limit. Used by the improve loop to find high-retrieval
 * assets without feedback.
 *
 * @param db Open database handle.
 * @param refs Entry refs to count events for.
 * @returns Map from ref to event count; refs with no events are absent.
 */
export function getRetrievalCounts(db, refs) {
    const tally = new Map();
    let offset = 0;
    while (offset < refs.length) {
        const batch = refs.slice(offset, offset + SQLITE_CHUNK_SIZE);
        offset += SQLITE_CHUNK_SIZE;
        const marks = batch.map(() => "?").join(", ");
        const query = db.prepare(`SELECT entry_ref, COUNT(*) AS cnt FROM usage_events
           WHERE event_type IN ('search','show') AND entry_ref IN (${marks})
           GROUP BY entry_ref`);
        for (const { entry_ref: ref, cnt } of query.all(...batch)) {
            tally.set(ref, cnt);
        }
    }
    return tally;
}
/**
 * Apply a MemRL reward signal to a batch of entries via exponential moving
 * average (EMA): next = clamp(current + lr * (reward - current), 0, 1).
 *
 * All bumps run inside a single transaction. The indexer (`akm index`)
 * overwrites these values on its next run, so bumps are temporary hints
 * between index runs rather than permanent overrides.
 *
 * Entries that no longer exist in `entries` are skipped. An entry can be
 * deleted between the moment its id was collected and the moment this runs;
 * inserting a score for it would then fail the foreign-key constraint and
 * roll back the bumps for every other entry in the batch. This mirrors the
 * pre-flight guards in `upsertEmbedding` and `upsertUtilityScore`.
 *
 * When `scopeKey` is provided, the same EMA is also applied to the
 * per-scope value in `utility_scores_scoped`, so per-project usage signals
 * accumulate alongside the global ones. The global table is always updated.
 *
 * @param db Open database handle.
 * @param entryIds Ids of the entries to bump; an empty list is a no-op.
 * @param reward Target value the utility is pulled towards.
 * @param lr Learning rate of the EMA step; defaults to 0.1.
 * @param scopeKey Optional scope whose scoped utility is bumped as well.
 */
export function bumpUtilityScoresBatch(db, entryIds, reward, lr = 0.1, scopeKey) {
    if (entryIds.length === 0)
        return;
    db.transaction(() => {
        const { global: scoreMap } = getUtilityScoresByIds(db, entryIds);
        const now = new Date().toISOString();
        const nowMs = Date.now();
        // Prepared once: existence probe used to skip entries deleted in the meantime.
        const existsStmt = db.prepare("SELECT 1 FROM entries WHERE id = ?");
        const stmt = db.prepare(`INSERT INTO utility_scores (entry_id, utility, show_count, search_count, select_rate, last_used_at, updated_at)
         VALUES (?, ?, 0, 0, 0, ?, ?)
         ON CONFLICT(entry_id) DO UPDATE SET
           utility = excluded.utility,
           updated_at = excluded.updated_at`);
        // Prepare scoped upsert once outside the loop when scopeKey is present.
        const scopedStmt = scopeKey
            ? db.prepare(`INSERT INTO utility_scores_scoped (entry_id, scope_key, utility, last_used_at)
             VALUES (?, ?, ?, ?)
             ON CONFLICT(entry_id, scope_key) DO UPDATE SET
               utility = excluded.utility,
               last_used_at = excluded.last_used_at`)
            : null;
        for (const entryId of entryIds) {
            if (!existsStmt.get(entryId)) {
                // Entry vanished — skip it instead of failing the whole batch.
                continue;
            }
            const existing = scoreMap.get(entryId);
            const current = existing?.utility ?? 0;
            const next = Math.max(0, Math.min(1, current + lr * (reward - current)));
            stmt.run(entryId, next, now, now);
            if (scopedStmt && scopeKey) {
                // Retrieve the current scoped utility so we can apply the same EMA.
                const scopedCurrent = getScopedUtility(db, entryId, scopeKey);
                const scopedNext = Math.max(0, Math.min(1, scopedCurrent + lr * (reward - scopedCurrent)));
                scopedStmt.run(entryId, scopeKey, scopedNext, nowMs);
            }
        }
    })();
}
1571
+ /**
1572
+ * Return the current utility value for a single (entry_id, scope_key) pair.
1573
+ * Returns 0 when no row exists yet.
1574
+ */
1575
+ function getScopedUtility(db, entryId, scopeKey) {
1576
+ const row = db
1577
+ .prepare("SELECT utility FROM utility_scores_scoped WHERE entry_id = ? AND scope_key = ?")
1578
+ .get(entryId, scopeKey);
1579
+ return row?.utility ?? 0;
1580
+ }
1581
+ // ── Indexer-phase helpers (moved from indexer.ts) ────────────────────────────
1582
/**
 * List the entries that still need an embedding.
 *
 * An entry qualifies when no `embeddings` row shares its id and its
 * `entry_type` is not 'vault'. Each returned row carries `id`, `searchText`,
 * `entryKey` and `filePath`.
 */
export function getAllEntriesForEmbedding(db) {
    const pendingEntries = db.prepare(`
    SELECT e.id, e.search_text AS searchText, e.entry_key AS entryKey, e.file_path AS filePath FROM entries e
    WHERE NOT EXISTS (SELECT 1 FROM embeddings b WHERE b.id = e.id)
    AND e.entry_type != 'vault'
  `);
    return pendingEntries.all();
}
1595
/**
 * Insert or replace the `workflow_documents` row for an indexed entry.
 *
 * Stores the workflow document serialised as JSON together with its schema
 * version, its source path, a hash of `content` (see computeSourceHash) and
 * the current time as `updated_at`. An existing row for the same `entry_id`
 * has all of those columns overwritten.
 */
export function upsertWorkflowDocument(db, entryId, doc, content) {
    const sourceHash = computeSourceHash(content);
    const upsert = db.prepare(`INSERT INTO workflow_documents (entry_id, schema_version, document_json, source_path, source_hash, updated_at)
     VALUES (?, ?, ?, ?, ?, ?)
     ON CONFLICT(entry_id) DO UPDATE SET
       schema_version = excluded.schema_version,
       document_json = excluded.document_json,
       source_path = excluded.source_path,
       source_hash = excluded.source_hash,
       updated_at = excluded.updated_at`);
    // Bound in the same order as the column list above.
    const params = [
        entryId,
        doc.schemaVersion,
        JSON.stringify(doc),
        doc.source.path,
        sourceHash,
        new Date().toISOString(),
    ];
    upsert.run(...params);
}
1611
/**
 * Compute a cheap 32-bit FNV-1a hash of source content for source-identity
 * tracking. Not security-sensitive; used as an incremental fast-path skip key.
 *
 * Accepts either a byte sequence (Buffer / Uint8Array / array of byte values)
 * or a string. Strings are hashed over their UTF-8 bytes; previously indexing
 * a string produced one-character strings that coerced to 0 in the XOR, so the
 * result depended almost only on the string's length.
 *
 * @param content - Bytes or text to hash.
 * @returns The unsigned 32-bit hash as lowercase hex (no zero padding).
 */
export function computeSourceHash(content) {
    const bytes = typeof content === "string" ? new TextEncoder().encode(content) : content;
    let hash = 0x811c9dc5; // FNV-1a 32-bit offset basis
    for (let i = 0; i < bytes.length; i++) {
        hash ^= bytes[i];
        // Math.imul keeps the multiply by the FNV prime within 32 bits.
        hash = Math.imul(hash, 0x01000193);
    }
    // `>>> 0` reinterprets the signed 32-bit result as unsigned before formatting.
    return (hash >>> 0).toString(16);
}
1623
/**
 * Return up to 20 distinct zero-result search queries from the `usage_events`
 * table within the given lookback window, most recently seen first.
 *
 * Reads from `usage_events` (event_type = 'search') where the metadata JSON
 * blob contains `resultCount = 0`. The `search_events` table never existed;
 * all errors are caught and an empty array is returned so callers never need
 * to guard against DB schema differences.
 *
 * Queries are grouped rather than selected with DISTINCT so that ordering by
 * recency is well defined: each query is ranked by its latest `created_at`.
 * Events without a `query` value are excluded in SQL so they cannot occupy one
 * of the LIMIT slots.
 *
 * @param db - Open database connection.
 * @param sinceDays - Lookback window in days (default 30).
 * @returns Array of query strings; empty when nothing matches or the query fails.
 */
export function getZeroResultSearches(db, sinceDays = 30) {
    const since = new Date(Date.now() - sinceDays * 24 * 60 * 60 * 1000).toISOString();
    try {
        const rows = db
            .prepare(`SELECT json_extract(metadata, '$.query') AS query
         FROM usage_events
         WHERE event_type = 'search'
           AND created_at >= ?
           AND json_extract(metadata, '$.resultCount') = 0
           AND json_extract(metadata, '$.query') IS NOT NULL
         GROUP BY json_extract(metadata, '$.query')
         ORDER BY MAX(created_at) DESC LIMIT 20`)
            .all(since);
        return rows.map((r) => r.query).filter((q) => q !== null);
    }
    catch {
        return []; // table may not exist in older DBs
    }
}
1649
/**
 * Find the `entries` row for an asset identified by type and name.
 *
 * Matches on `entry_type = type` and `entry_key = "<type>:<name>"` and selects
 * only the `id` column. When nothing matches, the result is whatever the
 * statement's `get` yields for "no row".
 */
export function getEntryByRef(db, type, name) {
    const lookup = db.prepare("SELECT id FROM entries WHERE entry_type = ? AND entry_key = ?");
    const entryKey = `${type}:${name}`;
    return lookup.get(type, entryKey);
}
1656
/**
 * MemRL learning rate for feedback-driven utility updates (F-5 / #386).
 *
 * Follows the bounded-step formula from MemRL (arXiv:2601.03192):
 *   next = clamp(current + lr × (reward − current), 0, 1)
 *
 * This replaces the unbounded `-0.03 × negativeCount` delta that could
 * silently remove high-utility assets from the improvement loop.
 */
const FEEDBACK_LR = 0.1;
/**
 * Reward used when the feedback is entirely positive.
 * Reward 1.0 means "fully correct / helpful".
 */
const FEEDBACK_REWARD_POSITIVE = 1.0;
/**
 * Reward used when the feedback is entirely negative.
 * Reward 0.0 means "not helpful" (lowest MemRL signal).
 */
const FEEDBACK_REWARD_NEGATIVE = 0.0;
/**
 * Maximum downward utility move allowed in a single
 * `applyFeedbackToUtilityScore` call, regardless of negativeCount.
 *
 * The cap is applied per call (not per day): however many negatives are
 * passed in one call, utility drops by at most this amount. It exists so a
 * noisy negative-feedback stream cannot silently destroy a high-utility
 * asset's ranking.
 *
 * NOTE(review): with FEEDBACK_LR = 0.1 and reward/utility both within [0, 1]
 * the raw EMA step is never below -0.1, so this cap does not currently bind;
 * it only matters if the learning rate is raised above this value.
 */
const MAX_NEG_DELTA_PER_CALL = 0.15;
/**
 * Utility threshold below which a review-needed escalation is triggered.
 * When a previously high-utility asset (≥ HIGH_UTILITY_THRESHOLD) drops
 * below this value, the caller should create an escalation proposal.
 */
export const UTILITY_REVIEW_THRESHOLD = 0.5;
/**
 * Utility level considered "high" — assets at or above this value are the
 * ones eligible for threshold-crossing escalation.
 *
 * NOTE(review): currently equal to UTILITY_REVIEW_THRESHOLD, so any drop from
 * at-or-above 0.5 to below 0.5 counts as a crossing.
 */
export const HIGH_UTILITY_THRESHOLD = 0.5;
1697
/**
 * Apply accumulated feedback counts to the utility score of an entry using the
 * MemRL bounded-step EMA formula (F-5 / #386, arXiv:2601.03192).
 *
 * Replaces the previous unbounded `-0.03 × negativeCount` formula with:
 *
 *   reward   = weighted average of positive and negative signals
 *   nextUtil = clamp(currentUtil + lr × (reward − currentUtil), 0, 1)
 *
 * The negative impact is additionally capped at {@link MAX_NEG_DELTA_PER_CALL}
 * to prevent a noisy feedback stream from silently erasing a high-utility asset.
 *
 * An entry without a stored score starts at 0.5 (neutral midpoint) before the
 * EMA step is applied. Counts that are negative or not finite are treated as
 * 0, so malformed input can never write a NaN utility; when no usable feedback
 * remains the score is left untouched and nothing is written.
 *
 * @param db - Open database connection.
 * @param entryId - Id of the entry whose utility is updated.
 * @param positiveCount - Number of positive feedback signals.
 * @param negativeCount - Number of negative feedback signals.
 * @returns `{ previousUtility, nextUtility, crossedReviewThreshold }`, where
 *   `crossedReviewThreshold` is true when the utility was at or above
 *   HIGH_UTILITY_THRESHOLD before the call and is below
 *   UTILITY_REVIEW_THRESHOLD after it, so the caller can create an
 *   escalation proposal.
 */
export function applyFeedbackToUtilityScore(db, entryId, positiveCount, negativeCount) {
    const existing = getUtilityScore(db, entryId);
    const previousUtility = existing?.utility ?? 0.5;
    const positives = Number.isFinite(positiveCount) ? Math.max(0, positiveCount) : 0;
    const negatives = Number.isFinite(negativeCount) ? Math.max(0, negativeCount) : 0;
    const total = positives + negatives;
    if (total === 0) {
        return { previousUtility, nextUtility: previousUtility, crossedReviewThreshold: false };
    }
    // Weighted reward: each signal contributes its reward value in proportion
    // to its count. All-positive yields FEEDBACK_REWARD_POSITIVE and
    // all-negative yields FEEDBACK_REWARD_NEGATIVE without special-casing.
    const reward = (positives * FEEDBACK_REWARD_POSITIVE + negatives * FEEDBACK_REWARD_NEGATIVE) / total;
    // MemRL bounded-step EMA: lr × (reward − current), with downward moves
    // limited to MAX_NEG_DELTA_PER_CALL per call.
    const rawDelta = FEEDBACK_LR * (reward - previousUtility);
    const delta = rawDelta < 0 ? Math.max(rawDelta, -MAX_NEG_DELTA_PER_CALL) : rawDelta;
    const nextUtility = Math.max(0, Math.min(1, previousUtility + delta));
    const now = new Date().toISOString();
    // On conflict only utility and updated_at change; counters and
    // last_used_at of an existing row are preserved.
    db.prepare(`
    INSERT INTO utility_scores (entry_id, utility, show_count, search_count, select_rate, last_used_at, updated_at)
    VALUES (?, ?, 0, 0, 0, ?, ?)
    ON CONFLICT(entry_id) DO UPDATE SET
      utility = excluded.utility,
      updated_at = excluded.updated_at
  `).run(entryId, nextUtility, now, now);
    const crossedReviewThreshold = previousUtility >= HIGH_UTILITY_THRESHOLD && nextUtility < UTILITY_REVIEW_THRESHOLD;
    return { previousUtility, nextUtility, crossedReviewThreshold };
}
1746
/**
 * Re-attach `usage_events` rows to current entries after entry ids changed.
 *
 * Runs two statements in order: first clear every `entry_id` that no longer
 * exists in `entries`, then fill each NULL `entry_id` from an entry whose
 * `entry_key` ends with ":" + the event's `entry_ref`. Any error from either
 * statement is swallowed.
 */
export function relinkUsageEvents(db) {
    // Step 1: detach events pointing at ids that are gone from `entries`.
    // The original author notes that leaving them would let
    // `recomputeUtilityScores` aggregate by a missing entry_id, trip the FK on
    // the utility_scores INSERT and roll back the whole finalize transaction.
    // Rows that cannot be re-resolved in step 2 stay NULL and are expected to
    // age out via the 90-day retention policy.
    const detachStaleSql = `
      UPDATE usage_events
      SET entry_id = NULL
      WHERE entry_id IS NOT NULL
        AND entry_id NOT IN (SELECT id FROM entries)
    `;
    // Step 2: resolve NULL entry_ids through entry_ref against the current
    // entries table; this also picks up entries re-created under the same ref
    // (e.g. an asset moved between sources).
    const reattachByRefSql = `
      UPDATE usage_events SET entry_id = (
        SELECT e.id FROM entries e
        WHERE substr(e.entry_key, length(e.entry_key) - length(usage_events.entry_ref)) = ':' || usage_events.entry_ref
        LIMIT 1
      )
      WHERE entry_id IS NULL AND entry_ref IS NOT NULL
    `;
    try {
        db.exec(detachStaleSql);
        db.exec(reattachByRefSql);
    }
    catch {
        /* ignore if table doesn't exist yet */
    }
}
1784
+ // ── registry_index_cache helpers ─────────────────────────────────────────────
1785
/**
 * Store (insert or overwrite) the cached index document for a registry.
 *
 * The row is keyed by `registry_url`; on conflict every other column is
 * replaced. `fetched_at` is always set to the current time.
 *
 * @param db - Open index.db connection (from openDatabase / openExistingDatabase).
 * @param registryUrl - Canonical URL of the registry (used as primary key).
 * @param indexJson - Serialised registry index document (JSON string).
 * @param opts.etag - HTTP ETag from the response (optional, stored as NULL when absent).
 * @param opts.lastModified - HTTP Last-Modified from the response (optional, stored as NULL when absent).
 */
export function upsertRegistryIndexCache(db, registryUrl, indexJson, opts) {
    const upsert = db.prepare(`
    INSERT INTO registry_index_cache (registry_url, fetched_at, etag, last_modified, index_json)
    VALUES (?, ?, ?, ?, ?)
    ON CONFLICT(registry_url) DO UPDATE SET
      fetched_at    = excluded.fetched_at,
      etag          = excluded.etag,
      last_modified = excluded.last_modified,
      index_json    = excluded.index_json
  `);
    const fetchedAt = new Date().toISOString();
    const etag = opts?.etag ?? null;
    const lastModified = opts?.lastModified ?? null;
    upsert.run(registryUrl, fetchedAt, etag, lastModified, indexJson);
}
1805
/**
 * Fetch the cached index document for a registry, if it is still fresh.
 *
 * Yields undefined when there is no row, when the stored `fetched_at` cannot
 * be parsed as a date, or when the row's age (`Date.now()` minus `fetched_at`)
 * exceeds `maxAgeMs`.
 *
 * @param db - Open index.db connection.
 * @param registryUrl - Canonical URL of the registry (primary key).
 * @param maxAgeMs - Maximum age in milliseconds before the entry is stale (default: 1 hour).
 * @returns `{ indexJson, etag, lastModified }` for a fresh hit, otherwise undefined.
 */
export function getRegistryIndexCache(db, registryUrl, maxAgeMs = 3_600_000 /* 1 hour */) {
    const lookup = db.prepare(`SELECT fetched_at, etag, last_modified, index_json
       FROM registry_index_cache WHERE registry_url = ?`);
    const cached = lookup.get(registryUrl);
    if (!cached) {
        return undefined;
    }
    // An unparseable timestamp makes the age NaN, which is treated as a miss.
    const ageMs = Date.now() - Date.parse(cached.fetched_at);
    const isStale = Number.isNaN(ageMs) || ageMs > maxAgeMs;
    if (isStale) {
        return undefined;
    }
    const { index_json: indexJson, etag, last_modified: lastModified } = cached;
    return { indexJson, etag, lastModified };
}