akm-cli 0.7.5 → 0.8.0-rc.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/{.github/CHANGELOG.md → CHANGELOG.md} +113 -2
- package/README.md +20 -4
- package/SECURITY.md +93 -0
- package/dist/cli/config-migrate.js +144 -0
- package/dist/cli/config-validate.js +39 -0
- package/dist/cli/confirm.js +73 -0
- package/dist/cli/parse-args.js +133 -0
- package/dist/cli.js +1995 -551
- package/dist/commands/agent-dispatch.js +110 -0
- package/dist/commands/agent-support.js +68 -0
- package/dist/commands/completions.js +3 -0
- package/dist/commands/config-cli.js +130 -534
- package/dist/commands/consolidate.js +1531 -0
- package/dist/commands/curate.js +44 -3
- package/dist/commands/db-cli.js +23 -0
- package/dist/commands/distill-promotion-policy.js +660 -0
- package/dist/commands/distill.js +990 -75
- package/dist/commands/eval-cases.js +43 -0
- package/dist/commands/events.js +5 -23
- package/dist/commands/graph.js +477 -0
- package/dist/commands/health.js +400 -0
- package/dist/commands/help/help-accept.md +9 -0
- package/dist/commands/help/help-improve.md +77 -0
- package/dist/commands/help/help-proposals.md +15 -0
- package/dist/commands/help/help-propose.md +17 -0
- package/dist/commands/help/help-reject.md +8 -0
- package/dist/commands/history.js +54 -46
- package/dist/commands/improve-profiles.js +146 -0
- package/dist/commands/improve-result-file.js +103 -0
- package/dist/commands/improve.js +2175 -0
- package/dist/commands/info.js +5 -2
- package/dist/commands/init.js +50 -2
- package/dist/commands/installed-stashes.js +102 -139
- package/dist/commands/knowledge.js +136 -0
- package/dist/commands/lint/agent-linter.js +49 -0
- package/dist/commands/lint/base-linter.js +479 -0
- package/dist/commands/lint/command-linter.js +49 -0
- package/dist/commands/lint/default-linter.js +16 -0
- package/dist/commands/lint/index.js +183 -0
- package/dist/commands/lint/knowledge-linter.js +16 -0
- package/dist/commands/lint/markdown-insertion.js +343 -0
- package/dist/commands/lint/memory-linter.js +61 -0
- package/dist/commands/lint/registry.js +36 -0
- package/dist/commands/lint/skill-linter.js +45 -0
- package/dist/commands/lint/task-linter.js +50 -0
- package/dist/commands/lint/types.js +4 -0
- package/dist/commands/lint/vault-key-rules.js +139 -0
- package/dist/commands/lint/workflow-linter.js +56 -0
- package/dist/commands/lint.js +4 -0
- package/dist/commands/migration-help.js +5 -2
- package/dist/commands/proposal.js +66 -12
- package/dist/commands/propose.js +86 -31
- package/dist/commands/reflect.js +1119 -73
- package/dist/commands/registry-search.js +5 -2
- package/dist/commands/remember.js +69 -6
- package/dist/commands/schema-repair.js +203 -0
- package/dist/commands/search.js +115 -14
- package/dist/commands/self-update.js +3 -0
- package/dist/commands/show.js +144 -25
- package/dist/commands/source-add.js +17 -45
- package/dist/commands/source-clone.js +3 -0
- package/dist/commands/source-manage.js +14 -19
- package/dist/commands/tasks.js +438 -0
- package/dist/commands/url-checker.js +42 -0
- package/dist/commands/vault.js +130 -77
- package/dist/core/action-contributors.js +28 -0
- package/dist/core/asset-ref.js +7 -0
- package/dist/core/asset-registry.js +7 -16
- package/dist/core/asset-serialize.js +88 -0
- package/dist/core/asset-spec.js +22 -0
- package/dist/core/common.js +157 -0
- package/dist/core/concurrent.js +25 -0
- package/dist/core/config-io.js +347 -0
- package/dist/core/config-migration.js +625 -0
- package/dist/core/config-schema.js +501 -0
- package/dist/core/config-sources.js +108 -0
- package/dist/core/config-types.js +4 -0
- package/dist/core/config-walker.js +337 -0
- package/dist/core/config.js +327 -987
- package/dist/core/errors.js +40 -19
- package/dist/core/events.js +91 -138
- package/dist/core/file-lock.js +104 -0
- package/dist/core/frontmatter.js +3 -6
- package/dist/core/lesson-lint.js +3 -0
- package/dist/core/markdown.js +20 -0
- package/dist/core/memory-belief.js +62 -0
- package/dist/core/memory-contradiction-detect.js +274 -0
- package/dist/core/memory-improve.js +806 -0
- package/dist/core/parse.js +158 -0
- package/dist/core/paths.js +326 -14
- package/dist/core/proposal-quality-validators.js +364 -0
- package/dist/core/proposal-validators.js +69 -0
- package/dist/core/proposals.js +498 -42
- package/dist/core/state-db.js +927 -0
- package/dist/core/text-truncation.js +107 -0
- package/dist/core/time.js +54 -0
- package/dist/core/warn.js +62 -1
- package/dist/core/write-source.js +3 -0
- package/dist/indexer/db-backup.js +391 -0
- package/dist/indexer/db-search.js +152 -253
- package/dist/indexer/db.js +933 -103
- package/dist/indexer/ensure-index.js +64 -0
- package/dist/indexer/file-context.js +3 -0
- package/dist/indexer/graph-boost.js +376 -101
- package/dist/indexer/graph-db.js +391 -0
- package/dist/indexer/graph-dedup.js +95 -0
- package/dist/indexer/graph-extraction.js +550 -124
- package/dist/indexer/index-context.js +4 -0
- package/dist/indexer/indexer.js +506 -291
- package/dist/indexer/llm-cache.js +47 -0
- package/dist/indexer/manifest.js +3 -0
- package/dist/indexer/matchers.js +148 -160
- package/dist/indexer/memory-inference.js +99 -74
- package/dist/indexer/metadata-contributors.js +29 -0
- package/dist/indexer/metadata.js +255 -196
- package/dist/indexer/path-resolver.js +92 -0
- package/dist/indexer/project-context.js +192 -0
- package/dist/indexer/ranking-contributors.js +331 -0
- package/dist/indexer/ranking.js +81 -0
- package/dist/indexer/search-fields.js +5 -9
- package/dist/indexer/search-hit-enrichers.js +111 -0
- package/dist/indexer/search-source.js +44 -10
- package/dist/indexer/semantic-status.js +5 -16
- package/dist/indexer/staleness-detect.js +447 -0
- package/dist/indexer/usage-events.js +12 -9
- package/dist/indexer/walker.js +28 -0
- package/dist/integrations/agent/builders.js +135 -0
- package/dist/integrations/agent/config.js +122 -230
- package/dist/integrations/agent/detect.js +3 -0
- package/dist/integrations/agent/index.js +7 -13
- package/dist/integrations/agent/model-aliases.js +55 -0
- package/dist/integrations/agent/profiles.js +70 -5
- package/dist/integrations/agent/prompts.js +150 -74
- package/dist/integrations/agent/runner.js +151 -0
- package/dist/integrations/agent/sdk-runner.js +126 -0
- package/dist/integrations/agent/spawn.js +118 -23
- package/dist/integrations/github.js +3 -0
- package/dist/integrations/lockfile.js +32 -69
- package/dist/integrations/session-logs/index.js +68 -0
- package/dist/integrations/session-logs/providers/claude-code.js +59 -0
- package/dist/integrations/session-logs/providers/opencode.js +55 -0
- package/dist/integrations/session-logs/types.js +4 -0
- package/dist/llm/call-ai.js +62 -0
- package/dist/llm/client.js +72 -124
- package/dist/llm/embedder.js +3 -19
- package/dist/llm/embedders/cache.js +3 -7
- package/dist/llm/embedders/local.js +3 -0
- package/dist/llm/embedders/remote.js +20 -8
- package/dist/llm/embedders/types.js +3 -7
- package/dist/llm/feature-gate.js +89 -48
- package/dist/llm/graph-extract.js +676 -70
- package/dist/llm/index-passes.js +9 -23
- package/dist/llm/memory-infer.js +52 -71
- package/dist/llm/metadata-enhance.js +42 -29
- package/dist/llm/prompts/graph-extract-user-prompt.md +35 -0
- package/dist/output/cli-hints-full.md +281 -0
- package/dist/output/cli-hints-short.md +65 -0
- package/dist/output/cli-hints.js +5 -318
- package/dist/output/context.js +3 -0
- package/dist/output/renderers.js +223 -256
- package/dist/output/shapes.js +150 -105
- package/dist/output/text.js +318 -30
- package/dist/registry/build-index.js +3 -0
- package/dist/registry/create-provider-registry.js +3 -0
- package/dist/registry/factory.js +3 -0
- package/dist/registry/origin-resolve.js +3 -0
- package/dist/registry/providers/index.js +3 -0
- package/dist/registry/providers/skills-sh.js +70 -49
- package/dist/registry/providers/static-index.js +53 -48
- package/dist/registry/providers/types.js +3 -24
- package/dist/registry/resolve.js +11 -16
- package/dist/registry/types.js +3 -0
- package/dist/scripts/migrate-storage.js +17307 -0
- package/dist/scripts/migrations/import-fs-improve-runs-to-db.js +8900 -0
- package/dist/scripts/migrations/v16-to-v17.js +141 -0
- package/dist/setup/detect.js +3 -0
- package/dist/setup/ripgrep-install.js +3 -0
- package/dist/setup/ripgrep-resolve.js +3 -0
- package/dist/setup/setup.js +775 -37
- package/dist/setup/steps.js +3 -15
- package/dist/sources/include.js +3 -0
- package/dist/sources/provider-factory.js +5 -12
- package/dist/sources/provider.js +3 -20
- package/dist/sources/providers/filesystem.js +19 -23
- package/dist/sources/providers/git.js +7 -5
- package/dist/sources/providers/index.js +3 -0
- package/dist/sources/providers/install-types.js +3 -13
- package/dist/sources/providers/npm.js +3 -4
- package/dist/sources/providers/provider-utils.js +3 -0
- package/dist/sources/providers/sync-from-ref.js +3 -11
- package/dist/sources/providers/tar-utils.js +3 -0
- package/dist/sources/providers/website.js +18 -22
- package/dist/sources/resolve.js +3 -0
- package/dist/sources/types.js +3 -0
- package/dist/sources/website-ingest.js +7 -0
- package/dist/tasks/backends/cron.js +203 -0
- package/dist/tasks/backends/exec-utils.js +28 -0
- package/dist/tasks/backends/index.js +24 -0
- package/dist/tasks/backends/launchd-template.xml +19 -0
- package/dist/tasks/backends/launchd.js +187 -0
- package/dist/tasks/backends/schtasks-template.xml +29 -0
- package/dist/tasks/backends/schtasks.js +215 -0
- package/dist/tasks/parser.js +211 -0
- package/dist/tasks/resolveAkmBin.js +87 -0
- package/dist/tasks/runner.js +458 -0
- package/dist/tasks/schedule.js +211 -0
- package/dist/tasks/schema.js +15 -0
- package/dist/tasks/validator.js +62 -0
- package/dist/version.js +3 -0
- package/dist/wiki/index-template.md +12 -0
- package/dist/wiki/ingest-workflow-template.md +54 -0
- package/dist/wiki/log-template.md +8 -0
- package/dist/wiki/schema-template.md +61 -0
- package/dist/wiki/wiki-templates.js +15 -0
- package/dist/wiki/wiki.js +13 -61
- package/dist/workflows/authoring.js +8 -25
- package/dist/workflows/cli.js +3 -0
- package/dist/workflows/db.js +140 -10
- package/dist/workflows/document-cache.js +3 -10
- package/dist/workflows/parser.js +3 -0
- package/dist/workflows/renderer.js +11 -3
- package/dist/workflows/runs.js +62 -91
- package/dist/workflows/schema.js +3 -0
- package/dist/workflows/scope-key.js +3 -0
- package/dist/workflows/validator.js +4 -8
- package/dist/workflows/workflow-template.md +24 -0
- package/docs/README.md +9 -2
- package/docs/data-and-telemetry.md +225 -0
- package/docs/migration/release-notes/0.7.0.md +1 -1
- package/docs/migration/release-notes/0.7.5.md +2 -2
- package/docs/migration/release-notes/0.8.0.md +48 -0
- package/docs/migration/v0.7-to-v0.8.md +1307 -0
- package/package.json +20 -8
- package/.github/LICENSE +0 -374
- package/dist/commands/install-audit.js +0 -381
- package/dist/templates/wiki-templates.js +0 -100
package/dist/indexer/db.js
CHANGED
|
@@ -1,16 +1,22 @@
|
|
|
1
|
+
// This Source Code Form is subject to the terms of the Mozilla Public
|
|
2
|
+
// License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
3
|
+
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
|
|
1
4
|
import { Database } from "bun:sqlite";
|
|
2
5
|
import fs from "node:fs";
|
|
3
6
|
import { createRequire } from "node:module";
|
|
4
7
|
import path from "node:path";
|
|
5
8
|
import { parseAssetRef } from "../core/asset-ref";
|
|
6
9
|
import { getDbPath } from "../core/paths";
|
|
10
|
+
import { REGISTRY_INDEX_CACHE_DDL } from "../core/state-db";
|
|
7
11
|
import { warn } from "../core/warn";
|
|
8
12
|
import { cosineSimilarity } from "../llm/embedders/types";
|
|
13
|
+
import { backupDataDir, EMBEDDING_DIM_CHANGE_REASON } from "./db-backup";
|
|
9
14
|
import { buildSearchFields } from "./search-fields";
|
|
10
15
|
import { ensureUsageEventsSchema } from "./usage-events";
|
|
11
16
|
// ── Constants ───────────────────────────────────────────────────────────────
|
|
12
|
-
export const DB_VERSION =
|
|
17
|
+
export const DB_VERSION = 17;
|
|
13
18
|
export const EMBEDDING_DIM = 384;
|
|
19
|
+
export const GRAPH_SCHEMA_VERSION = 3;
|
|
14
20
|
// ── Database lifecycle ──────────────────────────────────────────────────────
|
|
15
21
|
export function openDatabase(dbPath, options) {
|
|
16
22
|
const resolvedPath = dbPath ?? getDbPath();
|
|
@@ -24,11 +30,39 @@ export function openDatabase(dbPath, options) {
|
|
|
24
30
|
db.exec("PRAGMA foreign_keys = ON");
|
|
25
31
|
// Try to load sqlite-vec extension
|
|
26
32
|
loadVecExtension(db);
|
|
27
|
-
|
|
33
|
+
// Dim resolution: explicit option wins; otherwise consult the on-disk
|
|
34
|
+
// config so unparameterised opens (registry providers, graph helpers,
|
|
35
|
+
// ad-hoc CLI subcommands) honour the operator-declared dimension. Only if
|
|
36
|
+
// both are absent do we fall through to the no-clobber path, which keeps
|
|
37
|
+
// ensureSchema from touching `index_meta.embeddingDim` at all.
|
|
38
|
+
const resolvedDim = options?.embeddingDim ?? resolveConfiguredEmbeddingDim();
|
|
39
|
+
ensureSchema(db, resolvedDim, { dataDir: dir });
|
|
28
40
|
// Warn once at init if using JS fallback with many entries
|
|
29
41
|
warnIfVecMissing(db, { once: true });
|
|
30
42
|
return db;
|
|
31
43
|
}
|
|
44
|
+
/**
|
|
45
|
+
* Read the operator-configured embedding dimension from the on-disk config.
|
|
46
|
+
* Returns `undefined` when no config file is present, when the config has
|
|
47
|
+
* no `embedding.dimension` set, or when reading the config throws (e.g.
|
|
48
|
+
* inside isolated test fixtures with no XDG home). Failure is silent on
|
|
49
|
+
* purpose — every openDatabase() call would otherwise have to handle a
|
|
50
|
+
* config-not-found error path, and the fallback (no-clobber semantics) is
|
|
51
|
+
* already correct.
|
|
52
|
+
*/
|
|
53
|
+
function resolveConfiguredEmbeddingDim() {
|
|
54
|
+
try {
|
|
55
|
+
const { loadConfig } = require("../core/config");
|
|
56
|
+
const dim = loadConfig().embedding?.dimension;
|
|
57
|
+
if (typeof dim === "number" && Number.isInteger(dim) && dim > 0 && dim <= 4096) {
|
|
58
|
+
return dim;
|
|
59
|
+
}
|
|
60
|
+
return undefined;
|
|
61
|
+
}
|
|
62
|
+
catch {
|
|
63
|
+
return undefined;
|
|
64
|
+
}
|
|
65
|
+
}
|
|
32
66
|
export function openExistingDatabase(dbPath) {
|
|
33
67
|
const resolvedPath = dbPath ?? getDbPath();
|
|
34
68
|
const db = new Database(resolvedPath);
|
|
@@ -86,7 +120,7 @@ export function warnIfVecMissing(db, { once } = { once: false }) {
|
|
|
86
120
|
/* embeddings table may not exist yet during init */
|
|
87
121
|
}
|
|
88
122
|
}
|
|
89
|
-
function ensureSchema(db, embeddingDim) {
|
|
123
|
+
function ensureSchema(db, embeddingDim, options) {
|
|
90
124
|
// Create meta table first so we can check version
|
|
91
125
|
db.exec(`
|
|
92
126
|
CREATE TABLE IF NOT EXISTS index_meta (
|
|
@@ -94,6 +128,39 @@ function ensureSchema(db, embeddingDim) {
|
|
|
94
128
|
value TEXT NOT NULL
|
|
95
129
|
);
|
|
96
130
|
`);
|
|
131
|
+
// MVP DB-backup hook (0.8.x): when the stored DB version differs from the
|
|
132
|
+
// running binary's DB_VERSION, snapshot the data directory BEFORE
|
|
133
|
+
// `handleVersionUpgrade()` drops tables. This is best-effort —
|
|
134
|
+
// `backupDataDir` returns null on opt-out, missing data dir, low free
|
|
135
|
+
// space, or copy errors, and we proceed with the upgrade in all cases.
|
|
136
|
+
// The proper migration framework lands in 0.9.0; until then this lets
|
|
137
|
+
// operators recover with `scripts/migrations/restore-data-dir.sh`.
|
|
138
|
+
if (options?.dataDir) {
|
|
139
|
+
const storedVersionRaw = getMeta(db, "version");
|
|
140
|
+
const storedVersion = storedVersionRaw !== undefined && storedVersionRaw !== "" ? Number.parseInt(storedVersionRaw, 10) : null;
|
|
141
|
+
const willUpgrade = storedVersionRaw !== undefined && storedVersionRaw !== "" && storedVersionRaw !== String(DB_VERSION);
|
|
142
|
+
if (willUpgrade) {
|
|
143
|
+
try {
|
|
144
|
+
// Pass env explicitly so tests can override AKM_DB_BACKUP / AKM_DB_BACKUP_RETAIN
|
|
145
|
+
// without mutating process.env. Production callers default to process.env.
|
|
146
|
+
const result = backupDataDir({
|
|
147
|
+
dataDir: options.dataDir,
|
|
148
|
+
sourceVersion: storedVersion !== null && !Number.isNaN(storedVersion) ? storedVersion : null,
|
|
149
|
+
targetVersion: DB_VERSION,
|
|
150
|
+
env: process.env,
|
|
151
|
+
});
|
|
152
|
+
if (result) {
|
|
153
|
+
warn("[akm] data directory backed up to %s before v%s→v%d upgrade", result.path, storedVersionRaw, DB_VERSION);
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
catch (err) {
|
|
157
|
+
// Defensive — backupDataDir already swallows most errors, but if it
|
|
158
|
+
// throws for an unexpected reason we must still proceed with the
|
|
159
|
+
// upgrade so the user isn't locked out of their binary.
|
|
160
|
+
warn("[akm] pre-upgrade data dir backup raised an unexpected error — %s; upgrade will proceed without a snapshot", err instanceof Error ? err.message : String(err));
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
}
|
|
97
164
|
// Check stored version — if it differs from DB_VERSION, drop and recreate all tables.
|
|
98
165
|
// Usage events are preserved across version upgrades so that utility score
|
|
99
166
|
// history is not silently lost. The backup is captured here and threaded
|
|
@@ -110,12 +177,24 @@ function ensureSchema(db, embeddingDim) {
|
|
|
110
177
|
stash_dir TEXT NOT NULL,
|
|
111
178
|
entry_json TEXT NOT NULL,
|
|
112
179
|
search_text TEXT NOT NULL,
|
|
113
|
-
entry_type TEXT NOT NULL
|
|
180
|
+
entry_type TEXT NOT NULL,
|
|
181
|
+
derived_from TEXT
|
|
114
182
|
);
|
|
115
183
|
|
|
116
184
|
CREATE INDEX IF NOT EXISTS idx_entries_dir ON entries(dir_path);
|
|
117
185
|
CREATE INDEX IF NOT EXISTS idx_entries_type ON entries(entry_type);
|
|
186
|
+
CREATE INDEX IF NOT EXISTS idx_entries_file_path ON entries(file_path);
|
|
118
187
|
`);
|
|
188
|
+
// Phase 5A / DB v17: backfill `derived_from` column + index on databases
|
|
189
|
+
// that were created at v17 fresh OR carry a partial v17 schema (a DB whose
|
|
190
|
+
// `index_meta.version` was bumped to 17 but whose `entries` table still
|
|
191
|
+
// lacks the column — this happens when a previous v17 binary opened a
|
|
192
|
+
// pre-v17 DB without taking the upgrade path because no version mismatch
|
|
193
|
+
// was seen at boot). The PRAGMA-then-ALTER guard runs unconditionally so
|
|
194
|
+
// both fresh and partial schemas converge. The CREATE INDEX for
|
|
195
|
+
// `derived_from` MUST run after this helper so we never reference a
|
|
196
|
+
// column that has not yet been added on partial schemas.
|
|
197
|
+
ensureDerivedFromColumn(db);
|
|
119
198
|
// Validated WorkflowDocument JSON, one row per indexed workflow entry.
|
|
120
199
|
// Pure index data — fully rebuilt on each `akm index`. ON DELETE CASCADE
|
|
121
200
|
// means clearing entries (full rebuild or per-dir delete) drops these too.
|
|
@@ -174,6 +253,20 @@ function ensureSchema(db, embeddingDim) {
|
|
|
174
253
|
updated_at TEXT NOT NULL DEFAULT (datetime('now')),
|
|
175
254
|
FOREIGN KEY (entry_id) REFERENCES entries(id) ON DELETE CASCADE
|
|
176
255
|
);
|
|
256
|
+
`);
|
|
257
|
+
// Per-project scoped utility scores — tracks usage per (entry, cwd-anchor)
|
|
258
|
+
// so assets useful in project A don't pollute rankings in project B.
|
|
259
|
+
// The global utility_scores table is preserved as a fallback / cold-start aid.
|
|
260
|
+
db.exec(`
|
|
261
|
+
CREATE TABLE IF NOT EXISTS utility_scores_scoped (
|
|
262
|
+
entry_id INTEGER NOT NULL,
|
|
263
|
+
scope_key TEXT NOT NULL,
|
|
264
|
+
utility REAL NOT NULL DEFAULT 0,
|
|
265
|
+
last_used_at INTEGER NOT NULL,
|
|
266
|
+
PRIMARY KEY (entry_id, scope_key)
|
|
267
|
+
);
|
|
268
|
+
CREATE INDEX IF NOT EXISTS idx_utility_scores_scoped_entry_id
|
|
269
|
+
ON utility_scores_scoped(entry_id);
|
|
177
270
|
`);
|
|
178
271
|
db.exec(`
|
|
179
272
|
CREATE TABLE IF NOT EXISTS index_dir_state (
|
|
@@ -183,6 +276,97 @@ function ensureSchema(db, embeddingDim) {
|
|
|
183
276
|
reason TEXT NOT NULL,
|
|
184
277
|
updated_at TEXT NOT NULL
|
|
185
278
|
);
|
|
279
|
+
`);
|
|
280
|
+
// LLM enrichment result cache. Stores a SHA-256 body hash and the JSON
|
|
281
|
+
// result for each asset so that subsequent `akm index --enrich` runs can
|
|
282
|
+
// skip the LLM call when the body hasn't changed. The cache is keyed by
|
|
283
|
+
// a stable asset_ref string (e.g. the absolute file path for graph/memory
|
|
284
|
+
// passes, or `entryKey:passId` for the metadata-enhance pass).
|
|
285
|
+
// Entries are cleaned up when assets are removed or --re-enrich is used.
|
|
286
|
+
db.exec(`
|
|
287
|
+
CREATE TABLE IF NOT EXISTS llm_enrichment_cache (
|
|
288
|
+
asset_ref TEXT NOT NULL,
|
|
289
|
+
cache_variant TEXT NOT NULL,
|
|
290
|
+
body_hash TEXT NOT NULL,
|
|
291
|
+
result_json TEXT NOT NULL,
|
|
292
|
+
updated_at INTEGER NOT NULL,
|
|
293
|
+
PRIMARY KEY (asset_ref, cache_variant)
|
|
294
|
+
);
|
|
295
|
+
|
|
296
|
+
CREATE INDEX IF NOT EXISTS idx_llm_cache_updated
|
|
297
|
+
ON llm_enrichment_cache(updated_at);
|
|
298
|
+
`);
|
|
299
|
+
// Graph extraction tables — schema v2 (entry_id PK).
|
|
300
|
+
//
|
|
301
|
+
// graph_files is keyed on entries.id so child tables cascade-delete cleanly
|
|
302
|
+
// when an entry is removed, and so JOINs from graph rows to entries are a
|
|
303
|
+
// direct PK lookup. (stash_root, file_path) is retained as UNIQUE so the
|
|
304
|
+
// extractor's path-based upsert still works.
|
|
305
|
+
//
|
|
306
|
+
// graph_file_entities and graph_file_relations no longer duplicate file_path;
|
|
307
|
+
// they reference entry_id and inherit stash scoping via graph_files.
|
|
308
|
+
db.exec(`
|
|
309
|
+
CREATE TABLE IF NOT EXISTS graph_meta (
|
|
310
|
+
stash_root TEXT PRIMARY KEY,
|
|
311
|
+
schema_version INTEGER NOT NULL,
|
|
312
|
+
generated_at TEXT NOT NULL,
|
|
313
|
+
considered_files INTEGER NOT NULL DEFAULT 0,
|
|
314
|
+
extracted_files INTEGER NOT NULL DEFAULT 0,
|
|
315
|
+
entity_count INTEGER NOT NULL DEFAULT 0,
|
|
316
|
+
relation_count INTEGER NOT NULL DEFAULT 0,
|
|
317
|
+
extraction_coverage REAL NOT NULL DEFAULT 0,
|
|
318
|
+
density REAL NOT NULL DEFAULT 0,
|
|
319
|
+
extractor_id TEXT,
|
|
320
|
+
extraction_run_id TEXT,
|
|
321
|
+
model TEXT,
|
|
322
|
+
prompt_version TEXT,
|
|
323
|
+
batch_size INTEGER,
|
|
324
|
+
cache_hits INTEGER NOT NULL DEFAULT 0,
|
|
325
|
+
cache_misses INTEGER NOT NULL DEFAULT 0,
|
|
326
|
+
truncation_count INTEGER NOT NULL DEFAULT 0,
|
|
327
|
+
failure_count INTEGER NOT NULL DEFAULT 0
|
|
328
|
+
);
|
|
329
|
+
|
|
330
|
+
CREATE TABLE IF NOT EXISTS graph_files (
|
|
331
|
+
entry_id INTEGER PRIMARY KEY REFERENCES entries(id) ON DELETE CASCADE,
|
|
332
|
+
stash_root TEXT NOT NULL,
|
|
333
|
+
file_path TEXT NOT NULL,
|
|
334
|
+
file_order INTEGER NOT NULL,
|
|
335
|
+
file_type TEXT NOT NULL,
|
|
336
|
+
body_hash TEXT NOT NULL,
|
|
337
|
+
confidence REAL,
|
|
338
|
+
status TEXT NOT NULL DEFAULT 'extracted',
|
|
339
|
+
reason TEXT,
|
|
340
|
+
extraction_run_id TEXT,
|
|
341
|
+
UNIQUE(stash_root, file_path)
|
|
342
|
+
);
|
|
343
|
+
|
|
344
|
+
CREATE INDEX IF NOT EXISTS idx_graph_files_stash_order
|
|
345
|
+
ON graph_files(stash_root, file_order);
|
|
346
|
+
|
|
347
|
+
CREATE TABLE IF NOT EXISTS graph_file_entities (
|
|
348
|
+
entry_id INTEGER NOT NULL REFERENCES graph_files(entry_id) ON DELETE CASCADE,
|
|
349
|
+
entity_order INTEGER NOT NULL,
|
|
350
|
+
stash_root TEXT NOT NULL,
|
|
351
|
+
entity_norm TEXT NOT NULL,
|
|
352
|
+
entity TEXT NOT NULL,
|
|
353
|
+
PRIMARY KEY (entry_id, entity_order)
|
|
354
|
+
);
|
|
355
|
+
|
|
356
|
+
CREATE INDEX IF NOT EXISTS idx_graph_file_entities_entity_norm
|
|
357
|
+
ON graph_file_entities(stash_root, entity_norm);
|
|
358
|
+
|
|
359
|
+
CREATE TABLE IF NOT EXISTS graph_file_relations (
|
|
360
|
+
entry_id INTEGER NOT NULL REFERENCES graph_files(entry_id) ON DELETE CASCADE,
|
|
361
|
+
relation_order INTEGER NOT NULL,
|
|
362
|
+
from_entity_norm TEXT NOT NULL,
|
|
363
|
+
from_entity TEXT NOT NULL,
|
|
364
|
+
to_entity_norm TEXT NOT NULL,
|
|
365
|
+
to_entity TEXT NOT NULL,
|
|
366
|
+
relation_type TEXT,
|
|
367
|
+
confidence REAL,
|
|
368
|
+
PRIMARY KEY (entry_id, relation_order)
|
|
369
|
+
);
|
|
186
370
|
`);
|
|
187
371
|
// FTS-dirty queue. Created here (not lazily on first upsert) so the
|
|
188
372
|
// per-entry write path doesn't issue a CREATE TABLE IF NOT EXISTS on
|
|
@@ -194,59 +378,89 @@ function ensureSchema(db, embeddingDim) {
|
|
|
194
378
|
);
|
|
195
379
|
`);
|
|
196
380
|
// sqlite-vec table
|
|
381
|
+
//
|
|
382
|
+
// Dimension contract:
|
|
383
|
+
// - When `embeddingDim` is `undefined`, the caller did NOT request a
|
|
384
|
+
// specific dim. Do not touch `index_meta.embeddingDim` and do not run
|
|
385
|
+
// the dim-change wipe — fall back to the stored dim (or the static
|
|
386
|
+
// default) only when we have to materialise the vec table for the
|
|
387
|
+
// first time. Without this guard, registry-side and other dim-unaware
|
|
388
|
+
// `openDatabase()` callers would silently overwrite the dim-aware
|
|
389
|
+
// improve/index value and oscillate the stored dim.
|
|
390
|
+
// - When `embeddingDim` is a number, the caller explicitly asked for
|
|
391
|
+
// that dim and owns the dim-change/backup/wipe semantics.
|
|
392
|
+
const dimExplicit = embeddingDim !== undefined;
|
|
393
|
+
const effectiveDim = embeddingDim ?? (Number(getMeta(db, "embeddingDim")) || EMBEDDING_DIM);
|
|
197
394
|
if (isVecAvailable(db)) {
|
|
198
395
|
// Check if stored embedding dimension differs from configured one
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
396
|
+
if (dimExplicit) {
|
|
397
|
+
const storedDim = getMeta(db, "embeddingDim");
|
|
398
|
+
if (storedDim && storedDim !== String(embeddingDim)) {
|
|
399
|
+
// Re-embedding the whole stash is expensive (LLM API calls + cache
|
|
400
|
+
// misses), so snapshot the data dir before we drop the vec table and
|
|
401
|
+
// wipe `embeddings`. This is the SAME hook the version-upgrade path
|
|
402
|
+
// uses earlier in this function, just gated on embedding-dim mismatch
|
|
403
|
+
// and tagged so operators can tell the two backup kinds apart.
|
|
404
|
+
backupBeforeEmbeddingDimChange(options?.dataDir, storedDim, String(embeddingDim));
|
|
405
|
+
try {
|
|
406
|
+
db.exec("DROP TABLE IF EXISTS entries_vec");
|
|
407
|
+
}
|
|
408
|
+
catch {
|
|
409
|
+
/* ignore */
|
|
410
|
+
}
|
|
411
|
+
// Delete stale BLOB embeddings so they don't produce silently wrong
|
|
412
|
+
// similarity scores against the new-dimension vec table.
|
|
413
|
+
try {
|
|
414
|
+
db.exec("DELETE FROM embeddings");
|
|
415
|
+
}
|
|
416
|
+
catch {
|
|
417
|
+
/* ignore */
|
|
418
|
+
}
|
|
419
|
+
setMeta(db, "hasEmbeddings", "0");
|
|
214
420
|
}
|
|
215
|
-
setMeta(db, "hasEmbeddings", "0");
|
|
216
421
|
}
|
|
217
422
|
const vecExists = db.prepare("SELECT name FROM sqlite_master WHERE type='table' AND name='entries_vec'").get();
|
|
218
423
|
if (!vecExists) {
|
|
219
|
-
if (!Number.isInteger(
|
|
220
|
-
throw new Error(`Invalid embedding dimension: ${
|
|
424
|
+
if (!Number.isInteger(effectiveDim) || effectiveDim <= 0 || effectiveDim > 4096) {
|
|
425
|
+
throw new Error(`Invalid embedding dimension: ${effectiveDim}`);
|
|
221
426
|
}
|
|
222
427
|
db.exec(`
|
|
223
428
|
CREATE VIRTUAL TABLE entries_vec USING vec0(
|
|
224
429
|
id INTEGER PRIMARY KEY,
|
|
225
|
-
embedding FLOAT[${
|
|
430
|
+
embedding FLOAT[${effectiveDim}]
|
|
226
431
|
);
|
|
227
432
|
`);
|
|
228
433
|
}
|
|
229
|
-
|
|
434
|
+
if (dimExplicit) {
|
|
435
|
+
setMeta(db, "embeddingDim", String(embeddingDim));
|
|
436
|
+
}
|
|
230
437
|
}
|
|
231
438
|
else {
|
|
232
439
|
// Also purge BLOB embeddings on dimension change (JS fallback path).
|
|
233
440
|
// When sqlite-vec is unavailable, entries_vec doesn't exist but the BLOB
|
|
234
441
|
// embeddings table still stores vectors. If the configured dimension
|
|
235
442
|
// changes, those stored BLOBs become silently incompatible.
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
443
|
+
if (dimExplicit) {
|
|
444
|
+
const storedDim = getMeta(db, "embeddingDim");
|
|
445
|
+
if (storedDim && storedDim !== String(embeddingDim)) {
|
|
446
|
+
backupBeforeEmbeddingDimChange(options?.dataDir, storedDim, String(embeddingDim));
|
|
447
|
+
try {
|
|
448
|
+
db.exec("DELETE FROM embeddings");
|
|
449
|
+
}
|
|
450
|
+
catch {
|
|
451
|
+
/* ignore */
|
|
452
|
+
}
|
|
453
|
+
setMeta(db, "hasEmbeddings", "0");
|
|
243
454
|
}
|
|
244
|
-
setMeta(db, "
|
|
455
|
+
setMeta(db, "embeddingDim", String(embeddingDim));
|
|
245
456
|
}
|
|
246
|
-
setMeta(db, "embeddingDim", String(embeddingDim));
|
|
247
457
|
}
|
|
248
458
|
// Usage telemetry table
|
|
249
459
|
ensureUsageEventsSchema(db);
|
|
460
|
+
// Registry index cache table — caches remote registry index documents so
|
|
461
|
+
// `akm search` does not hit the network on every invocation. The DDL is
|
|
462
|
+
// defined in state-db.ts and shared here to avoid duplication.
|
|
463
|
+
db.exec(REGISTRY_INDEX_CACHE_DDL);
|
|
250
464
|
// Restore usage_events backed up by the version-upgrade path above.
|
|
251
465
|
restoreUsageEventsBackup(db, usageBackup);
|
|
252
466
|
}
|
|
@@ -277,11 +491,23 @@ function handleVersionUpgrade(db) {
|
|
|
277
491
|
/* table may not exist in older versions */
|
|
278
492
|
}
|
|
279
493
|
db.exec("DROP TABLE IF EXISTS utility_scores");
|
|
494
|
+
db.exec("DROP TABLE IF EXISTS utility_scores_scoped");
|
|
495
|
+
db.exec("DROP INDEX IF EXISTS idx_utility_scores_scoped_entry_id");
|
|
280
496
|
db.exec("DROP TABLE IF EXISTS usage_events");
|
|
281
497
|
db.exec("DROP TABLE IF EXISTS embeddings");
|
|
282
498
|
db.exec("DROP TABLE IF EXISTS entries_vec");
|
|
283
499
|
db.exec("DROP TABLE IF EXISTS entries_fts");
|
|
284
500
|
db.exec("DROP TABLE IF EXISTS index_dir_state");
|
|
501
|
+
db.exec("DROP TABLE IF EXISTS llm_enrichment_cache");
|
|
502
|
+
db.exec("DROP INDEX IF EXISTS idx_llm_cache_updated");
|
|
503
|
+
db.exec("DROP TABLE IF EXISTS graph_file_relations");
|
|
504
|
+
db.exec("DROP TABLE IF EXISTS graph_file_entities");
|
|
505
|
+
db.exec("DROP TABLE IF EXISTS graph_files");
|
|
506
|
+
db.exec("DROP TABLE IF EXISTS graph_meta");
|
|
507
|
+
db.exec("DROP TABLE IF EXISTS graph_relations");
|
|
508
|
+
db.exec("DROP TABLE IF EXISTS graph_entities");
|
|
509
|
+
db.exec("DROP TABLE IF EXISTS graph_nodes");
|
|
510
|
+
db.exec("DROP TABLE IF EXISTS graph_stashes");
|
|
285
511
|
db.exec("DROP INDEX IF EXISTS idx_entries_dir");
|
|
286
512
|
db.exec("DROP INDEX IF EXISTS idx_entries_type");
|
|
287
513
|
db.exec("DROP TABLE IF EXISTS entries");
|
|
@@ -289,6 +515,48 @@ function handleVersionUpgrade(db) {
|
|
|
289
515
|
warn("[akm] Index rebuilt due to version upgrade. Run 'akm index' to repopulate.");
|
|
290
516
|
return usageBackup;
|
|
291
517
|
}
|
|
518
|
+
/**
 * Take a best-effort snapshot of the data directory before the
 * embedding-dimension change path drops the stored embeddings.
 *
 * The snapshot is requested from `backupDataDir`, tagged with
 * `EMBEDDING_DIM_CHANGE_REASON`. The schema version is not changing on this
 * path, so `DB_VERSION` is recorded as both the source and the target version;
 * the sidecar metadata then still names the running binary's version.
 *
 * Backup problems are never fatal here: an exception raised while backing up
 * is caught and reported through `warn`, and the function returns normally so
 * the caller can go on to regenerate embeddings. When `backupDataDir` returns
 * a falsy result nothing is logged.
 *
 * @param dataDir Data directory to snapshot; a falsy value makes this a no-op.
 * @param fromDim Previously stored embedding dimension (log message only).
 * @param toDim Newly configured embedding dimension (log message only).
 * @returns {void}
 */
function backupBeforeEmbeddingDimChange(dataDir, fromDim, toDim) {
    if (dataDir) {
        try {
            const backupOptions = {
                dataDir,
                // Same version on both sides: only the embedding dimension changes.
                sourceVersion: DB_VERSION,
                targetVersion: DB_VERSION,
                reason: EMBEDDING_DIM_CHANGE_REASON,
                env: process.env,
            };
            const snapshot = backupDataDir(backupOptions);
            if (!snapshot) {
                return;
            }
            warn("[akm] embedding dimension changed %s→%s; data directory backed up to %s; embeddings will be regenerated", fromDim, toDim, snapshot.path);
        }
        catch (backupError) {
            // Defensive: a failed snapshot must not block the dimension change.
            const detail = backupError instanceof Error ? backupError.message : String(backupError);
            warn("[akm] pre-embedding-dim-change data dir backup raised an unexpected error — %s; embeddings will be regenerated without a snapshot", detail);
        }
    }
}
|
|
292
560
|
/**
|
|
293
561
|
* Re-insert backed-up `usage_events` rows into the freshly-created table.
|
|
294
562
|
*
|
|
@@ -383,6 +651,12 @@ export function deleteIndexDirStatesByStashDir(db, stashDir) {
|
|
|
383
651
|
db.prepare("DELETE FROM index_dir_state WHERE dir_path = ? OR dir_path LIKE ?").run(stashDir, `${stashDir}${path.sep}%`);
|
|
384
652
|
}
|
|
385
653
|
// ── Entry operations ────────────────────────────────────────────────────────
|
|
654
|
+
/**
|
|
655
|
+
 * SQLite parameter chunk size — chosen well below SQLITE_MAX_VARIABLE_NUMBER
 * (999 by default before SQLite 3.32.0, 32766 since) so multi-row
 * `IN (?, ?, ...)` queries stay within bounds. Shared by the helpers below.
|
|
658
|
+
*/
|
|
659
|
+
const SQLITE_CHUNK_SIZE = 500;
|
|
386
660
|
/**
|
|
387
661
|
* Insert or update an entry in the `entries` table. Returns the row id.
|
|
388
662
|
*
|
|
@@ -396,7 +670,11 @@ export function upsertEntry(db, entryKey, dirPath, filePath, stashDir, entry, se
|
|
|
396
670
|
// every call. The dirty-mark INSERT and the upsert-with-RETURNING
|
|
397
671
|
// share the same WeakMap so they live and die with the connection.
|
|
398
672
|
const stmts = getUpsertStmts(db);
|
|
399
|
-
|
|
673
|
+
// Phase 5A / Advantage D5: surface derived memory parent ref into the
|
|
674
|
+
// dedicated `derived_from` column so retrieval-time lookup (parent→child)
|
|
675
|
+
// does not have to scan + JSON-decode every memory row.
|
|
676
|
+
const derivedFrom = typeof entry.derivedFrom === "string" && entry.derivedFrom.trim() ? entry.derivedFrom.trim() : null;
|
|
677
|
+
const result = stmts.upsert.get(entryKey, dirPath, filePath, stashDir, JSON.stringify(entry), searchText, entry.type, derivedFrom);
|
|
400
678
|
if (!result)
|
|
401
679
|
throw new Error("upsertEntry: entry_key not found after upsert");
|
|
402
680
|
// Mark this entry as FTS-dirty so `rebuildFts({ incremental: true })`
|
|
@@ -415,15 +693,16 @@ function getUpsertStmts(db) {
|
|
|
415
693
|
// SELECT round-trip needed (last_insert_rowid() is unreliable for
|
|
416
694
|
// ON CONFLICT). Use `.get()` so a single row comes back.
|
|
417
695
|
upsert: db.prepare(`
|
|
418
|
-
INSERT INTO entries (entry_key, dir_path, file_path, stash_dir, entry_json, search_text, entry_type)
|
|
419
|
-
VALUES (?, ?, ?, ?, ?, ?, ?)
|
|
696
|
+
INSERT INTO entries (entry_key, dir_path, file_path, stash_dir, entry_json, search_text, entry_type, derived_from)
|
|
697
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
|
|
420
698
|
ON CONFLICT(entry_key) DO UPDATE SET
|
|
421
699
|
dir_path = excluded.dir_path,
|
|
422
700
|
file_path = excluded.file_path,
|
|
423
701
|
stash_dir = excluded.stash_dir,
|
|
424
702
|
entry_json = excluded.entry_json,
|
|
425
703
|
search_text = excluded.search_text,
|
|
426
|
-
entry_type = excluded.entry_type
|
|
704
|
+
entry_type = excluded.entry_type,
|
|
705
|
+
derived_from = excluded.derived_from
|
|
427
706
|
RETURNING id
|
|
428
707
|
`),
|
|
429
708
|
markDirty: db.prepare("INSERT OR IGNORE INTO entries_fts_dirty (entry_id) VALUES (?)"),
|
|
@@ -431,21 +710,128 @@ function getUpsertStmts(db) {
|
|
|
431
710
|
upsertStmtsByDb.set(db, stmts);
|
|
432
711
|
return stmts;
|
|
433
712
|
}
|
|
434
|
-
|
|
713
|
+
/**
 * Schema guard for the `entries.derived_from` column (Phase 5A / DB v17).
 *
 * Inspects `PRAGMA table_info(entries)` and adds the `derived_from TEXT`
 * column when it is absent, then makes sure the
 * `idx_entries_derived_from` index exists. Running it repeatedly is safe:
 * the ALTER is gated on the column lookup and the index uses
 * `CREATE INDEX IF NOT EXISTS`.
 *
 * Any error (for example the `entries` table not existing yet) is swallowed;
 * the caller is responsible for creating the table first.
 *
 * @param db Open database connection exposing `prepare()` and `exec()`.
 * @returns {void}
 */
function ensureDerivedFromColumn(db) {
    try {
        const columns = db.prepare("PRAGMA table_info(entries)").all();
        let alreadyPresent = false;
        for (const column of columns) {
            if (column.name === "derived_from") {
                alreadyPresent = true;
                break;
            }
        }
        if (!alreadyPresent) {
            db.exec("ALTER TABLE entries ADD COLUMN derived_from TEXT");
        }
        // The index statement is idempotent, so it runs on every call.
        db.exec("CREATE INDEX IF NOT EXISTS idx_entries_derived_from ON entries(derived_from)");
    }
    catch {
        /* table may not exist on a brand-new DB before CREATE — caller is responsible */
    }
}
|
|
736
|
+
/**
 * Find the derived-memory child entry whose `derived_from` column equals
 * `parentRef` (e.g. `"memory:claude-prefs"`) — Phase 5A / Advantage D5.
 *
 * When several rows match, the one with the highest `id` is returned, which
 * keeps the result deterministic.
 *
 * @param db Open database connection exposing `prepare()`.
 * @param parentRef Reference of the parent asset; a falsy value short-circuits.
 * @returns The child as `{ id, entryKey, dirPath, filePath, stashDir, entry,
 *   searchText }`, or `null` when `parentRef` is falsy, no row matches, the
 *   stored `entry_json` cannot be parsed (a warning is logged), or the query
 *   fails (e.g. the column is missing on a legacy database).
 */
export function getDerivedForParent(db, parentRef) {
    if (parentRef) {
        try {
            const lookup = db.prepare(`SELECT id, entry_key, dir_path, file_path, stash_dir, entry_json, search_text
           FROM entries
          WHERE derived_from = ?
          ORDER BY id DESC
          LIMIT 1`);
            const child = lookup.get(parentRef);
            if (child) {
                let parsed;
                try {
                    parsed = JSON.parse(child.entry_json);
                }
                catch {
                    warn(`[db] getDerivedForParent: skipping entry id=${child.id} — corrupt entry_json`);
                    return null;
                }
                return {
                    id: child.id,
                    entryKey: child.entry_key,
                    dirPath: child.dir_path,
                    filePath: child.file_path,
                    stashDir: child.stash_dir,
                    entry: parsed,
                    searchText: child.search_text,
                };
            }
        }
        catch {
            /* `derived_from` column may not exist on legacy DBs that haven't been
               rebuilt; treat as "no derived child". */
        }
    }
    return null;
}
|
|
782
|
+
/**
 * Bulk-load the number of positive feedback events recorded for each of the
 * given entry ids (Phase 2A / Rec 5).
 *
 * Ids are queried in slices of `SQLITE_CHUNK_SIZE` so the `IN (...)` list
 * stays within SQLite's bound-parameter limit. A slice whose query throws
 * (for instance because `usage_events` does not exist) is skipped silently
 * and contributes no counts.
 *
 * @param db Open database connection exposing `prepare()`.
 * @param ids Entry ids to look up; an empty list issues no queries.
 * @returns {Map} Map from entry id to count, holding only ids with at least
 *   one `feedback` event whose signal is `positive`. Absent ids mean zero.
 */
export function getPositiveFeedbackCountsByIds(db, ids) {
    const counts = new Map();
    let offset = 0;
    while (offset < ids.length) {
        const batch = ids.slice(offset, offset + SQLITE_CHUNK_SIZE);
        offset += SQLITE_CHUNK_SIZE;
        const marks = batch.map(() => "?").join(",");
        try {
            const query = db.prepare(`SELECT entry_id, COUNT(*) AS cnt
             FROM usage_events
            WHERE event_type = 'feedback'
              AND signal = 'positive'
              AND entry_id IN (${marks})
            GROUP BY entry_id`);
            for (const row of query.all(...batch)) {
                if (row.entry_id === null || !(row.cnt > 0)) {
                    continue;
                }
                counts.set(row.entry_id, row.cnt);
            }
        }
        catch {
            /* usage_events table may be missing on legacy DBs — treat as zero counts */
        }
    }
    return counts;
}
|
|
822
|
+
function deleteEntriesWhere(db, column, value) {
|
|
435
823
|
db.transaction(() => {
|
|
436
|
-
const ids = db.prepare(
|
|
824
|
+
const ids = db.prepare(`SELECT id FROM entries WHERE ${column} = ?`).all(value);
|
|
437
825
|
deleteRelatedRows(db, ids);
|
|
438
|
-
db.prepare(
|
|
826
|
+
db.prepare(`DELETE FROM entries WHERE ${column} = ?`).run(value);
|
|
439
827
|
})();
|
|
440
828
|
}
|
|
829
|
+
/**
 * Delete every entry whose `dir_path` equals `dirPath`.
 *
 * Thin wrapper that forwards to `deleteEntriesWhere` with the `dir_path`
 * column selected.
 *
 * @param db Open database connection.
 * @param dirPath Directory path to match exactly.
 * @returns {void}
 */
export function deleteEntriesByDir(db, dirPath) {
    const matchColumn = "dir_path";
    deleteEntriesWhere(db, matchColumn, dirPath);
}
|
|
441
832
|
export function deleteEntriesByStashDir(db, stashDir) {
|
|
442
|
-
db
|
|
443
|
-
const ids = db.prepare("SELECT id FROM entries WHERE stash_dir = ?").all(stashDir);
|
|
444
|
-
deleteRelatedRows(db, ids);
|
|
445
|
-
db.prepare("DELETE FROM entries WHERE stash_dir = ?").run(stashDir);
|
|
446
|
-
})();
|
|
833
|
+
deleteEntriesWhere(db, "stash_dir", stashDir);
|
|
447
834
|
}
|
|
448
|
-
const SQLITE_CHUNK_SIZE = 500;
|
|
449
835
|
function deleteRelatedRows(db, ids) {
|
|
450
836
|
if (ids.length === 0)
|
|
451
837
|
return;
|
|
@@ -480,13 +866,6 @@ function deleteRelatedRows(db, ids) {
|
|
|
480
866
|
catch {
|
|
481
867
|
/* ignore */
|
|
482
868
|
}
|
|
483
|
-
// Also delete from FTS table so orphaned FTS rows don't remain
|
|
484
|
-
try {
|
|
485
|
-
db.prepare(`DELETE FROM entries_fts WHERE entry_id IN (${placeholders})`).run(...chunk);
|
|
486
|
-
}
|
|
487
|
-
catch {
|
|
488
|
-
/* ignore */
|
|
489
|
-
}
|
|
490
869
|
if (vecAvail) {
|
|
491
870
|
try {
|
|
492
871
|
db.prepare(`DELETE FROM entries_vec WHERE id IN (${placeholders})`).run(...chunk);
|
|
@@ -502,6 +881,12 @@ function deleteRelatedRows(db, ids) {
|
|
|
502
881
|
catch {
|
|
503
882
|
/* ignore */
|
|
504
883
|
}
|
|
884
|
+
try {
|
|
885
|
+
db.prepare(`DELETE FROM utility_scores_scoped WHERE entry_id IN (${placeholders})`).run(...chunk);
|
|
886
|
+
}
|
|
887
|
+
catch {
|
|
888
|
+
/* ignore */
|
|
889
|
+
}
|
|
505
890
|
// Clean up usage events before deleting entries
|
|
506
891
|
try {
|
|
507
892
|
db.prepare(`DELETE FROM usage_events WHERE entry_id IN (${placeholders})`).run(...chunk);
|
|
@@ -511,6 +896,26 @@ function deleteRelatedRows(db, ids) {
|
|
|
511
896
|
}
|
|
512
897
|
}
|
|
513
898
|
}
|
|
899
|
+
/**
 * Delete entries by primary key, together with their dependent rows.
 *
 * Inside a single transaction the ids are first handed to
 * `deleteRelatedRows` (as `{ id }` objects) so dependent rows are cleaned up,
 * and the `entries` rows are then removed in slices of `SQLITE_CHUNK_SIZE`.
 *
 * Used by the `--clean` post-pass to drop stale entries whose source files no
 * longer exist on disk.
 *
 * @param db Open database connection exposing `transaction()` and `prepare()`.
 * @param ids Primary-key ids to delete; an empty list is a no-op.
 * @returns {void}
 */
export function deleteEntriesByIds(db, ids) {
    if (ids.length === 0) {
        return;
    }
    const purge = db.transaction(() => {
        // Dependent rows go first, while the parent rows still exist.
        deleteRelatedRows(db, ids.map((id) => ({ id })));
        let offset = 0;
        while (offset < ids.length) {
            const batch = ids.slice(offset, offset + SQLITE_CHUNK_SIZE);
            const marks = batch.map(() => "?").join(",");
            db.prepare(`DELETE FROM entries WHERE id IN (${marks})`).run(...batch);
            offset += SQLITE_CHUNK_SIZE;
        }
    });
    purge();
}
|
|
514
919
|
/**
|
|
515
920
|
* Rebuild the FTS5 search index.
|
|
516
921
|
*
|
|
@@ -585,19 +990,32 @@ export function rebuildFts(db, options) {
|
|
|
585
990
|
}
|
|
586
991
|
// ── Vector operations ───────────────────────────────────────────────────────
|
|
587
992
|
export function upsertEmbedding(db, entryId, embedding) {
|
|
993
|
+
// Pre-flight FK guard: when an entry is deleted between when its id is queued
|
|
994
|
+
// for embedding and when this INSERT runs (e.g. consolidation deletes during
|
|
995
|
+
// a concurrent improve cycle), the INSERT throws "FOREIGN KEY constraint failed"
|
|
996
|
+
// and rolls back the entire batch transaction in the caller, losing every
|
|
997
|
+
// embedding for that run. A cheap SELECT here turns the race into a clean skip.
|
|
998
|
+
const exists = db.prepare("SELECT 1 FROM entries WHERE id = ?").get(entryId);
|
|
999
|
+
if (!exists)
|
|
1000
|
+
return false;
|
|
588
1001
|
const buf = float32Buffer(embedding);
|
|
589
1002
|
// Always write to BLOB table (works without sqlite-vec)
|
|
590
1003
|
db.prepare("INSERT OR REPLACE INTO embeddings (id, embedding) VALUES (?, ?)").run(entryId, buf);
|
|
591
|
-
// Also write to sqlite-vec table when available (fast path)
|
|
1004
|
+
// Also write to sqlite-vec table when available (fast path).
|
|
1005
|
+
// Wrapped in a transaction so a crash between DELETE and INSERT does not
|
|
1006
|
+
// leave the entry missing from the vec table.
|
|
592
1007
|
if (isVecAvailable(db)) {
|
|
593
1008
|
try {
|
|
594
|
-
db.
|
|
1009
|
+
db.transaction(() => {
|
|
1010
|
+
db.prepare("DELETE FROM entries_vec WHERE id = ?").run(entryId);
|
|
1011
|
+
db.prepare("INSERT INTO entries_vec (id, embedding) VALUES (?, ?)").run(entryId, buf);
|
|
1012
|
+
})();
|
|
595
1013
|
}
|
|
596
1014
|
catch {
|
|
597
|
-
/* ignore */
|
|
1015
|
+
/* ignore — vec table unavailable or constraint failure */
|
|
598
1016
|
}
|
|
599
|
-
db.prepare("INSERT INTO entries_vec (id, embedding) VALUES (?, ?)").run(entryId, buf);
|
|
600
1017
|
}
|
|
1018
|
+
return true;
|
|
601
1019
|
}
|
|
602
1020
|
export function searchVec(db, queryEmbedding, k) {
|
|
603
1021
|
// Fast path: use sqlite-vec when available
|
|
@@ -723,7 +1141,7 @@ function runFtsQuery(db, ftsQuery, limit, entryType) {
|
|
|
723
1141
|
JOIN entries e ON e.id = f.entry_id
|
|
724
1142
|
WHERE entries_fts MATCH ?
|
|
725
1143
|
AND e.entry_type = ?
|
|
726
|
-
ORDER BY bm25Score
|
|
1144
|
+
ORDER BY bm25Score, e.id ASC
|
|
727
1145
|
LIMIT ?
|
|
728
1146
|
`;
|
|
729
1147
|
params = [ftsQuery, entryType, limit];
|
|
@@ -735,7 +1153,7 @@ function runFtsQuery(db, ftsQuery, limit, entryType) {
|
|
|
735
1153
|
FROM entries_fts f
|
|
736
1154
|
JOIN entries e ON e.id = f.entry_id
|
|
737
1155
|
WHERE entries_fts MATCH ?
|
|
738
|
-
ORDER BY bm25Score
|
|
1156
|
+
ORDER BY bm25Score, e.id ASC
|
|
739
1157
|
LIMIT ?
|
|
740
1158
|
`;
|
|
741
1159
|
params = [ftsQuery, limit];
|
|
@@ -784,21 +1202,7 @@ export function sanitizeFtsQuery(query) {
|
|
|
784
1202
|
// contain ALL terms.
|
|
785
1203
|
return tokens.join(" ");
|
|
786
1204
|
}
|
|
787
|
-
|
|
788
|
-
export function getAllEntries(db, entryType) {
|
|
789
|
-
let sql;
|
|
790
|
-
let params;
|
|
791
|
-
if (entryType && entryType !== "any") {
|
|
792
|
-
sql =
|
|
793
|
-
"SELECT id, entry_key, dir_path, file_path, stash_dir, entry_json, search_text FROM entries WHERE entry_type = ?";
|
|
794
|
-
params = [entryType];
|
|
795
|
-
}
|
|
796
|
-
else {
|
|
797
|
-
sql = "SELECT id, entry_key, dir_path, file_path, stash_dir, entry_json, search_text FROM entries";
|
|
798
|
-
params = [];
|
|
799
|
-
}
|
|
800
|
-
const rows = db.prepare(sql).all(...params);
|
|
801
|
-
// Guard against corrupt JSON — skip the row rather than crashing
|
|
1205
|
+
function parseEntryRows(rows, context) {
|
|
802
1206
|
const entries = [];
|
|
803
1207
|
for (const row of rows) {
|
|
804
1208
|
let entry;
|
|
@@ -806,7 +1210,7 @@ export function getAllEntries(db, entryType) {
|
|
|
806
1210
|
entry = JSON.parse(row.entry_json);
|
|
807
1211
|
}
|
|
808
1212
|
catch {
|
|
809
|
-
warn(`[db]
|
|
1213
|
+
warn(`[db] ${context}: skipping entry id=${row.id} — corrupt entry_json`);
|
|
810
1214
|
continue;
|
|
811
1215
|
}
|
|
812
1216
|
entries.push({
|
|
@@ -821,6 +1225,21 @@ export function getAllEntries(db, entryType) {
|
|
|
821
1225
|
}
|
|
822
1226
|
return entries;
|
|
823
1227
|
}
|
|
1228
|
+
/**
 * Load indexed entries, optionally restricted to a single entry type.
 *
 * @param db Open database connection exposing `prepare()`.
 * @param entryType Entry type to filter on. A falsy value or the literal
 *   `"any"` selects every row.
 * @returns The rows after being passed through `parseEntryRows` with the
 *   context label `"getAllEntries"`.
 */
export function getAllEntries(db, entryType) {
    const filtered = Boolean(entryType) && entryType !== "any";
    const sql = filtered
        ? "SELECT id, entry_key, dir_path, file_path, stash_dir, entry_json, search_text FROM entries WHERE entry_type = ?"
        : "SELECT id, entry_key, dir_path, file_path, stash_dir, entry_json, search_text FROM entries";
    const params = filtered ? [entryType] : [];
    return parseEntryRows(db.prepare(sql).all(...params), "getAllEntries");
}
|
|
824
1243
|
export function findEntryIdByRef(db, ref) {
|
|
825
1244
|
const parsed = parseAssetRef(ref);
|
|
826
1245
|
const nameVariants = [parsed.name];
|
|
@@ -866,28 +1285,7 @@ export function getEntriesByDir(db, dirPath) {
|
|
|
866
1285
|
const rows = db
|
|
867
1286
|
.prepare("SELECT id, entry_key, dir_path, file_path, stash_dir, entry_json, search_text FROM entries WHERE dir_path = ?")
|
|
868
1287
|
.all(dirPath);
|
|
869
|
-
|
|
870
|
-
const entries = [];
|
|
871
|
-
for (const row of rows) {
|
|
872
|
-
let entry;
|
|
873
|
-
try {
|
|
874
|
-
entry = JSON.parse(row.entry_json);
|
|
875
|
-
}
|
|
876
|
-
catch {
|
|
877
|
-
warn(`[db] getEntriesByDir: skipping entry id=${row.id} — corrupt entry_json`);
|
|
878
|
-
continue;
|
|
879
|
-
}
|
|
880
|
-
entries.push({
|
|
881
|
-
id: row.id,
|
|
882
|
-
entryKey: row.entry_key,
|
|
883
|
-
dirPath: row.dir_path,
|
|
884
|
-
filePath: row.file_path,
|
|
885
|
-
stashDir: row.stash_dir,
|
|
886
|
-
entry,
|
|
887
|
-
searchText: row.search_text,
|
|
888
|
-
});
|
|
889
|
-
}
|
|
890
|
-
return entries;
|
|
1288
|
+
return parseEntryRows(rows, "getEntriesByDir");
|
|
891
1289
|
}
|
|
892
1290
|
/**
|
|
893
1291
|
* Get the utility score for an entry, or undefined if none exists.
|
|
@@ -910,12 +1308,17 @@ export function getUtilityScore(db, entryId) {
|
|
|
910
1308
|
}
|
|
911
1309
|
/**
|
|
912
1310
|
* Batch-load utility scores for multiple entry IDs in a single query.
|
|
913
|
-
* Returns a
|
|
1311
|
+
* Returns a `{ global, scoped }` pair, both Maps keyed by entry_id.
|
|
1312
|
+
*
|
|
1313
|
+
* When `scopeKey` is provided a second query runs against
|
|
1314
|
+
* `utility_scores_scoped` and the result is returned as `scoped`.
|
|
1315
|
+
* Both maps are always present; `scoped` is empty when `scopeKey` is absent.
|
|
914
1316
|
*/
|
|
915
|
-
export function getUtilityScoresByIds(db, ids) {
|
|
1317
|
+
export function getUtilityScoresByIds(db, ids, scopeKey) {
|
|
1318
|
+
const global = new Map();
|
|
1319
|
+
const scoped = new Map();
|
|
916
1320
|
if (ids.length === 0)
|
|
917
|
-
return
|
|
918
|
-
const result = new Map();
|
|
1321
|
+
return { global, scoped };
|
|
919
1322
|
// Process in chunks to stay within SQLITE_MAX_VARIABLE_NUMBER
|
|
920
1323
|
for (let i = 0; i < ids.length; i += SQLITE_CHUNK_SIZE) {
|
|
921
1324
|
const chunk = ids.slice(i, i + SQLITE_CHUNK_SIZE);
|
|
@@ -924,7 +1327,7 @@ export function getUtilityScoresByIds(db, ids) {
|
|
|
924
1327
|
.prepare(`SELECT entry_id, utility, show_count, search_count, select_rate, last_used_at, updated_at FROM utility_scores WHERE entry_id IN (${placeholders})`)
|
|
925
1328
|
.all(...chunk);
|
|
926
1329
|
for (const row of rows) {
|
|
927
|
-
|
|
1330
|
+
global.set(row.entry_id, {
|
|
928
1331
|
entryId: row.entry_id,
|
|
929
1332
|
utility: row.utility,
|
|
930
1333
|
showCount: row.show_count,
|
|
@@ -934,8 +1337,21 @@ export function getUtilityScoresByIds(db, ids) {
|
|
|
934
1337
|
updatedAt: row.updated_at,
|
|
935
1338
|
});
|
|
936
1339
|
}
|
|
1340
|
+
if (scopeKey) {
|
|
1341
|
+
const scopedRows = db
|
|
1342
|
+
.prepare(`SELECT entry_id, scope_key, utility, last_used_at FROM utility_scores_scoped WHERE scope_key = ? AND entry_id IN (${placeholders})`)
|
|
1343
|
+
.all(scopeKey, ...chunk);
|
|
1344
|
+
for (const row of scopedRows) {
|
|
1345
|
+
scoped.set(row.entry_id, {
|
|
1346
|
+
entryId: row.entry_id,
|
|
1347
|
+
scopeKey: row.scope_key,
|
|
1348
|
+
utility: row.utility,
|
|
1349
|
+
lastUsedAt: row.last_used_at,
|
|
1350
|
+
});
|
|
1351
|
+
}
|
|
1352
|
+
}
|
|
937
1353
|
}
|
|
938
|
-
return
|
|
1354
|
+
return { global, scoped };
|
|
939
1355
|
}
|
|
940
1356
|
/**
|
|
941
1357
|
* Insert or update a utility score for an entry.
|
|
@@ -953,3 +1369,417 @@ export function upsertUtilityScore(db, entryId, data) {
|
|
|
953
1369
|
updated_at = datetime('now')
|
|
954
1370
|
`).run(entryId, data.utility, data.showCount, data.searchCount, data.selectRate, data.lastUsedAt ?? null);
|
|
955
1371
|
}
|
|
1372
|
+
/**
 * Fetch the cached LLM result stored for `assetRef` under `cacheVariant`.
 *
 * A hit requires both that a row exists and that its stored `body_hash`
 * equals `currentBodyHash`; a differing hash means the body changed after the
 * result was cached, so it is reported as a miss. On a miss the caller is
 * expected to invoke the LLM and write a fresh cache entry.
 *
 * @param db Open database connection exposing `prepare()`.
 * @param assetRef Cache key identifying the asset.
 * @param currentBodyHash Hash of the asset's current body.
 * @param cacheVariant Variant discriminator; defaults to the empty string.
 * @returns `{ assetRef, cacheVariant, bodyHash, resultJson, updatedAt }` on a
 *   hit, otherwise `undefined`.
 */
export function getLlmCacheEntry(db, assetRef, currentBodyHash, cacheVariant = "") {
    const lookup = db.prepare("SELECT asset_ref, cache_variant, body_hash, result_json, updated_at FROM llm_enrichment_cache WHERE asset_ref = ? AND cache_variant = ?");
    const cached = lookup.get(assetRef, cacheVariant);
    // No row, or a row for an older body: both count as a cache miss.
    if (!cached || cached.body_hash !== currentBodyHash) {
        return undefined;
    }
    const { asset_ref, cache_variant, body_hash, result_json, updated_at } = cached;
    return {
        assetRef: asset_ref,
        cacheVariant: cache_variant,
        bodyHash: body_hash,
        resultJson: result_json,
        updatedAt: updated_at,
    };
}
|
|
1397
|
+
/**
 * Batched counterpart of `getLlmCacheEntry`: load the cache rows for many
 * asset refs at once.
 *
 * Refs are queried in slices of `SQLITE_CHUNK_SIZE` with one `IN (...)` query
 * per slice, so the database is hit once per slice rather than once per ref.
 *
 * Unlike `getLlmCacheEntry`, this does NOT check the body hash — each caller
 * must compare `entry.bodyHash` with the current body hash itself.
 *
 * @param db Open database connection exposing `prepare()`.
 * @param refs Asset refs to look up; an empty list issues no queries.
 * @param cacheVariant Variant discriminator; defaults to the empty string.
 * @returns {Map} Map from asset ref to
 *   `{ assetRef, cacheVariant, bodyHash, resultJson, updatedAt }`.
 */
export function getLlmCacheEntriesByRefs(db, refs, cacheVariant = "") {
    const found = new Map();
    let offset = 0;
    while (offset < refs.length) {
        const batch = refs.slice(offset, offset + SQLITE_CHUNK_SIZE);
        offset += SQLITE_CHUNK_SIZE;
        const marks = batch.map(() => "?").join(", ");
        const query = db.prepare(`SELECT asset_ref, cache_variant, body_hash, result_json, updated_at FROM llm_enrichment_cache
           WHERE cache_variant = ? AND asset_ref IN (${marks})`);
        for (const cached of query.all(cacheVariant, ...batch)) {
            found.set(cached.asset_ref, {
                assetRef: cached.asset_ref,
                cacheVariant: cached.cache_variant,
                bodyHash: cached.body_hash,
                resultJson: cached.result_json,
                updatedAt: cached.updated_at,
            });
        }
    }
    return found;
}
|
|
1429
|
+
/**
 * Store (insert or overwrite) the cached LLM result for an asset ref.
 *
 * Rows are keyed by `(asset_ref, cache_variant)`; on conflict the body hash,
 * result payload and timestamp are replaced. `updated_at` is written as the
 * current `Date.now()` value (epoch milliseconds).
 *
 * @param db Open database connection exposing `prepare()`.
 * @param assetRef Cache key identifying the asset.
 * @param bodyHash Hash of the body the result was computed from.
 * @param resultJson Serialized result to cache.
 * @param cacheVariant Variant discriminator; defaults to the empty string.
 * @returns {void}
 */
export function upsertLlmCacheEntry(db, assetRef, bodyHash, resultJson, cacheVariant = "") {
    const upsert = db.prepare(`INSERT INTO llm_enrichment_cache (asset_ref, cache_variant, body_hash, result_json, updated_at)
     VALUES (?, ?, ?, ?, ?)
     ON CONFLICT(asset_ref, cache_variant) DO UPDATE SET
       body_hash = excluded.body_hash,
       result_json = excluded.result_json,
       updated_at = excluded.updated_at`);
    const writtenAt = Date.now();
    upsert.run(assetRef, cacheVariant, bodyHash, resultJson, writtenAt);
}
|
|
1440
|
+
/**
 * Delete LLM cache rows whose `asset_ref` no longer corresponds to any row
 * in the `entries` table. Intended for the cleanup phase of an index run so
 * the cache does not grow without bound as assets are removed.
 *
 * A cache row is kept when its `asset_ref` exactly equals either an
 * `entries.file_path` (graph/memory cache refs are absolute file paths) or an
 * `entries.entry_key` (enrichment cache refs are entry keys). The comparison
 * is an exact `NOT IN` match, not a `LIKE` pattern.
 *
 * Both subqueries filter out NULLs: in SQL, `x NOT IN (subquery)` is never
 * true once the subquery yields a NULL, so a single NULL `file_path` or
 * `entry_key` would otherwise silently turn this purge into a no-op.
 *
 * Errors are swallowed on purpose (best effort) — the cache table may be
 * absent on very old databases opened without `ensureSchema`.
 *
 * @param db Open database connection exposing `exec()`.
 * @returns {void}
 */
export function clearStaleCacheEntries(db) {
    try {
        db.exec(`
      DELETE FROM llm_enrichment_cache
      WHERE asset_ref NOT IN (SELECT file_path FROM entries WHERE file_path IS NOT NULL)
        AND asset_ref NOT IN (SELECT entry_key FROM entries WHERE entry_key IS NOT NULL)
    `);
    }
    catch {
        /* ignore — table may not exist in very old DBs opened without ensureSchema */
    }
}
|
|
1463
|
+
/**
 * Hash a body string with SHA-256 and return the digest as lowercase hex.
 * The digest serves as the `body_hash` value in `llm_enrichment_cache`.
 *
 * Hashing goes through `Bun.CryptoHasher`, which works synchronously, so the
 * function can be called from inside per-asset loops without awaiting.
 *
 * @param body Text to hash.
 * @returns {string} Hex-encoded SHA-256 digest of `body`.
 */
export function computeBodyHash(body) {
    const algorithm = "sha256";
    const sha = new Bun.CryptoHasher(algorithm);
    sha.update(body);
    const hexDigest = sha.digest("hex");
    return hexDigest;
}
|
|
1475
|
+
/**
 * Count `search` and `show` usage events per entry ref.
 *
 * Refs are queried in slices of `SQLITE_CHUNK_SIZE` to stay within SQLite's
 * bound-parameter limit. Used by the improve loop to spot assets that are
 * retrieved often but have no feedback.
 *
 * @param db Open database connection exposing `prepare()`.
 * @param refs Entry refs to count events for; an empty list issues no queries.
 * @returns {Map} Map from ref to event count, containing only refs that have
 *   at least one matching event.
 */
export function getRetrievalCounts(db, refs) {
    const counts = new Map();
    let offset = 0;
    while (offset < refs.length) {
        const batch = refs.slice(offset, offset + SQLITE_CHUNK_SIZE);
        offset += SQLITE_CHUNK_SIZE;
        const marks = batch.map(() => "?").join(", ");
        const query = db.prepare(`SELECT entry_ref, COUNT(*) AS cnt FROM usage_events
         WHERE event_type IN ('search','show') AND entry_ref IN (${marks})
         GROUP BY entry_ref`);
        for (const { entry_ref: ref, cnt } of query.all(...batch)) {
            counts.set(ref, cnt);
        }
    }
    return counts;
}
|
|
1498
|
+
/**
 * Apply a MemRL reward signal to a batch of entries using an exponential
 * moving average: `next = clamp(current + lr * (reward - current), 0, 1)`.
 *
 * All writes happen inside one transaction, so the bumps succeed or fail
 * together. Global scores are read once up front via
 * `getUtilityScoresByIds`; an entry with no stored score starts from 0.
 * On conflict only `utility` and `updated_at` are overwritten in
 * `utility_scores`.
 *
 * When `scopeKey` is truthy the same EMA step is also applied to the
 * `(entry_id, scope_key)` row in `utility_scores_scoped`, starting from the
 * value returned by `getScopedUtility`. The global table is always updated.
 *
 * Per the original design notes, the indexer (`akm index`) overwrites these
 * values on its next run — bumps are temporary hints between index runs, not
 * permanent overrides.
 *
 * @param db Open database connection exposing `transaction()` and `prepare()`.
 * @param entryIds Entry ids to bump; an empty list is a no-op.
 * @param reward Target value the score is pulled toward.
 * @param lr Learning rate of the EMA step; defaults to `0.1`.
 * @param scopeKey Optional scope for the additional scoped bump.
 * @returns {void}
 */
export function bumpUtilityScoresBatch(db, entryIds, reward, lr = 0.1, scopeKey) {
    if (entryIds.length === 0) {
        return;
    }
    // One EMA step toward `reward`, clamped into [0, 1].
    const blend = (current) => Math.max(0, Math.min(1, current + lr * (reward - current)));
    const applyBumps = db.transaction(() => {
        const globalScores = getUtilityScoresByIds(db, entryIds).global;
        const isoNow = new Date().toISOString();
        const epochNow = Date.now();
        const upsertGlobal = db.prepare(`INSERT INTO utility_scores (entry_id, utility, show_count, search_count, select_rate, last_used_at, updated_at)
       VALUES (?, ?, 0, 0, 0, ?, ?)
       ON CONFLICT(entry_id) DO UPDATE SET
         utility = excluded.utility,
         updated_at = excluded.updated_at`);
        // The scoped statement is prepared once, and only when a scope is given.
        let upsertScoped = null;
        if (scopeKey) {
            upsertScoped = db.prepare(`INSERT INTO utility_scores_scoped (entry_id, scope_key, utility, last_used_at)
           VALUES (?, ?, ?, ?)
           ON CONFLICT(entry_id, scope_key) DO UPDATE SET
             utility = excluded.utility,
             last_used_at = excluded.last_used_at`);
        }
        for (const entryId of entryIds) {
            const prior = globalScores.get(entryId)?.utility ?? 0;
            upsertGlobal.run(entryId, blend(prior), isoNow, isoNow);
            if (upsertScoped && scopeKey) {
                // The scoped score follows the same EMA from its own current value.
                const scopedPrior = getScopedUtility(db, entryId, scopeKey);
                upsertScoped.run(entryId, scopeKey, blend(scopedPrior), epochNow);
            }
        }
    });
    applyBumps();
}
|
|
1545
|
+
/**
 * Read the stored utility for one `(entry_id, scope_key)` pair from
 * `utility_scores_scoped`.
 *
 * @param db Open database connection exposing `prepare()`.
 * @param entryId Entry id to look up.
 * @param scopeKey Scope the score belongs to.
 * @returns The stored utility, or `0` when no row exists or the stored value
 *   is null/undefined.
 */
function getScopedUtility(db, entryId, scopeKey) {
    const lookup = db.prepare("SELECT utility FROM utility_scores_scoped WHERE entry_id = ? AND scope_key = ?");
    const stored = lookup.get(entryId, scopeKey);
    return stored?.utility ?? 0;
}
|
|
1555
|
+
// ── Indexer-phase helpers (moved from indexer.ts) ────────────────────────────
|
|
1556
|
+
/**
 * List the entries that still need an embedding vector.
 *
 * Selects every row of `entries` that has no row in `embeddings` with the same
 * id, skipping entries whose `entry_type` is 'vault'.
 *
 * @param db - Database handle exposing `prepare(sql).all()`.
 * @returns Rows shaped `{ id, searchText, entryKey, filePath }`.
 */
export function getAllEntriesForEmbedding(db) {
    const pendingQuery = db.prepare(`
    SELECT e.id, e.search_text AS searchText, e.entry_key AS entryKey, e.file_path AS filePath FROM entries e
    WHERE NOT EXISTS (SELECT 1 FROM embeddings b WHERE b.id = e.id)
      AND e.entry_type != 'vault'
  `);
    return pendingQuery.all();
}
|
|
1569
|
+
/**
 * Insert or refresh the `workflow_documents` row for an indexed entry.
 *
 * The document is stored as `JSON.stringify(doc)` together with its schema
 * version, its source path, a hash of `content` (from `computeSourceHash`) and
 * the current time as an ISO-8601 `updated_at`. When a row for `entryId`
 * already exists, every column except `entry_id` is overwritten.
 *
 * @param db - Database handle exposing `prepare(sql).run(...params)`.
 * @param entryId - Value for the `entry_id` conflict key.
 * @param doc - Workflow document; `doc.schemaVersion` and `doc.source.path` are read.
 * @param content - Source content handed to `computeSourceHash`.
 */
export function upsertWorkflowDocument(db, entryId, doc, content) {
    const sourceHash = computeSourceHash(content);
    const upsert = db.prepare(`INSERT INTO workflow_documents (entry_id, schema_version, document_json, source_path, source_hash, updated_at)
     VALUES (?, ?, ?, ?, ?, ?)
     ON CONFLICT(entry_id) DO UPDATE SET
       schema_version = excluded.schema_version,
       document_json = excluded.document_json,
       source_path = excluded.source_path,
       source_hash = excluded.source_hash,
       updated_at = excluded.updated_at`);
    // Bound in column order: entry_id, schema_version, document_json, source_path, source_hash, updated_at.
    const params = [
        entryId,
        doc.schemaVersion,
        JSON.stringify(doc),
        doc.source.path,
        sourceHash,
        new Date().toISOString(),
    ];
    upsert.run(...params);
}
|
|
1585
|
+
/**
 * Compute a cheap 32-bit FNV-1a hash of `content` for source-identity tracking.
 * Not security-sensitive; used as an incremental fast-path skip key.
 *
 * @param content - Bytes to hash: a Buffer, Uint8Array or array-like of byte
 *   values. A string is also accepted and is hashed as its UTF-8 encoding.
 * @returns Lowercase hexadecimal form of the unsigned 32-bit hash (not zero-padded).
 */
export function computeSourceHash(content) {
    // Indexing a string yields one-character strings, which `^=` coerces to 0
    // for anything but digits — the hash would then barely depend on the text.
    // Encode strings to bytes first so they hash like the equivalent Buffer.
    const bytes = typeof content === "string" ? new TextEncoder().encode(content) : content;
    let hash = 0x811c9dc5; // FNV-1a 32-bit offset basis
    for (let i = 0; i < bytes.length; i++) {
        hash ^= bytes[i];
        // Math.imul keeps the multiply by the FNV prime in wrapping 32-bit arithmetic.
        hash = Math.imul(hash, 0x01000193);
    }
    // `>>> 0` reinterprets the signed 32-bit value as unsigned before formatting.
    return (hash >>> 0).toString(16);
}
|
|
1597
|
+
/**
 * Return distinct zero-result search queries recorded in the `usage_events`
 * table within the given lookback window.
 *
 * Reads rows with `event_type = 'search'` whose metadata JSON has
 * `resultCount = 0` and whose `created_at` is at or after the cutoff, and
 * returns their `$.query` values (at most 20, NULL queries dropped).
 *
 * This is a best-effort helper: any failure — a missing table in an older
 * database, or a `sinceDays` that does not yield a valid date — results in an
 * empty array, so callers never need to guard against DB schema differences.
 *
 * @param db - Database handle exposing `prepare(sql).all(...params)`.
 * @param sinceDays - Size of the lookback window in days (default: 30).
 * @returns Array of query strings; empty on any error.
 */
export function getZeroResultSearches(db, sinceDays = 30) {
    try {
        // Computed inside the try: toISOString() throws RangeError when
        // sinceDays is NaN/Infinity or pushes the date out of range, and that
        // must degrade to [] like every other failure here.
        const since = new Date(Date.now() - sinceDays * 24 * 60 * 60 * 1000).toISOString();
        const rows = db
            .prepare(`SELECT DISTINCT json_extract(metadata, '$.query') AS query
         FROM usage_events
         WHERE event_type = 'search'
           AND created_at >= ?
           AND json_extract(metadata, '$.resultCount') = 0
         ORDER BY created_at DESC LIMIT 20`)
            .all(since);
        // json_extract yields NULL when the metadata has no `query` key.
        return rows.map((r) => r.query).filter((q) => q !== null);
    }
    catch {
        return []; // table may not exist in older DBs
    }
}
|
|
1623
|
+
/**
 * Look up an entry row by its type and name.
 *
 * Matches `entries` rows whose `entry_type` equals `type` and whose
 * `entry_key` equals `"<type>:<name>"`.
 *
 * @param db - Database handle exposing `prepare(sql).get(...params)`.
 * @param type - Entry type; also used as the prefix of the entry key.
 * @param name - Entry name; used as the suffix of the entry key.
 * @returns The matching row (`{ id }`), or whatever the driver's `get()`
 *   yields for "no row" (null or undefined, depending on the SQLite binding).
 */
export function getEntryByRef(db, type, name) {
    const lookup = db.prepare("SELECT id FROM entries WHERE entry_type = ? AND entry_key = ?");
    const entryKey = `${type}:${name}`;
    return lookup.get(type, entryKey);
}
|
|
1630
|
+
/**
 * MemRL learning rate for feedback-driven utility updates (F-5 / #386).
 *
 * Follows the bounded-step formula from MemRL (arXiv:2601.03192):
 *   next = clamp(current + lr × (reward − current), 0, 1)
 *
 * This replaces the unbounded `-0.03 × negativeCount` delta that could
 * silently remove high-utility assets from the improvement loop.
 */
const FEEDBACK_LR = 0.1;
/**
 * Reward used when a call carries only positive feedback.
 * Reward 1.0 means "fully correct / helpful".
 */
const FEEDBACK_REWARD_POSITIVE = 1.0;
/**
 * Reward used when a call carries only negative feedback.
 * Reward 0.0 means "not helpful" (lowest MemRL signal).
 */
const FEEDBACK_REWARD_NEGATIVE = 0.0;
/**
 * Largest downward utility step a single `applyFeedbackToUtilityScore` call
 * may apply, regardless of negativeCount.
 *
 * The cap is enforced per call, not per day or per session: each call can
 * lower utility by at most this amount, and separate calls are capped
 * independently of one another. It is meant to keep a noisy negative-feedback
 * stream from destroying a high-utility asset's ranking in one step.
 *
 * NOTE(review): with FEEDBACK_LR = 0.1 and utilities/rewards in [0, 1], the
 * uncapped step is never below -0.1, so this 0.15 cap would only bind if the
 * learning rate were raised — confirm that is intended.
 */
const MAX_NEG_DELTA_PER_CALL = 0.15;
/**
 * Utility threshold below which a review-needed escalation is triggered.
 * When a previously high-utility asset (≥ HIGH_UTILITY_THRESHOLD) drops
 * below this value, the caller should create an escalation proposal.
 */
export const UTILITY_REVIEW_THRESHOLD = 0.5;
/**
 * Utility level considered "high" — assets at or above this value are
 * tracked for threshold-crossing escalation. Currently equal to
 * UTILITY_REVIEW_THRESHOLD, so a crossing is any step from ≥ 0.5 to < 0.5.
 */
export const HIGH_UTILITY_THRESHOLD = 0.5;
|
|
1671
|
+
/**
 * Fold accumulated feedback counts into an entry's utility score using the
 * MemRL bounded-step EMA formula (F-5 / #386, arXiv:2601.03192):
 *
 *   reward   = 1.0 for positive-only, 0.0 for negative-only feedback,
 *              otherwise the count-weighted average of the two rewards
 *   nextUtil = clamp(currentUtil + lr × (reward − currentUtil), 0, 1)
 *
 * A downward step is additionally limited to {@link MAX_NEG_DELTA_PER_CALL}.
 * An entry without a stored utility starts from 0.5 before the step is applied.
 * When both counts are 0 nothing is written and the utility is returned unchanged.
 *
 * @param db - Database handle exposing `prepare(sql).run(...params)`.
 * @param entryId - Entry whose `utility_scores` row is read and upserted.
 * @param positiveCount - Number of positive feedback signals.
 * @param negativeCount - Number of negative feedback signals.
 * @returns `{ previousUtility, nextUtility, crossedReviewThreshold }`, where
 *   `crossedReviewThreshold` is true when the utility moved from
 *   ≥ HIGH_UTILITY_THRESHOLD to < UTILITY_REVIEW_THRESHOLD, so the caller can
 *   create an escalation proposal.
 */
export function applyFeedbackToUtilityScore(db, entryId, positiveCount, negativeCount) {
    const stored = getUtilityScore(db, entryId);
    const previousUtility = stored?.utility ?? 0.5;
    // No feedback at all: report the current value and leave the table untouched.
    if (positiveCount === 0 && negativeCount === 0) {
        return { previousUtility, nextUtility: previousUtility, crossedReviewThreshold: false };
    }
    let reward;
    if (positiveCount > 0 && negativeCount === 0) {
        reward = FEEDBACK_REWARD_POSITIVE;
    }
    else if (negativeCount > 0 && positiveCount === 0) {
        reward = FEEDBACK_REWARD_NEGATIVE;
    }
    else {
        // Mixed feedback: weight each reward by its share of the signals.
        const total = positiveCount + negativeCount;
        reward = (positiveCount * FEEDBACK_REWARD_POSITIVE + negativeCount * FEEDBACK_REWARD_NEGATIVE) / total;
    }
    // MemRL bounded-step EMA: lr × (reward − current); only downward steps are capped.
    const rawStep = FEEDBACK_LR * (reward - previousUtility);
    const step = rawStep < 0 ? Math.max(rawStep, -MAX_NEG_DELTA_PER_CALL) : rawStep;
    const nextUtility = Math.max(0, Math.min(1, previousUtility + step));
    const now = new Date().toISOString();
    const upsert = db.prepare(`
    INSERT INTO utility_scores (entry_id, utility, show_count, search_count, select_rate, last_used_at, updated_at)
    VALUES (?, ?, 0, 0, 0, ?, ?)
    ON CONFLICT(entry_id) DO UPDATE SET
      utility = ?,
      updated_at = ?
  `);
    // First four values feed the INSERT; the last two feed the conflict UPDATE.
    upsert.run(entryId, nextUtility, now, now, nextUtility, now);
    const wasHigh = previousUtility >= HIGH_UTILITY_THRESHOLD;
    const crossedReviewThreshold = wasHigh && nextUtility < UTILITY_REVIEW_THRESHOLD;
    return { previousUtility, nextUtility, crossedReviewThreshold };
}
|
|
1720
|
+
/**
 * Re-attach detached `usage_events` rows to entries via their `entry_ref`.
 *
 * For every event whose `entry_id` is NULL and whose `entry_ref` is set,
 * `entry_id` is assigned the id of one entry whose `entry_key` ends with
 * `":" + entry_ref`; events without such an entry keep a NULL `entry_id`.
 * Intended to run after a full rebuild, when entry ids have changed, so usage
 * history survives the reindex.
 *
 * Best-effort: any error raised while executing the statement (for example a
 * table that does not exist yet) is ignored.
 *
 * @param db - Database handle exposing `exec(sql)`.
 */
export function relinkUsageEvents(db) {
    const relinkSql = `
      UPDATE usage_events SET entry_id = (
        SELECT e.id FROM entries e
        WHERE substr(e.entry_key, length(e.entry_key) - length(usage_events.entry_ref)) = ':' || usage_events.entry_ref
        LIMIT 1
      )
      WHERE entry_id IS NULL AND entry_ref IS NOT NULL
    `;
    try {
        db.exec(relinkSql);
    }
    catch {
        /* ignore if table doesn't exist yet */
    }
}
|
|
1742
|
+
// ── registry_index_cache helpers ─────────────────────────────────────────────
|
|
1743
|
+
/**
 * Store (insert or overwrite) the cached index document for a registry in the
 * `registry_index_cache` table, stamping it with the current time.
 *
 * @param db - Open index.db connection (from openDatabase / openExistingDatabase).
 * @param registryUrl - Canonical URL of the registry (conflict key of the table).
 * @param indexJson - Serialised registry index document (JSON string).
 * @param opts.etag - HTTP ETag from the response; stored as NULL when absent.
 * @param opts.lastModified - HTTP Last-Modified from the response; stored as NULL when absent.
 */
export function upsertRegistryIndexCache(db, registryUrl, indexJson, opts) {
    const upsert = db.prepare(`
    INSERT INTO registry_index_cache (registry_url, fetched_at, etag, last_modified, index_json)
    VALUES (?, ?, ?, ?, ?)
    ON CONFLICT(registry_url) DO UPDATE SET
      fetched_at = excluded.fetched_at,
      etag = excluded.etag,
      last_modified = excluded.last_modified,
      index_json = excluded.index_json
  `);
    const fetchedAt = new Date().toISOString();
    // Missing validators are written as SQL NULL rather than left undefined.
    const etag = opts?.etag ?? null;
    const lastModified = opts?.lastModified ?? null;
    upsert.run(registryUrl, fetchedAt, etag, lastModified, indexJson);
}
|
|
1763
|
+
/**
 * Read the cached index document for a registry from `registry_index_cache`.
 *
 * The lookup is a cache miss (undefined) when no row exists, when the stored
 * `fetched_at` cannot be parsed as a date, or when the row is stale, i.e.
 * `Date.now() - Date.parse(fetched_at) > maxAgeMs`.
 *
 * @param db - Open index.db connection.
 * @param registryUrl - Canonical URL of the registry (lookup key).
 * @param maxAgeMs - Maximum age in milliseconds before the entry is stale (default: 1 hour).
 * @returns `{ indexJson, etag, lastModified }` for a fresh row, otherwise undefined.
 */
export function getRegistryIndexCache(db, registryUrl, maxAgeMs = 3_600_000 /* 1 hour */) {
    const cached = db
        .prepare(`SELECT fetched_at, etag, last_modified, index_json
       FROM registry_index_cache WHERE registry_url = ?`)
        .get(registryUrl);
    if (!cached) {
        return undefined;
    }
    const fetchedAtMs = Date.parse(cached.fetched_at);
    // An unparseable timestamp cannot be aged, so it is treated as a miss.
    if (Number.isNaN(fetchedAtMs)) {
        return undefined;
    }
    if (Date.now() - fetchedAtMs > maxAgeMs) {
        return undefined;
    }
    const { index_json: indexJson, etag, last_modified: lastModified } = cached;
    return { indexJson, etag, lastModified };
}
|