clawmem 0.8.4 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,506 @@
1
+ /**
2
+ * §11.1 — `<vault-facts>` KG injection for context-surfacing (v0.9.0)
3
+ *
4
+ * Prompt-only entity detection + exact-match validation + triple query +
5
+ * token-budgeted XML serialization. Wires the SPO knowledge graph
6
+ * (populated by v0.8.5 decision-extractor + A-MEM enrichment) into the
7
+ * retrieval hot path without ever reading from ranked documents.
8
+ *
9
+ * Hard constraint from the approved design (§11.1, prompt-only seeding):
10
+ * entity seeds come from `input.prompt` text ONLY, never from ranked
11
+ * document bodies or snippets. Without this, a topic-boosted off-topic
12
+ * doc (§11.4) could pollute the `<vault-facts>` block with facts about
13
+ * entities that have nothing to do with the user's actual prompt.
14
+ *
15
+ * Three-path candidate generation (BACKLOG §11.1 "Concrete implementation"):
16
+ * (a) canonical-ID regex: unanchored scan for `vault:type:slug` tokens, where the slug may contain interior hyphens (see `CANONICAL_ID_REGEX`)
17
+ * (b) proper-noun extraction (capitalized tokens + all-caps acronyms)
18
+ * (c) normalized n-gram scan against entity_nodes.name (1-3 grams,
19
+ * keep internal hyphens whole, batch SQL lookup via
20
+ * `WHERE LOWER(name) IN (?, ?, ...) AND vault = ?` backed by
21
+ * the `idx_entity_nodes_lower_name` expression index added in
22
+ * the v0.9.0 schema migration)
23
+ *
24
+ * Per-path validate-then-count ordering (Codex §11.1 Turn 5):
25
+ * - Path (a): validate via direct PK lookup, count immediately.
26
+ * - Path (b): validate via `resolveEntityTypeExact` BEFORE counting —
27
+ * only non-null results consume budget. Raw capitalized tokens that
28
+ * fail validation are dropped silently without starving path (c).
29
+ * - Path (c): validated hits fill remaining slots up to the 100-cap.
30
+ * Within path (c): 3-grams > 2-grams > 1-grams; prompt order is
31
+ * the final tie-breaker within each length class.
32
+ *
33
+ * Cross-path dedup: path (b) / (c) hits that resolve to the same
34
+ * entity_id as an earlier path (a) hit are no-ops — they do not
35
+ * consume a second cap slot.
36
+ *
37
+ * Fail-open discipline (BACKLOG §11.1 "CRITICAL fail-open requirement"):
38
+ * - Empty prompt / zero candidates → return [] (caller skips stage).
39
+ * - SQLite error during any lookup → caught per-candidate, silent skip.
40
+ * - Empty triples for every validated entity → return null from
41
+ * buildVaultFactsBlock (caller omits the block entirely).
42
+ * - Token budget too small to fit even one triple → return null.
43
+ * - Exact-match ambiguity (two entities with the same name) → skip
44
+ * that entity via `resolveEntityTypeExact` returning null.
45
+ */
46
+
47
+ import type { Database } from "bun:sqlite";
48
+ import { resolveEntityTypeExact, ensureEntityCanonical } from "./entity.ts";
49
+
50
+ // =============================================================================
51
+ // Constants
52
+ // =============================================================================
53
+
54
+ /** Hard upper bound on the number of VALIDATED entities returned per prompt, shared by extraction paths (a), (b) and (c). */
55
+ const CANDIDATE_CAP = 100;
56
+
57
+ /** Maximum n-gram length (inclusive). 3-grams provide the best recall
58
+ * vs. signal trade-off per the Codex Turn 3 analysis; 4-grams dilute. Keep in sync with the `1 | 2 | 3` union on `NgramCandidate.length`. */
59
+ const MAX_NGRAM_LEN = 3;
60
+
61
+ /**
62
+ * Canonical entity ID shape: `vault:type:slug`, lowercase only. The slug may
63
+ * contain interior hyphens (e.g. `skill:tool:forge-stack`) but must begin and
64
+ * end with `[a-z0-9_]`. The lookbehind and lookahead reject a match that touches
65
+ * a letter, digit, `_` or `-`, so a trailing `.` or `,` is simply left outside the match.
66
+ */
67
+ const CANONICAL_ID_REGEX = /(?<![a-zA-Z0-9_-])[a-z][a-z0-9-]*:[a-z_]+:[a-z0-9_](?:[a-z0-9_-]*[a-z0-9_])?(?![a-zA-Z0-9_-])/g;
68
+
69
+ /**
70
+ * Proper-noun shape (ASCII letters only): a capitalized word with optional
71
+ * further capitalized segments, OR a run of 2+ capitals with an optional lowercase/digit tail. Matches `ClawMem`, `OAuth`, `API`, `Bun`,
72
+ * `PostgreSQL`, `JWT`, etc. Intentionally does NOT match lowercase
73
+ * technical identifiers like `clawmem`, `forge-stack`, `oauth2` — those
74
+ * are the job of path (c) n-gram scanning.
75
+ */
76
+ const PROPER_NOUN_REGEX = /\b(?:[A-Z][a-z0-9]+(?:[A-Z][a-z0-9]*)*|[A-Z]{2,}[a-z0-9]*)\b/g;
77
+
78
+ // =============================================================================
79
+ // Types
80
+ // =============================================================================
81
+
82
+ export interface ValidatedEntity {
83
+ /** Canonical `vault:type:slug` entity id for querying triples. */
84
+ entityId: string;
85
+ /** Name kept for dedup / audit: the canonical id itself for path (a) hits, otherwise the lowercased prompt text that matched. */
86
+ name: string;
87
+ /** Entity type: `entity_nodes.entity_type` for path (a) hits, otherwise the value returned by `resolveEntityTypeExact`. */
88
+ type: string;
89
+ /** Which extraction path surfaced this candidate (for debugging). */
90
+ sourcePath: "canonical-id" | "proper-noun" | "ngram";
91
+ }
92
+
93
+ export interface NgramCandidate {
94
+ /** Lowercase / whitespace-normalized n-gram text; tokens are joined with a single space. */
95
+ normalized: string;
96
+ /** N-gram length: 1, 2, or 3. Used for the longer-first tie-breaker. */
97
+ length: 1 | 2 | 3;
98
+ /** Index of the n-gram's first token in the tokenized prompt, for stable prompt-order tie-break. */
99
+ promptOrder: number;
100
+ }
101
+
102
+ /** Lightweight shape of a knowledge-graph triple the caller needs for serialization. `validTo` null (or empty) means the fact is treated as current; `confidence` is carried along but not read anywhere in this file. */
103
+ export interface VaultFactsTriple {
104
+ subject: string;
105
+ predicate: string;
106
+ object: string;
107
+ validTo: string | null;
108
+ confidence: number;
109
+ }
110
+
111
+ /** Function shape used to query triples for a single entity id.
112
+ * Decoupled from `Store` so unit tests can inject a mock. A throwing implementation is tolerated by `buildVaultFactsBlock`, which skips that entity. */
113
+ export type TripleQueryFn = (entityId: string) => VaultFactsTriple[];
114
+
115
+ // =============================================================================
116
+ // Path (a) — canonical-ID regex
117
+ // =============================================================================
118
+
/**
 * Path (a), syntactic pass: collect every substring of the prompt that has
 * the canonical `vault:type:slug` shape described by `CANONICAL_ID_REGEX`.
 *
 * No database access happens here; existence is checked by the caller.
 *
 * @param prompt Raw prompt text.
 * @returns Distinct matches in order of first appearance; `[]` when the
 *          prompt is empty or nothing matches.
 */
export function extractCanonicalIds(prompt: string): string[] {
  if (!prompt) return [];
  // A Set iterates in insertion order, so it removes repeats while keeping
  // the position of each id's first occurrence.
  const distinctIds = new Set<string>(prompt.match(CANONICAL_ID_REGEX) ?? []);
  return Array.from(distinctIds);
}
135
+
136
+ // =============================================================================
137
+ // Path (b) — proper-noun extraction
138
+ // =============================================================================
139
+
/**
 * Path (b), syntactic pass: collect every token of the prompt that looks
 * like a proper noun or acronym according to `PROPER_NOUN_REGEX`.
 *
 * Nothing is validated against the database here. The caller runs
 * `resolveEntityTypeExact` on each returned token and only counts the ones
 * that resolve against its budget.
 *
 * @param prompt Raw prompt text.
 * @returns Distinct tokens (case-sensitive) in order of first appearance;
 *          `[]` when the prompt is empty or nothing matches.
 */
export function extractProperNouns(prompt: string): string[] {
  if (!prompt) return [];
  const found = prompt.match(PROPER_NOUN_REGEX);
  if (found === null) return [];
  // Keep only the first occurrence of each token, preserving prompt order.
  return found.filter((token, index) => found.indexOf(token) === index);
}
158
+
159
+ // =============================================================================
160
+ // Path (c) — normalized n-gram scan
161
+ // =============================================================================
162
+
/**
 * Split a prompt into the word tokens used for n-gram generation.
 *
 * 1. Split on runs of whitespace and the separator characters
 *    `, ; : ! ? " ' ( ) [ ] { } < >` and the backtick.
 * 2. Trim every leading / trailing character that is not a letter, digit or
 *    combining mark from each piece, so `forge-stack.` becomes `forge-stack`.
 * 3. Drop pieces that end up empty.
 *
 * Characters inside a token are never touched, so internal hyphens stay
 * whole (`forge-stack` is one token, not two).
 *
 * Edge trimming is Unicode-aware: non-ASCII letters at a token edge are kept
 * (`café`, `Élan`) instead of being cut off. Hyphens at a token edge are
 * trimmed like any other punctuation, so a free-standing dash (`a - b`) no
 * longer becomes a token of its own.
 *
 * @param prompt Raw prompt text.
 * @returns Tokens in prompt order with original casing; `[]` for an empty prompt.
 */
function tokenizeForNgrams(prompt: string): string[] {
  if (!prompt) return [];
  return prompt
    .split(/[\s,;:!?"'`()\[\]{}<>]+/)
    // \p{L} letters, \p{N} digits, \p{M} combining marks (decomposed accents).
    .map(piece => piece.replace(/^[^\p{L}\p{N}\p{M}]+|[^\p{L}\p{N}\p{M}]+$/gu, ""))
    .filter(piece => piece.length > 0);
}
176
+
/**
 * Build the 1-, 2- and 3-token windows of a prompt for path (c).
 *
 * Each window is normalized (lowercased, trimmed, inner whitespace collapsed
 * to one space) and emitted at most once; when the same normalized text
 * occurs several times, the first occurrence wins. Output order is all
 * 1-grams in prompt order, then all 2-grams, then all 3-grams. Ordering by
 * priority (longer first) is left to the caller.
 *
 * @param prompt Raw prompt text.
 * @returns Unique candidates with their window length and the index of the
 *          window's first token.
 */
export function generateNgramCandidates(prompt: string): NgramCandidate[] {
  const words = tokenizeForNgrams(prompt);
  const emitted = new Set<string>();
  const candidates: NgramCandidate[] = [];

  for (let size = 1; size <= MAX_NGRAM_LEN; size += 1) {
    const lastStart = words.length - size;
    for (let start = 0; start <= lastStart; start += 1) {
      const normalized = words
        .slice(start, start + size)
        .join(" ")
        .toLowerCase()
        .trim()
        .replace(/\s+/g, " ");
      if (normalized.length === 0 || emitted.has(normalized)) continue;
      emitted.add(normalized);
      candidates.push({ normalized, length: size as 1 | 2 | 3, promptOrder: start });
    }
  }

  return candidates;
}
201
+
/**
 * Look up many normalized names in `entity_nodes` with one parameterized
 * query (`LOWER(name) IN (...) AND vault = ?`).
 *
 * The candidate list is deduplicated and then truncated to its first 500
 * entries before the query is built, so callers should pass names in
 * priority order. Candidates are compared against `LOWER(name)`, so they are
 * expected to be lowercase already.
 *
 * Fail-open: if preparing or running the statement throws, the error is
 * swallowed and an empty map is returned.
 *
 * @param db         Database handle; only `prepare(sql).all(...params)` is used.
 * @param candidates Normalized names to look up.
 * @param vault      Vault to restrict the lookup to. Defaults to `"default"`.
 * @returns Map from `LOWER(name)` to the matching row's id and type. When
 *          several rows share a lowercased name, the last row returned wins.
 */
export function batchLookupNames(
  db: Database,
  candidates: string[],
  vault: string = "default"
): Map<string, { entityId: string; entityType: string }> {
  type Row = { lname: string; entity_id: string; entity_type: string };

  const found = new Map<string, { entityId: string; entityType: string }>();
  if (candidates.length === 0) return found;

  // Bound the IN clause independently of the caller's budget accounting:
  // 500 leaves headroom above the validated-entity cap while keeping a
  // worst-case prompt from producing one giant statement.
  const names = [...new Set(candidates)].slice(0, 500);
  const sql = `
    SELECT DISTINCT LOWER(name) AS lname, entity_id, entity_type
    FROM entity_nodes
    WHERE LOWER(name) IN (${names.map(() => "?").join(", ")})
      AND vault = ?
  `;

  try {
    const rows = db.prepare(sql).all(...names, vault) as Row[];
    rows.forEach(({ lname, entity_id, entity_type }) => {
      found.set(lname, { entityId: entity_id, entityType: entity_type });
    });
  } catch {
    /* fail-open: behave as if the batch returned zero hits */
  }
  return found;
}
254
+
255
+ // =============================================================================
256
+ // Main entity extraction — three-path, validate-then-count, 100-cap
257
+ // =============================================================================
258
+
/**
 * Resolve the entities mentioned in a prompt into validated knowledge-graph
 * seeds. Three extraction paths share one budget of `CANDIDATE_CAP`
 * validated entities.
 *
 * Only `prompt` is inspected — never ranked documents, surfaced snippets or
 * any other retrieval-phase field (the §11.1 prompt-only hard constraint).
 *
 * Paths, in budget order:
 *  (a) canonical `vault:type:slug` ids, confirmed by a lookup on
 *      `entity_nodes.entity_id` within the vault;
 *  (b) proper-noun tokens, confirmed by `resolveEntityTypeExact` and mapped
 *      to an id by `ensureEntityCanonical`;
 *  (c) 1–3 token n-grams, matched by `batchLookupNames` and confirmed by
 *      `resolveEntityTypeExact`, longest n-gram first, then prompt order.
 *
 * Validate-then-count: only candidates that pass validation consume budget.
 * An entity id is emitted at most once; a later path that resolves to an id
 * already emitted is a no-op. Every per-candidate lookup is wrapped in
 * try/catch and a throwing candidate is skipped (fail-open).
 *
 * @param prompt Raw prompt text.
 * @param db     Database handle passed through to the lookups.
 * @param vault  Vault to resolve entities in. Defaults to `"default"`.
 * @returns Validated entities in emission order; `[]` for an empty prompt or
 *          when nothing validates.
 */
export function extractPromptEntities(
  prompt: string,
  db: Database,
  vault: string = "default"
): ValidatedEntity[] {
  if (!prompt) return [];

  const validated: ValidatedEntity[] = [];
  const seenEntityIds = new Set<string>();

  // --------------------------------------------------------------------
  // Path (a): canonical-ID regex → direct lookup by entity_id.
  // --------------------------------------------------------------------
  for (const id of extractCanonicalIds(prompt)) {
    if (validated.length >= CANDIDATE_CAP) break;
    if (seenEntityIds.has(id)) continue;
    try {
      const row = db
        .prepare(`SELECT entity_id, entity_type FROM entity_nodes WHERE entity_id = ? AND vault = ?`)
        .get(id, vault) as { entity_id: string; entity_type: string } | undefined;
      if (!row) continue; // id-shaped text that is not an entity in this vault
      seenEntityIds.add(id);
      validated.push({
        entityId: id,
        name: id,
        type: row.entity_type,
        sourcePath: "canonical-id",
      });
    } catch {
      /* fail-open per candidate */
    }
  }

  // --------------------------------------------------------------------
  // Path (b): proper nouns → validate-then-count via resolveEntityTypeExact.
  // A null result (zero or several matches) is skipped without consuming
  // budget. ensureEntityCanonical then supplies the canonical id.
  // NOTE(review): per the original author's note, ensureEntityCanonical is
  // effectively read-only in production and only inserts when the FTS index
  // is out of sync — confirm against ./entity.ts.
  // --------------------------------------------------------------------
  for (const name of extractProperNouns(prompt)) {
    if (validated.length >= CANDIDATE_CAP) break;
    try {
      const type = resolveEntityTypeExact(db, name, vault);
      if (!type) continue;
      const entityId = ensureEntityCanonical(db, name, type, vault);
      if (!entityId) continue;
      if (seenEntityIds.has(entityId)) continue; // cross-path dedup
      seenEntityIds.add(entityId);
      validated.push({
        entityId,
        name: name.toLowerCase(),
        type,
        sourcePath: "proper-noun",
      });
    } catch {
      /* fail-open per candidate */
    }
  }

  // --------------------------------------------------------------------
  // Path (c): n-gram scan → batch SQL → per-candidate validation → fill the
  // remaining budget.
  // --------------------------------------------------------------------
  if (validated.length < CANDIDATE_CAP) {
    // Order by budget priority BEFORE the lookup: longer n-grams first
    // (3 → 2 → 1), then prompt order. generateNgramCandidates emits 1-grams
    // first and batchLookupNames keeps only the first 500 distinct names, so
    // passing generation order would drop exactly the highest-priority
    // n-grams on long prompts. Sorting a copy keeps the generator's result
    // untouched.
    const ngrams = [...generateNgramCandidates(prompt)].sort(
      (a, b) => b.length - a.length || a.promptOrder - b.promptOrder
    );
    const hits = batchLookupNames(db, ngrams.map(g => g.normalized), vault);

    // Candidates are already in priority order, so validated hits can be
    // emitted directly and the loop can stop as soon as the budget is full.
    for (const gram of ngrams) {
      if (validated.length >= CANDIDATE_CAP) break;
      const hit = hits.get(gram.normalized);
      if (!hit) continue;
      if (seenEntityIds.has(hit.entityId)) continue; // dedup within and across paths

      try {
        // The batch query only proves that some row has this name; the exact
        // resolver enforces the exactly-one-entity constraint.
        const confirmedType = resolveEntityTypeExact(db, gram.normalized, vault);
        if (!confirmedType) continue;
        seenEntityIds.add(hit.entityId);
        validated.push({
          entityId: hit.entityId,
          name: gram.normalized,
          type: confirmedType,
          sourcePath: "ngram",
        });
      } catch {
        /* fail-open per candidate */
      }
    }
  }

  return validated;
}
405
+
406
+ // =============================================================================
407
+ // Vault-facts block builder
408
+ // =============================================================================
409
+
/** Optional knobs for `buildVaultFactsBlock`. */
export interface BuildVaultFactsOptions {
  /** Cap on triples taken per entity (applied after the current-only filter). Default 10; values below 0 are treated as 0. */
  maxTriplesPerEntity?: number;
  /** Token estimator. Defaults to a ~4 characters per token heuristic. */
  estimateTokens?: (s: string) => number;
  /** ISO "now" used to filter `validTo > now`. Defaults to `new Date().toISOString()`. */
  now?: string;
}

/** Default token estimate: one token per started group of four characters. */
const DEFAULT_ESTIMATE_TOKENS = (s: string): number => Math.ceil(s.length / 4);

/**
 * Build the `<vault-facts>` block for a set of validated entities within a
 * token budget. One `subject predicate object` line is emitted per triple.
 *
 * Returns null (the caller omits the block) when:
 *  - there are no entities;
 *  - the budget is zero, negative or NaN;
 *  - no current triple remains after filtering, including the case where
 *    the query callback throws or returns a non-array for every entity;
 *  - the budget cannot hold the tags plus at least one triple line.
 *
 * Truncation is at the triple boundary: lines are added in order until the
 * next one would exceed the budget, never mid-triple and never an empty block.
 *
 * The database is not touched here; triples come from `queryTriples`, so
 * tests can inject a mock.
 *
 * NOTE(review): subject / predicate / object are written verbatim, with no
 * XML escaping or newline handling — confirm the stored values cannot
 * contain markup.
 *
 * @param entities     Validated entities to seed the triple queries.
 * @param queryTriples Callback returning the triples for one entity id.
 * @param budgetTokens Maximum estimated tokens for the whole block.
 * @param options      See `BuildVaultFactsOptions`.
 * @returns The serialized block, or null as described above.
 */
export function buildVaultFactsBlock(
  entities: ValidatedEntity[],
  queryTriples: TripleQueryFn,
  budgetTokens: number,
  options: BuildVaultFactsOptions = {}
): string | null {
  if (entities.length === 0) return null;
  // `!(x > 0)` also rejects NaN, which would otherwise make every budget
  // comparison below false and let the block grow without bound.
  if (!(budgetTokens > 0)) return null;

  // Clamp at 0: a negative cap would make `slice(0, cap)` drop items from
  // the END of the list instead of limiting the count.
  const maxPerEntity = Math.max(0, options.maxTriplesPerEntity ?? 10);
  const estimate = options.estimateTokens ?? DEFAULT_ESTIMATE_TOKENS;
  const now = options.now ?? new Date().toISOString();

  // Compare instants, not strings: lexicographic order is only right when
  // both timestamps share one exact format. Offsets ("+02:00") or a space
  // instead of "T" would otherwise misclassify facts. When either side
  // cannot be parsed, fall back to the plain string comparison.
  const nowMs = Date.parse(now);
  const isCurrent = (validTo: string | null): boolean => {
    if (!validTo) return true; // null / empty = open-ended fact
    const validToMs = Date.parse(validTo);
    if (Number.isNaN(nowMs) || Number.isNaN(validToMs)) return validTo > now;
    return validToMs > nowMs;
  };

  // Collect current triples from all entities, deduping across entities by
  // (subject, predicate, object). When both endpoints of a triple are
  // validated entities, the same fact comes back from both queries and
  // would otherwise be emitted, and charged against the budget, twice.
  const lines: string[] = [];
  const seen = new Set<string>();
  for (const entity of entities) {
    let triples: VaultFactsTriple[];
    try {
      const result: unknown = queryTriples(entity.entityId);
      // A callback that yields nothing usable counts as "no facts".
      triples = Array.isArray(result) ? (result as VaultFactsTriple[]) : [];
    } catch {
      continue; // fail-open per entity
    }

    // Per-entity cap so one chatty entity cannot monopolize the shared budget.
    const current = triples.filter(t => isCurrent(t.validTo)).slice(0, maxPerEntity);

    for (const t of current) {
      const key = `${t.subject}\u0000${t.predicate}\u0000${t.object}`;
      if (seen.has(key)) continue;
      seen.add(key);
      lines.push(`${t.subject} ${t.predicate} ${t.object}`);
    }
  }

  if (lines.length === 0) return null;

  // Token-bounded serialization: start from the cost of the two tags (with
  // their newlines) and greedily add lines until the next would overflow.
  const OPEN = "<vault-facts>\n";
  const CLOSE = "\n</vault-facts>";
  const OVERHEAD = estimate(OPEN + CLOSE);
  if (OVERHEAD >= budgetTokens) return null;

  const outLines: string[] = [];
  let runningTokens = OVERHEAD;
  for (const line of lines) {
    const lineTokens = estimate(line) + 1; // +1 for the line's newline
    if (runningTokens + lineTokens > budgetTokens) break;
    outLines.push(line);
    runningTokens += lineTokens;
  }

  if (outLines.length === 0) return null;

  return `${OPEN}${outLines.join("\n")}${CLOSE}`;
}