clawmem 0.7.0 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,481 @@
1
+ /**
2
+ * Anti-contamination LLM synthesis wrapper (Ext 1).
3
+ *
4
+ * Three guardrails around Phase 3 deductive synthesis:
5
+ *
6
+ * 1. **Evidence filtering** — `collectRelevantEvidence` splits each
7
+ * source doc's `facts` + `narrative` into sentences and keeps only
8
+ * those with lexical overlap against the draft conclusion/premises.
9
+ * The filtered evidence is fed to the validation LLM so it sees
10
+ * only the parts of each source that actually matter.
11
+ *
12
+ * 2. **Relation context injection** — `buildSourceRelationContext`
13
+ * queries `memory_relations` for edges AMONG the cited source docs
14
+ * and formats them as structural context. This lets the LLM
15
+ * cross-reference the graph shape alongside the raw text.
16
+ *
17
+ * 3. **Contamination scan** — `scanConclusionContamination` is the
18
+ * primary safety check. It compares entities (or lexical anchors)
19
+ * mentioned by the draft conclusion against the set of entities
20
+ * present in the cited sources. Any mention of an entity that
21
+ * exists in the BROADER candidate pool but NOT in the sources is
22
+ * flagged as contamination — the LLM imported content from a doc
23
+ * it wasn't supposed to reference. Entity-aware first (uses
24
+ * `entity_mentions`), lexical fallback when entity state is thin.
25
+ *
26
+ * `validateDeductiveDraft` orchestrates all three: deterministic
27
+ * pre-checks → contamination scan → LLM validation/refinement. Never
28
+ * throws, LLM null is a soft fall-through that still honors the
29
+ * deterministic safety gates.
30
+ *
31
+ * Adapted from Thoth `dream_cycle.py:371-565` + `prompts.py:552-579`
32
+ * (THOTH_EXTRACTION_PLAN.md Extraction 1).
33
+ */
34
+
35
+ import type { Store } from "./store.ts";
36
+ import type { LLM } from "./llm.ts";
37
+ import { extractJsonFromLLM } from "./amem.ts";
38
+ import { extractSubjectAnchorsLexical } from "./text-similarity.ts";
39
+
40
+ // =============================================================================
41
+ // Types
42
+ // =============================================================================
43
+
44
/**
 * A deductive draft as emitted by the Phase 3 draft-generation LLM
 * call. Matches the shape of `extractJsonFromLLM` output for the
 * existing draft prompt.
 */
export interface DeductiveDraft {
  // Natural-language conclusion proposed by the draft LLM.
  conclusion: string;
  // Supporting premise statements, one per entry.
  premises: string[];
  // Presumably indices into the candidate doc list cited by the draft;
  // not read by `validateDeductiveDraft`, which receives resolved docs.
  sourceIndices: number[];
}
54
+
55
/**
 * Minimal doc shape the guardrails need. Kept narrow so the module can
 * be tested without the full `Document` row type — any object with
 * `id`, `title`, and optionally `facts`/`narrative` satisfies it.
 */
export interface DocLike {
  // Row id; used for source/pool membership checks and `[doc#id]` labels.
  id: number;
  // Display title; folded into the lexical-scan text with facts/narrative.
  title: string;
  // Optional extracted-facts text; sentence-split during evidence filtering.
  facts?: string | null;
  // Optional narrative text; concatenated after `facts` when scanning.
  narrative?: string | null;
}
66
+
67
/**
 * Why a draft was rejected:
 * - "empty": conclusion missing or under 10 chars after trim.
 * - "invalid_indices": fewer than 2 unique source docs resolved.
 * - "contamination": conclusion mentions pool-only entities/anchors.
 * - "unsupported": the validation LLM rejected the deduction.
 * - "null_llm": declared for callers, but the validator in this module
 *   falls back to accept on a null LLM result rather than emitting it.
 */
export type ValidationRejectReason =
  | "empty"
  | "invalid_indices"
  | "contamination"
  | "unsupported"
  | "null_llm";
73
+
74
/** Outcome of `validateDeductiveDraft` — verdict plus audit detail. */
export interface DeductiveValidation {
  // Final verdict after deterministic gates and the optional LLM pass.
  accepted: boolean;
  // Possibly LLM-refined conclusion (present on accept and on LLM reject).
  conclusion?: string;
  // Possibly LLM-refined premises.
  premises?: string[];
  // Populated only when `accepted` is false.
  reason?: ValidationRejectReason;
  // Entity names / lexical anchors that triggered a contamination reject.
  contaminationHits?: string[];
  // Which scan path produced the hits.
  contaminationMethod?: "entity" | "lexical";
  /**
   * True when `accepted === true` because the LLM validation path
   * failed (null result, throw, or malformed JSON) and the deterministic
   * pre-checks were treated as sufficient. Operators should track
   * this separately from LLM-affirmed acceptances — a high
   * fallback-accept rate means the LLM path is effectively disabled
   * and deductions are only gated by the deterministic guardrails.
   */
  fallbackAccepted?: boolean;
}
91
+
92
+ // =============================================================================
93
+ // Evidence filtering
94
+ // =============================================================================
95
+
96
+ /**
97
+ * Split each source doc's `facts` + `narrative` into sentences, keep
98
+ * only sentences with lexical overlap against the draft conclusion or
99
+ * any premise (minimum 2 shared >3-char tokens). Returns the
100
+ * concatenated evidence text (for LLM context) and the raw sentence
101
+ * list (for further downstream validation or logging).
102
+ *
103
+ * Keeps evidence output bounded so long source docs don't blow the
104
+ * validation prompt budget.
105
+ */
106
+ export function collectRelevantEvidence(
107
+ sourceDocs: DocLike[],
108
+ draft: DeductiveDraft
109
+ ): { evidenceText: string; evidenceSentences: string[] } {
110
+ const draftTokens = new Set<string>();
111
+ const addTokens = (s: string) => {
112
+ for (const tok of s.toLowerCase().split(/\s+/)) {
113
+ if (tok.length > 3) draftTokens.add(tok);
114
+ }
115
+ };
116
+ addTokens(draft.conclusion);
117
+ for (const p of draft.premises ?? []) addTokens(p);
118
+
119
+ const relevant: string[] = [];
120
+ for (const doc of sourceDocs) {
121
+ const text = `${doc.facts ?? ""}\n${doc.narrative ?? ""}`;
122
+ const sentences = text
123
+ .split(/[.!?\n]+/)
124
+ .map((s) => s.trim())
125
+ .filter(Boolean);
126
+ for (const sentence of sentences) {
127
+ const sentenceTokens = new Set(
128
+ sentence
129
+ .toLowerCase()
130
+ .split(/\s+/)
131
+ .filter((t) => t.length > 3)
132
+ );
133
+ const overlap = [...sentenceTokens].filter((t) => draftTokens.has(t)).length;
134
+ if (overlap >= 2) {
135
+ relevant.push(`[doc#${doc.id}] ${sentence}`);
136
+ }
137
+ }
138
+ }
139
+
140
+ return {
141
+ evidenceText: relevant.join(". "),
142
+ evidenceSentences: relevant,
143
+ };
144
+ }
145
+
146
+ // =============================================================================
147
+ // Source relation context
148
+ // =============================================================================
149
+
150
+ /**
151
+ * Query `memory_relations` for edges AMONG the cited source docs and
152
+ * format them as a human-readable context string. Sorted by weight
153
+ * DESC, capped at `maxEdges` (default 10) to keep the prompt
154
+ * bounded. Returns the empty string when there are no edges or the
155
+ * query fails — callers treat that as "no structural context".
156
+ */
157
+ export function buildSourceRelationContext(
158
+ store: Store,
159
+ sourceDocIds: number[],
160
+ maxEdges: number = 10
161
+ ): string {
162
+ if (sourceDocIds.length < 2) return "";
163
+
164
+ const placeholders = sourceDocIds.map(() => "?").join(",");
165
+ let rows: {
166
+ source_id: number;
167
+ target_id: number;
168
+ relation_type: string;
169
+ weight: number;
170
+ }[];
171
+ try {
172
+ rows = store.db
173
+ .prepare(
174
+ `SELECT source_id, target_id, relation_type, weight
175
+ FROM memory_relations
176
+ WHERE source_id IN (${placeholders})
177
+ AND target_id IN (${placeholders})
178
+ ORDER BY weight DESC
179
+ LIMIT ?`
180
+ )
181
+ .all(...sourceDocIds, ...sourceDocIds, maxEdges) as typeof rows;
182
+ } catch {
183
+ return "";
184
+ }
185
+
186
+ if (rows.length === 0) return "";
187
+
188
+ return rows
189
+ .map(
190
+ (r) =>
191
+ `doc#${r.source_id} --[${r.relation_type} w=${r.weight.toFixed(2)}]--> doc#${r.target_id}`
192
+ )
193
+ .join("\n");
194
+ }
195
+
196
+ // =============================================================================
197
+ // Contamination scan
198
+ // =============================================================================
199
+
200
+ /**
201
+ * Scan a draft conclusion for "contamination" — content that appears
202
+ * in the broader candidate pool but NOT in the cited source docs.
203
+ *
204
+ * Entity-aware first: queries `entity_mentions` for both the source
205
+ * docs and the pool. When an entity is mentioned by the pool but not
206
+ * by the sources, look up its canonical name in `entity_nodes` and
207
+ * search for it in the conclusion (whole-word match, case-insensitive).
208
+ *
209
+ * Lexical fallback: when either side has zero entity mentions, extract
210
+ * proper-noun anchors from source text and pool text, find the set
211
+ * exclusive to the pool, and check whether the conclusion mentions
212
+ * any of them.
213
+ *
214
+ * Returns the list of contamination hits (anchor strings or entity
215
+ * names) and which path produced them.
216
+ */
217
+ export function scanConclusionContamination(
218
+ store: Store,
219
+ conclusion: string,
220
+ sourceDocIds: number[],
221
+ candidatePool: DocLike[]
222
+ ): { hits: string[]; method: "entity" | "lexical" } {
223
+ const candidateIds = candidatePool.map((d) => d.id);
224
+
225
+ const sourceEntities = getEntitiesForDocs(store, sourceDocIds);
226
+ const poolEntities = getEntitiesForDocs(store, candidateIds);
227
+
228
+ if (sourceEntities !== null && poolEntities !== null) {
229
+ const sourceSet = new Set(sourceEntities);
230
+ const outsideEntities = poolEntities.filter((e) => !sourceSet.has(e));
231
+ if (outsideEntities.length === 0) {
232
+ return { hits: [], method: "entity" };
233
+ }
234
+
235
+ let names: { entity_id: string; name: string }[];
236
+ try {
237
+ const placeholders = outsideEntities.map(() => "?").join(",");
238
+ names = store.db
239
+ .prepare(
240
+ `SELECT entity_id, name FROM entity_nodes WHERE entity_id IN (${placeholders})`
241
+ )
242
+ .all(...outsideEntities) as typeof names;
243
+ } catch {
244
+ return scanLexicalContamination(conclusion, sourceDocIds, candidatePool);
245
+ }
246
+
247
+ const lowerConclusion = conclusion.toLowerCase();
248
+ const hitSet = new Set<string>();
249
+ for (const n of names) {
250
+ const nameLC = n.name.toLowerCase();
251
+ // Use a custom non-alnum boundary instead of `\b` because `\b` fails
252
+ // for names that BEGIN or END with punctuation (`auth-service`,
253
+ // `OAuth2.0`, `C++`, `.NET`). `\b` requires one side to be a word
254
+ // character, so a trailing `+` in `c++` followed by whitespace
255
+ // produces no match (both sides non-word).
256
+ //
257
+ // Lookbehind/lookahead on `[^a-z0-9]` (plus start/end anchors)
258
+ // correctly matches the name when surrounded by anything that
259
+ // isn't alphanumeric — including punctuation, whitespace, and
260
+ // string boundaries.
261
+ const regex = new RegExp(
262
+ `(?<=^|[^a-z0-9])${escapeRegex(nameLC)}(?=$|[^a-z0-9])`
263
+ );
264
+ if (regex.test(lowerConclusion)) {
265
+ hitSet.add(n.name);
266
+ }
267
+ }
268
+ return { hits: [...hitSet], method: "entity" };
269
+ }
270
+
271
+ return scanLexicalContamination(conclusion, sourceDocIds, candidatePool);
272
+ }
273
+
274
+ /**
275
+ * Get the canonical entity IDs mentioned by a set of docs. Returns
276
+ * null when the docs have no entity_mentions at all — caller should
277
+ * fall back to lexical scan (apples-to-apples comparison).
278
+ */
279
+ function getEntitiesForDocs(store: Store, docIds: number[]): string[] | null {
280
+ if (docIds.length === 0) return [];
281
+ const placeholders = docIds.map(() => "?").join(",");
282
+ let rows: { entity_id: string }[];
283
+ try {
284
+ rows = store.db
285
+ .prepare(
286
+ `SELECT DISTINCT entity_id FROM entity_mentions WHERE doc_id IN (${placeholders})`
287
+ )
288
+ .all(...docIds) as typeof rows;
289
+ } catch {
290
+ return null;
291
+ }
292
+ if (rows.length === 0) return null;
293
+ return rows.map((r) => r.entity_id);
294
+ }
295
+
296
+ function scanLexicalContamination(
297
+ conclusion: string,
298
+ sourceDocIds: number[],
299
+ candidatePool: DocLike[]
300
+ ): { hits: string[]; method: "lexical" } {
301
+ const sourceSet = new Set(sourceDocIds);
302
+ const sourceDocs = candidatePool.filter((d) => sourceSet.has(d.id));
303
+ const outsideDocs = candidatePool.filter((d) => !sourceSet.has(d.id));
304
+
305
+ const sourceText = sourceDocs
306
+ .map((d) => `${d.title}\n${d.facts ?? ""}\n${d.narrative ?? ""}`)
307
+ .join("\n");
308
+ const outsideText = outsideDocs
309
+ .map((d) => `${d.title}\n${d.facts ?? ""}\n${d.narrative ?? ""}`)
310
+ .join("\n");
311
+
312
+ const sourceAnchors = new Set(extractSubjectAnchorsLexical(sourceText));
313
+ const outsideAnchors = extractSubjectAnchorsLexical(outsideText);
314
+
315
+ const exclusiveOutside = outsideAnchors.filter((a) => !sourceAnchors.has(a));
316
+ if (exclusiveOutside.length === 0) {
317
+ return { hits: [], method: "lexical" };
318
+ }
319
+
320
+ const conclusionAnchors = new Set(extractSubjectAnchorsLexical(conclusion));
321
+ const hits = [...new Set(exclusiveOutside.filter((a) => conclusionAnchors.has(a)))];
322
+ return { hits, method: "lexical" };
323
+ }
324
+
325
+ function escapeRegex(s: string): string {
326
+ return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
327
+ }
328
+
329
+ // =============================================================================
330
+ // Validation orchestrator
331
+ // =============================================================================
332
+
333
// Prompt for the validation/refinement LLM call. The placeholders
// {EVIDENCE}, {RELATIONS}, {CONCLUSION}, {PREMISES} are substituted by
// `validateDeductiveDraft`; {RELATIONS} expands to "" when the cited
// sources share no graph edges. The trailing `/no_think` is presumably
// a backend directive suppressing the model's reasoning mode — confirm
// against the LLM integration.
const VALIDATION_PROMPT_TEMPLATE = `You are a logic validator. Check whether a proposed deductive conclusion is fully supported by the provided source evidence — and nothing beyond it.

Source evidence (only these observations are allowed):
{EVIDENCE}
{RELATIONS}
Proposed deduction:
Conclusion: {CONCLUSION}
Premises: {PREMISES}

Rules:
1. If the evidence does not fully support the conclusion, reject.
2. If the conclusion references anything NOT present in the source evidence, reject.
3. If the conclusion is supported but could be phrased more precisely, return a revised conclusion.

Respond with ONLY a JSON object:
{"accepted": true|false, "conclusion": "revised or original", "premises": ["revised or original"], "reason": "brief"}

Do not include any other text. /no_think`;
351
+
352
+ /**
353
+ * Validate a deductive draft against its source docs.
354
+ *
355
+ * Pipeline:
356
+ * 1. Deterministic pre-checks:
357
+ * - conclusion must be non-trivial (>= 10 chars after trim)
358
+ * - source docs must resolve to >= 2 unique ids
359
+ * 2. Contamination scan — reject immediately on any hit.
360
+ * 3. LLM validation/refinement. On null/malformed JSON, fall back to
361
+ * deterministic accept (the pre-checks already passed, so the
362
+ * draft is structurally valid).
363
+ *
364
+ * Never throws. Returns `accepted: false` with a `reason` on any
365
+ * rejection so the caller can track per-reason counters in stats.
366
+ */
367
+ export async function validateDeductiveDraft(
368
+ store: Store,
369
+ llm: LLM,
370
+ draft: DeductiveDraft,
371
+ sourceDocs: DocLike[],
372
+ candidatePool: DocLike[]
373
+ ): Promise<DeductiveValidation> {
374
+ // Pre-check 1: non-trivial conclusion
375
+ if (!draft.conclusion?.trim() || draft.conclusion.trim().length < 10) {
376
+ return { accepted: false, reason: "empty" };
377
+ }
378
+
379
+ // Pre-check 2: at least 2 unique source docs
380
+ const uniqueSourceIds = [...new Set(sourceDocs.map((d) => d.id))];
381
+ if (uniqueSourceIds.length < 2) {
382
+ return { accepted: false, reason: "invalid_indices" };
383
+ }
384
+
385
+ // Contamination scan
386
+ const contamination = scanConclusionContamination(
387
+ store,
388
+ draft.conclusion,
389
+ uniqueSourceIds,
390
+ candidatePool
391
+ );
392
+ if (contamination.hits.length > 0) {
393
+ return {
394
+ accepted: false,
395
+ reason: "contamination",
396
+ contaminationHits: contamination.hits,
397
+ contaminationMethod: contamination.method,
398
+ };
399
+ }
400
+
401
+ // LLM validation / refinement
402
+ const evidence = collectRelevantEvidence(sourceDocs, draft);
403
+ const relationContext = buildSourceRelationContext(store, uniqueSourceIds);
404
+
405
+ const evidenceBlock =
406
+ evidence.evidenceText ||
407
+ sourceDocs
408
+ .map(
409
+ (d) =>
410
+ `[doc#${d.id}] ${d.title}: ${(d.facts ?? "").slice(0, 200)} ${(d.narrative ?? "").slice(0, 200)}`
411
+ )
412
+ .join("\n");
413
+
414
+ const prompt = VALIDATION_PROMPT_TEMPLATE.replace("{EVIDENCE}", evidenceBlock)
415
+ .replace("{RELATIONS}", relationContext ? `\nRelations among sources:\n${relationContext}\n` : "")
416
+ .replace("{CONCLUSION}", draft.conclusion)
417
+ .replace("{PREMISES}", (draft.premises ?? []).join("; "));
418
+
419
+ let result;
420
+ try {
421
+ result = await llm.generate(prompt, { temperature: 0.2, maxTokens: 400 });
422
+ } catch {
423
+ // LLM call threw → deterministic accept (pre-checks already passed)
424
+ return {
425
+ accepted: true,
426
+ conclusion: draft.conclusion,
427
+ premises: draft.premises,
428
+ fallbackAccepted: true,
429
+ };
430
+ }
431
+
432
+ if (!result?.text) {
433
+ // LLM returned null (cooldown / remote down) → deterministic accept
434
+ return {
435
+ accepted: true,
436
+ conclusion: draft.conclusion,
437
+ premises: draft.premises,
438
+ fallbackAccepted: true,
439
+ };
440
+ }
441
+
442
+ const parsed = extractJsonFromLLM(result.text) as {
443
+ accepted?: unknown;
444
+ conclusion?: unknown;
445
+ premises?: unknown;
446
+ reason?: unknown;
447
+ } | null;
448
+
449
+ if (!parsed || typeof parsed.accepted !== "boolean") {
450
+ // Malformed → deterministic accept
451
+ return {
452
+ accepted: true,
453
+ conclusion: draft.conclusion,
454
+ premises: draft.premises,
455
+ fallbackAccepted: true,
456
+ };
457
+ }
458
+
459
+ if (!parsed.accepted) {
460
+ return {
461
+ accepted: false,
462
+ reason: "unsupported",
463
+ conclusion:
464
+ typeof parsed.conclusion === "string" ? parsed.conclusion : draft.conclusion,
465
+ };
466
+ }
467
+
468
+ // Accepted, possibly with LLM refinement
469
+ return {
470
+ accepted: true,
471
+ conclusion:
472
+ typeof parsed.conclusion === "string" && parsed.conclusion.trim()
473
+ ? parsed.conclusion
474
+ : draft.conclusion,
475
+ premises:
476
+ Array.isArray(parsed.premises) &&
477
+ parsed.premises.every((p) => typeof p === "string")
478
+ ? (parsed.premises as string[])
479
+ : draft.premises,
480
+ };
481
+ }
@@ -58,6 +58,17 @@ const NUDGE_INTERVAL = parseInt(process.env.CLAWMEM_NUDGE_INTERVAL || "15", 10);
58
58
  const LIFECYCLE_HOOK_NAMES = ["memory_pin", "memory_forget", "memory_snooze", "lifecycle-archive"];
59
59
  const NUDGE_TEXT = "You haven't managed memory recently. If vault-context is surfacing noise → snooze it. If a critical decision was just made → pin it. If stale knowledge appeared → forget it.";
60
60
 
61
// Ext 6a: Context instruction + relationship snippets
// The instruction is ALWAYS prepended when the hook emits context — it frames
// the surfaced facts as background knowledge the agent already holds, reducing
// prompt-level ambiguity. Relationship snippets are fetched from the vault
// knowledge graph for edges where BOTH endpoints are in the surfaced doc set.
const INSTRUCTION_TEXT = "Treat the following as background facts you already know unless the user corrects them.";
// The instruction as emitted, wrapped in its <instruction> XML tag.
const INSTRUCTION_XML = `<instruction>${INSTRUCTION_TEXT}</instruction>`;
// Token cost of the wrapped instruction, reserved up front from the budget.
const INSTRUCTION_TOKEN_COST = estimateTokens(INSTRUCTION_XML);
// Cost of an empty <relationships> wrapper — charged before any snippet line.
const RELATIONSHIPS_XML_OVERHEAD_TOKENS = estimateTokens("<relationships>\n\n</relationships>");
// Hard cap on relation edges fetched per surfacing turn.
const MAX_RELATION_SNIPPETS = 10;
71
+
61
72
  // File path patterns to extract from prompts (E13 replacement: file-aware UserPromptSubmit)
62
73
  const FILE_PATH_RE = /(?:^|\s)((?:\/[\w.@-]+)+(?:\.\w+)?|[\w.@-]+\.(?:ts|js|py|md|sh|yaml|yml|json|toml|rs|go|tsx|jsx|css|html))\b/g;
63
74
 
@@ -349,8 +360,13 @@ export async function contextSurfacing(
349
360
  }
350
361
  }
351
362
 
352
- // Build context within token budget (profile-driven)
353
- const { context, paths, tokens } = buildContext(scored, prompt, tokenBudget);
363
+ // Build context within token budget (profile-driven).
364
+ // Ext 6a: Reserve budget for the always-on instruction line so the final
365
+ // vault-context payload stays within `tokenBudget`. Relations are layered
366
+ // in afterward using whatever budget remains and are the first thing
367
+ // truncated when the payload would overflow.
368
+ const factsBudget = Math.max(0, tokenBudget - INSTRUCTION_TOKEN_COST);
369
+ const { context, paths, tokens } = buildContext(scored, prompt, factsBudget);
354
370
 
355
371
  if (!context) {
356
372
  logEmptyTurn(store, input);
@@ -417,9 +433,29 @@ export async function contextSurfacing(
417
433
  // Memory nudge: periodically remind agent to use lifecycle tools
418
434
  const nudge = NUDGE_INTERVAL > 0 ? shouldNudge(store) : null;
419
435
 
436
+ // Ext 6a: Enrich vault-context with instruction framing + optional
437
+ // relationship snippets sourced from memory_relations. Only edges where
438
+ // BOTH endpoints are in the surfaced doc set are included. The relations
439
+ // block is the first thing dropped when the payload would overflow budget.
440
+ //
441
+ // Budget accounting (Turn 11 fix): `tokens` from buildContext only sums per-
442
+ // entry bodies and misses both the `<facts>...</facts>` wrapper and the
443
+ // `\n\n---\n\n` separators between entries. Compute the wrapped-facts cost
444
+ // directly from the rendered string so the relationships block can never
445
+ // push the final `<vault-context>` inner payload past `tokenBudget`.
446
+ const surfacedDocIds = lookupSurfacedDocIds(store, paths);
447
+ const relationSnippets = fetchRelationSnippets(store, surfacedDocIds);
448
+ const factsBlockXml = `<facts>\n${context}\n</facts>`;
449
+ const factsWrappedTokens = estimateTokens(factsBlockXml);
450
+ const relationBudget = Math.max(
451
+ 0,
452
+ tokenBudget - INSTRUCTION_TOKEN_COST - factsWrappedTokens
453
+ );
454
+ const vaultInner = buildVaultContextInner(context, relationSnippets, relationBudget);
455
+
420
456
  const parts: string[] = [];
421
457
  if (routingHint) parts.push(`<vault-routing>${routingHint}</vault-routing>`);
422
- parts.push(`<vault-context>\n${context}\n</vault-context>`);
458
+ parts.push(`<vault-context>\n${vaultInner}\n</vault-context>`);
423
459
  if (nudge) parts.push(`<vault-nudge>${NUDGE_TEXT}</vault-nudge>`);
424
460
 
425
461
  return makeContextOutput("context-surfacing", parts.join("\n"));
@@ -522,6 +558,148 @@ function buildContext(
522
558
  };
523
559
  }
524
560
 
561
+ // =============================================================================
562
+ // Ext 6a: Relationship snippets + instruction framing
563
+ // =============================================================================
564
+
565
/**
 * Relationship snippet derived from a memory_relations edge whose source and
 * target are both active documents currently surfaced by the context hook.
 */
export interface RelationSnippet {
  // Title of the edge's source document (documents.title).
  sourceTitle: string;
  // Title of the edge's target document (documents.title).
  targetTitle: string;
  // Raw memory_relations.relation_type label, rendered verbatim.
  relationType: string;
}
574
+
575
+ /**
576
+ * Resolve surfaced display paths back to document ids so the relation query
577
+ * can filter memory_relations edges to the surfaced set. Silently drops paths
578
+ * that don't match an active row in the general vault (e.g. skill-vault paths
579
+ * or deactivated docs) — fail-open, never throws.
580
+ */
581
+ export function lookupSurfacedDocIds(
582
+ store: Store,
583
+ displayPaths: string[]
584
+ ): number[] {
585
+ if (displayPaths.length === 0) return [];
586
+ try {
587
+ const placeholders = displayPaths.map(() => "?").join(",");
588
+ const rows = store.db
589
+ .prepare(
590
+ `SELECT id FROM documents
591
+ WHERE active = 1
592
+ AND (collection || '/' || path) IN (${placeholders})`
593
+ )
594
+ .all(...displayPaths) as Array<{ id: number }>;
595
+ return rows.map((r) => r.id);
596
+ } catch {
597
+ return [];
598
+ }
599
+ }
600
+
601
+ /**
602
+ * Fetch relationship snippets for edges where BOTH endpoints are in the
603
+ * surfaced doc set. Returns an empty list on empty input, zero/one surfaced
604
+ * docs, self-loops, or any DB error (fail-open, never throws). Results are
605
+ * ordered by relation weight DESC then recency so the most salient edges
606
+ * survive budget truncation.
607
+ */
608
+ export function fetchRelationSnippets(
609
+ store: Store,
610
+ surfacedDocIds: number[],
611
+ limit: number = MAX_RELATION_SNIPPETS
612
+ ): RelationSnippet[] {
613
+ if (surfacedDocIds.length < 2) return [];
614
+ try {
615
+ const placeholders = surfacedDocIds.map(() => "?").join(",");
616
+ const rows = store.db
617
+ .prepare(
618
+ `SELECT mr.relation_type,
619
+ ds.title AS source_title,
620
+ dt.title AS target_title
621
+ FROM memory_relations mr
622
+ JOIN documents ds ON ds.id = mr.source_id AND ds.active = 1
623
+ JOIN documents dt ON dt.id = mr.target_id AND dt.active = 1
624
+ WHERE mr.source_id IN (${placeholders})
625
+ AND mr.target_id IN (${placeholders})
626
+ AND mr.source_id != mr.target_id
627
+ ORDER BY mr.weight DESC, mr.created_at DESC
628
+ LIMIT ?`
629
+ )
630
+ .all(...surfacedDocIds, ...surfacedDocIds, limit) as Array<{
631
+ relation_type: string;
632
+ source_title: string;
633
+ target_title: string;
634
+ }>;
635
+ return rows.map((r) => ({
636
+ sourceTitle: r.source_title,
637
+ targetTitle: r.target_title,
638
+ relationType: r.relation_type,
639
+ }));
640
+ } catch {
641
+ return [];
642
+ }
643
+ }
644
+
645
+ /**
646
+ * Render relationship snippets as bullet lines, sanitizing titles to block
647
+ * prompt-injection via metadata fields. Lines that become filtered-content
648
+ * markers after sanitization are dropped.
649
+ */
650
+ export function renderRelationshipLines(
651
+ relations: RelationSnippet[]
652
+ ): string[] {
653
+ const FILTERED = "[content filtered for security]";
654
+ const out: string[] = [];
655
+ for (const r of relations) {
656
+ const src = sanitizeSnippet(r.sourceTitle);
657
+ const tgt = sanitizeSnippet(r.targetTitle);
658
+ if (src === FILTERED || tgt === FILTERED) continue;
659
+ out.push(`- ${src} --[${r.relationType}]--> ${tgt}`);
660
+ }
661
+ return out;
662
+ }
663
+
664
+ /**
665
+ * Assemble the inner body of <vault-context>: always instruction + facts,
666
+ * optionally relationships when at least one line fits in the remaining
667
+ * budget. Relationships are the first thing dropped — if the relationships
668
+ * XML wrapper alone would exceed `remainingBudgetTokens`, the whole block
669
+ * is omitted rather than emitting an empty wrapper.
670
+ */
671
+ export function buildVaultContextInner(
672
+ factsBlock: string,
673
+ relations: RelationSnippet[],
674
+ remainingBudgetTokens: number
675
+ ): string {
676
+ const lines: string[] = [];
677
+ lines.push(INSTRUCTION_XML);
678
+ lines.push(`<facts>\n${factsBlock}\n</facts>`);
679
+
680
+ if (relations.length === 0 || remainingBudgetTokens <= 0) {
681
+ return lines.join("\n");
682
+ }
683
+
684
+ const relationLines = renderRelationshipLines(relations);
685
+ if (relationLines.length === 0) return lines.join("\n");
686
+
687
+ // The XML wrapper itself consumes tokens — if there's no room for even one
688
+ // line on top of the wrapper, drop the block entirely.
689
+ const fittedLines: string[] = [];
690
+ let used = RELATIONSHIPS_XML_OVERHEAD_TOKENS;
691
+ for (const line of relationLines) {
692
+ const lineTokens = estimateTokens(line + "\n");
693
+ if (used + lineTokens > remainingBudgetTokens) break;
694
+ fittedLines.push(line);
695
+ used += lineTokens;
696
+ }
697
+ if (fittedLines.length === 0) return lines.join("\n");
698
+
699
+ lines.push(`<relationships>\n${fittedLines.join("\n")}\n</relationships>`);
700
+ return lines.join("\n");
701
+ }
702
+
525
703
  /**
526
704
  * Check if the agent should be nudged to use lifecycle tools.
527
705
  * Returns true if N+ context-surfacing invocations have occurred since the