clawmem 0.6.0 → 0.7.1

@@ -0,0 +1,481 @@
+ /**
+  * Anti-contamination LLM synthesis wrapper (Ext 1).
+  *
+  * Three guardrails around Phase 3 deductive synthesis:
+  *
+  * 1. **Evidence filtering** — `collectRelevantEvidence` splits each
+  *    source doc's `facts` + `narrative` into sentences and keeps only
+  *    those with lexical overlap against the draft conclusion/premises.
+  *    The filtered evidence is fed to the validation LLM so it sees
+  *    only the parts of each source that actually matter.
+  *
+  * 2. **Relation context injection** — `buildSourceRelationContext`
+  *    queries `memory_relations` for edges AMONG the cited source docs
+  *    and formats them as structural context. This lets the LLM
+  *    cross-reference the graph shape alongside the raw text.
+  *
+  * 3. **Contamination scan** — `scanConclusionContamination` is the
+  *    primary safety check. It compares entities (or lexical anchors)
+  *    mentioned by the draft conclusion against the set of entities
+  *    present in the cited sources. Any mention of an entity that
+  *    exists in the BROADER candidate pool but NOT in the sources is
+  *    flagged as contamination — the LLM imported content from a doc
+  *    it wasn't supposed to reference. Entity-aware first (uses
+  *    `entity_mentions`), lexical fallback when entity state is thin.
+  *
+  * `validateDeductiveDraft` orchestrates all three: deterministic
+  * pre-checks → contamination scan → LLM validation/refinement. It
+  * never throws; a null LLM result is a soft fall-through that still
+  * honors the deterministic safety gates.
+  *
+  * Adapted from Thoth `dream_cycle.py:371-565` + `prompts.py:552-579`
+  * (THOTH_EXTRACTION_PLAN.md Extraction 1).
+  */
+
+ import type { Store } from "./store.ts";
+ import type { LLM } from "./llm.ts";
+ import { extractJsonFromLLM } from "./amem.ts";
+ import { extractSubjectAnchorsLexical } from "./text-similarity.ts";
+
+ // =============================================================================
+ // Types
+ // =============================================================================
+
+ /**
+  * A deductive draft as emitted by the Phase 3 draft-generation LLM
+  * call. Matches the shape of `extractJsonFromLLM` output for the
+  * existing draft prompt.
+  */
+ export interface DeductiveDraft {
+   conclusion: string;
+   premises: string[];
+   sourceIndices: number[];
+ }
+
+ /**
+  * Minimal doc shape the guardrails need. Kept narrow so the module can
+  * be tested without the full `Document` row type — any object with
+  * `id`, `title`, and optionally `facts`/`narrative` satisfies it.
+  */
+ export interface DocLike {
+   id: number;
+   title: string;
+   facts?: string | null;
+   narrative?: string | null;
+ }
+
+ export type ValidationRejectReason =
+   | "empty"
+   | "invalid_indices"
+   | "contamination"
+   | "unsupported"
+   | "null_llm";
+
+ export interface DeductiveValidation {
+   accepted: boolean;
+   conclusion?: string;
+   premises?: string[];
+   reason?: ValidationRejectReason;
+   contaminationHits?: string[];
+   contaminationMethod?: "entity" | "lexical";
+   /**
+    * True when `accepted === true` because the LLM validation path
+    * failed (null result, throw, or malformed JSON) and the deterministic
+    * pre-checks were treated as sufficient. Operators should track
+    * this separately from LLM-affirmed acceptances — a high
+    * fallback-accept rate means the LLM path is effectively disabled
+    * and deductions are only gated by the deterministic guardrails.
+    */
+   fallbackAccepted?: boolean;
+ }
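+
+ // Hypothetical operator-side tally (illustration only, not part of this
+ // module): split accepted verdicts by `fallbackAccepted` so a dead LLM
+ // path shows up as a rising "accept_fallback" count.
+ function tallyValidationOutcome(
+   stats: Record<string, number>,
+   v: DeductiveValidation
+ ): void {
+   const key = !v.accepted
+     ? `reject_${v.reason ?? "unknown"}`
+     : v.fallbackAccepted
+       ? "accept_fallback"
+       : "accept_llm";
+   stats[key] = (stats[key] ?? 0) + 1;
+ }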
+
+ // =============================================================================
+ // Evidence filtering
+ // =============================================================================
+
+ /**
+  * Split each source doc's `facts` + `narrative` into sentences, keep
+  * only sentences with lexical overlap against the draft conclusion or
+  * any premise (minimum 2 shared >3-char tokens). Returns the
+  * concatenated evidence text (for LLM context) and the raw sentence
+  * list (for further downstream validation or logging).
+  *
+  * Keeps evidence output bounded so long source docs don't blow the
+  * validation prompt budget.
+  */
+ export function collectRelevantEvidence(
+   sourceDocs: DocLike[],
+   draft: DeductiveDraft
+ ): { evidenceText: string; evidenceSentences: string[] } {
+   const draftTokens = new Set<string>();
+   const addTokens = (s: string) => {
+     for (const tok of s.toLowerCase().split(/\s+/)) {
+       if (tok.length > 3) draftTokens.add(tok);
+     }
+   };
+   addTokens(draft.conclusion);
+   for (const p of draft.premises ?? []) addTokens(p);
+
+   const relevant: string[] = [];
+   for (const doc of sourceDocs) {
+     const text = `${doc.facts ?? ""}\n${doc.narrative ?? ""}`;
+     const sentences = text
+       .split(/[.!?\n]+/)
+       .map((s) => s.trim())
+       .filter(Boolean);
+     for (const sentence of sentences) {
+       const sentenceTokens = new Set(
+         sentence
+           .toLowerCase()
+           .split(/\s+/)
+           .filter((t) => t.length > 3)
+       );
+       const overlap = [...sentenceTokens].filter((t) => draftTokens.has(t)).length;
+       if (overlap >= 2) {
+         relevant.push(`[doc#${doc.id}] ${sentence}`);
+       }
+     }
+   }
+
+   return {
+     evidenceText: relevant.join(". "),
+     evidenceSentences: relevant,
+   };
+ }
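+
+ // Hypothetical example (docs and draft invented for illustration): the doc#1
+ // sentence shares four long tokens with the draft ("auth", "service",
+ // "canary", "deploy") and survives the filter; the doc#2 sentence shares
+ // none and is dropped.
+ const EVIDENCE_EXAMPLE = collectRelevantEvidence(
+   [
+     { id: 1, title: "Deploy", facts: "The auth service returned 500 errors after the canary deploy." },
+     { id: 2, title: "Food", facts: "Lunch menu rotates weekly." },
+   ],
+   { conclusion: "The canary deploy broke the auth service", premises: [], sourceIndices: [0, 1] }
+ );
+ // EVIDENCE_EXAMPLE.evidenceSentences →
+ //   ["[doc#1] The auth service returned 500 errors after the canary deploy"]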
+
+ // =============================================================================
+ // Source relation context
+ // =============================================================================
+
+ /**
+  * Query `memory_relations` for edges AMONG the cited source docs and
+  * format them as a human-readable context string. Sorted by weight
+  * DESC, capped at `maxEdges` (default 10) to keep the prompt
+  * bounded. Returns the empty string when there are no edges or the
+  * query fails — callers treat that as "no structural context".
+  */
+ export function buildSourceRelationContext(
+   store: Store,
+   sourceDocIds: number[],
+   maxEdges: number = 10
+ ): string {
+   if (sourceDocIds.length < 2) return "";
+
+   const placeholders = sourceDocIds.map(() => "?").join(",");
+   let rows: {
+     source_id: number;
+     target_id: number;
+     relation_type: string;
+     weight: number;
+   }[];
+   try {
+     rows = store.db
+       .prepare(
+         `SELECT source_id, target_id, relation_type, weight
+          FROM memory_relations
+          WHERE source_id IN (${placeholders})
+            AND target_id IN (${placeholders})
+          ORDER BY weight DESC
+          LIMIT ?`
+       )
+       .all(...sourceDocIds, ...sourceDocIds, maxEdges) as typeof rows;
+   } catch {
+     return "";
+   }
+
+   if (rows.length === 0) return "";
+
+   return rows
+     .map(
+       (r) =>
+         `doc#${r.source_id} --[${r.relation_type} w=${r.weight.toFixed(2)}]--> doc#${r.target_id}`
+     )
+     .join("\n");
+ }
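+
+ // Hypothetical call (assumes a `store` whose memory_relations table links
+ // the cited docs; ids, relation types, and weights below are invented):
+ //
+ //   buildSourceRelationContext(store, [12, 15])
+ //   // → "doc#12 --[supports w=0.80]--> doc#15\n" +
+ //   //   "doc#15 --[elaborates w=0.65]--> doc#12"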
+
+ // =============================================================================
+ // Contamination scan
+ // =============================================================================
+
+ /**
+  * Scan a draft conclusion for "contamination" — content that appears
+  * in the broader candidate pool but NOT in the cited source docs.
+  *
+  * Entity-aware first: queries `entity_mentions` for both the source
+  * docs and the pool. When an entity is mentioned by the pool but not
+  * by the sources, look up its canonical name in `entity_nodes` and
+  * search for it in the conclusion (whole-word match, case-insensitive).
+  *
+  * Lexical fallback: when either side has zero entity mentions, extract
+  * proper-noun anchors from source text and pool text, find the set
+  * exclusive to the pool, and check whether the conclusion mentions
+  * any of them.
+  *
+  * Returns the list of contamination hits (anchor strings or entity
+  * names) and which path produced them.
+  */
+ export function scanConclusionContamination(
+   store: Store,
+   conclusion: string,
+   sourceDocIds: number[],
+   candidatePool: DocLike[]
+ ): { hits: string[]; method: "entity" | "lexical" } {
+   const candidateIds = candidatePool.map((d) => d.id);
+
+   const sourceEntities = getEntitiesForDocs(store, sourceDocIds);
+   const poolEntities = getEntitiesForDocs(store, candidateIds);
+
+   if (sourceEntities !== null && poolEntities !== null) {
+     const sourceSet = new Set(sourceEntities);
+     const outsideEntities = poolEntities.filter((e) => !sourceSet.has(e));
+     if (outsideEntities.length === 0) {
+       return { hits: [], method: "entity" };
+     }
+
+     let names: { entity_id: string; name: string }[];
+     try {
+       const placeholders = outsideEntities.map(() => "?").join(",");
+       names = store.db
+         .prepare(
+           `SELECT entity_id, name FROM entity_nodes WHERE entity_id IN (${placeholders})`
+         )
+         .all(...outsideEntities) as typeof names;
+     } catch {
+       return scanLexicalContamination(conclusion, sourceDocIds, candidatePool);
+     }
+
+     const lowerConclusion = conclusion.toLowerCase();
+     const hitSet = new Set<string>();
+     for (const n of names) {
+       const nameLC = n.name.toLowerCase();
+       // Use a custom non-alnum boundary instead of `\b` because `\b` fails
+       // for names that BEGIN or END with punctuation (`auth-service`,
+       // `OAuth2.0`, `C++`, `.NET`). `\b` requires one side to be a word
+       // character, so a trailing `+` in `c++` followed by whitespace
+       // produces no match (both sides non-word).
+       //
+       // Lookbehind/lookahead on `[^a-z0-9]` (plus start/end anchors)
+       // correctly matches the name when surrounded by anything that
+       // isn't alphanumeric — including punctuation, whitespace, and
+       // string boundaries.
+       const regex = new RegExp(
+         `(?<=^|[^a-z0-9])${escapeRegex(nameLC)}(?=$|[^a-z0-9])`
+       );
+       if (regex.test(lowerConclusion)) {
+         hitSet.add(n.name);
+       }
+     }
+     return { hits: [...hitSet], method: "entity" };
+   }
+
+   return scanLexicalContamination(conclusion, sourceDocIds, candidatePool);
+ }
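+
+ // Hypothetical walk-through (ids and names invented): the candidate pool
+ // contains a doc that mentions entity "Redis" while the cited sources
+ // (doc#12, doc#15) never do, so a conclusion that name-drops Redis is
+ // flagged on the entity path:
+ //
+ //   scanConclusionContamination(store, "Caching moved to Redis", [12, 15], pool)
+ //   // → { hits: ["Redis"], method: "entity" }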
+
+ /**
+  * Get the canonical entity IDs mentioned by a set of docs. Returns
+  * null when the docs have no entity_mentions at all — caller should
+  * fall back to lexical scan (apples-to-apples comparison).
+  */
+ function getEntitiesForDocs(store: Store, docIds: number[]): string[] | null {
+   if (docIds.length === 0) return [];
+   const placeholders = docIds.map(() => "?").join(",");
+   let rows: { entity_id: string }[];
+   try {
+     rows = store.db
+       .prepare(
+         `SELECT DISTINCT entity_id FROM entity_mentions WHERE doc_id IN (${placeholders})`
+       )
+       .all(...docIds) as typeof rows;
+   } catch {
+     return null;
+   }
+   if (rows.length === 0) return null;
+   return rows.map((r) => r.entity_id);
+ }
+
+ function scanLexicalContamination(
+   conclusion: string,
+   sourceDocIds: number[],
+   candidatePool: DocLike[]
+ ): { hits: string[]; method: "lexical" } {
+   const sourceSet = new Set(sourceDocIds);
+   const sourceDocs = candidatePool.filter((d) => sourceSet.has(d.id));
+   const outsideDocs = candidatePool.filter((d) => !sourceSet.has(d.id));
+
+   const sourceText = sourceDocs
+     .map((d) => `${d.title}\n${d.facts ?? ""}\n${d.narrative ?? ""}`)
+     .join("\n");
+   const outsideText = outsideDocs
+     .map((d) => `${d.title}\n${d.facts ?? ""}\n${d.narrative ?? ""}`)
+     .join("\n");
+
+   const sourceAnchors = new Set(extractSubjectAnchorsLexical(sourceText));
+   const outsideAnchors = extractSubjectAnchorsLexical(outsideText);
+
+   const exclusiveOutside = outsideAnchors.filter((a) => !sourceAnchors.has(a));
+   if (exclusiveOutside.length === 0) {
+     return { hits: [], method: "lexical" };
+   }
+
+   const conclusionAnchors = new Set(extractSubjectAnchorsLexical(conclusion));
+   const hits = [...new Set(exclusiveOutside.filter((a) => conclusionAnchors.has(a)))];
+   return { hits, method: "lexical" };
+ }
+
+ function escapeRegex(s: string): string {
+   return s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+ }
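+
+ // Boundary demo (hypothetical name, illustration only): `\b` misses names
+ // that end in punctuation, while the custom non-alnum boundary used above
+ // matches them.
+ const BOUNDARY_DEMO = {
+   naive: /\bc\+\+\b/.test("uses c++ daily"), // false: "+" and " " are both non-word
+   custom: new RegExp(
+     `(?<=^|[^a-z0-9])${escapeRegex("c++")}(?=$|[^a-z0-9])`
+   ).test("uses c++ daily"), // true
+ };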
+
+ // =============================================================================
+ // Validation orchestrator
+ // =============================================================================
+
+ const VALIDATION_PROMPT_TEMPLATE = `You are a logic validator. Check whether a proposed deductive conclusion is fully supported by the provided source evidence — and nothing beyond it.
+
+ Source evidence (only these observations are allowed):
+ {EVIDENCE}
+ {RELATIONS}
+ Proposed deduction:
+ Conclusion: {CONCLUSION}
+ Premises: {PREMISES}
+
+ Rules:
+ 1. If the evidence does not fully support the conclusion, reject.
+ 2. If the conclusion references anything NOT present in the source evidence, reject.
+ 3. If the conclusion is supported but could be phrased more precisely, return a revised conclusion.
+
+ Respond with ONLY a JSON object:
+ {"accepted": true|false, "conclusion": "revised or original", "premises": ["revised or original"], "reason": "brief"}
+
+ Do not include any other text. /no_think`;
+
+ /**
+  * Validate a deductive draft against its source docs.
+  *
+  * Pipeline:
+  * 1. Deterministic pre-checks:
+  *    - conclusion must be non-trivial (>= 10 chars after trim)
+  *    - source docs must resolve to >= 2 unique ids
+  * 2. Contamination scan — reject immediately on any hit.
+  * 3. LLM validation/refinement. On null/malformed JSON, fall back to
+  *    deterministic accept (the pre-checks already passed, so the
+  *    draft is structurally valid).
+  *
+  * Never throws. Returns `accepted: false` with a `reason` on any
+  * rejection so the caller can track per-reason counters in stats.
+  */
+ export async function validateDeductiveDraft(
+   store: Store,
+   llm: LLM,
+   draft: DeductiveDraft,
+   sourceDocs: DocLike[],
+   candidatePool: DocLike[]
+ ): Promise<DeductiveValidation> {
+   // Pre-check 1: non-trivial conclusion
+   if (!draft.conclusion?.trim() || draft.conclusion.trim().length < 10) {
+     return { accepted: false, reason: "empty" };
+   }
+
+   // Pre-check 2: at least 2 unique source docs
+   const uniqueSourceIds = [...new Set(sourceDocs.map((d) => d.id))];
+   if (uniqueSourceIds.length < 2) {
+     return { accepted: false, reason: "invalid_indices" };
+   }
+
+   // Contamination scan
+   const contamination = scanConclusionContamination(
+     store,
+     draft.conclusion,
+     uniqueSourceIds,
+     candidatePool
+   );
+   if (contamination.hits.length > 0) {
+     return {
+       accepted: false,
+       reason: "contamination",
+       contaminationHits: contamination.hits,
+       contaminationMethod: contamination.method,
+     };
+   }
+
+   // LLM validation / refinement
+   const evidence = collectRelevantEvidence(sourceDocs, draft);
+   const relationContext = buildSourceRelationContext(store, uniqueSourceIds);
+
+   const evidenceBlock =
+     evidence.evidenceText ||
+     sourceDocs
+       .map(
+         (d) =>
+           `[doc#${d.id}] ${d.title}: ${(d.facts ?? "").slice(0, 200)} ${(d.narrative ?? "").slice(0, 200)}`
+       )
+       .join("\n");
+
+   // Replacer callbacks keep `$`-patterns in doc text (e.g. "$&") from being
+   // expanded by String.prototype.replace's special replacement syntax.
+   const prompt = VALIDATION_PROMPT_TEMPLATE.replace("{EVIDENCE}", () => evidenceBlock)
+     .replace("{RELATIONS}", () =>
+       relationContext ? `\nRelations among sources:\n${relationContext}\n` : ""
+     )
+     .replace("{CONCLUSION}", () => draft.conclusion)
+     .replace("{PREMISES}", () => (draft.premises ?? []).join("; "));
+
+   let result;
+   try {
+     result = await llm.generate(prompt, { temperature: 0.2, maxTokens: 400 });
+   } catch {
+     // LLM call threw → deterministic accept (pre-checks already passed)
+     return {
+       accepted: true,
+       conclusion: draft.conclusion,
+       premises: draft.premises,
+       fallbackAccepted: true,
+     };
+   }
+
+   if (!result?.text) {
+     // LLM returned null (cooldown / remote down) → deterministic accept
+     return {
+       accepted: true,
+       conclusion: draft.conclusion,
+       premises: draft.premises,
+       fallbackAccepted: true,
+     };
+   }
+
+   const parsed = extractJsonFromLLM(result.text) as {
+     accepted?: unknown;
+     conclusion?: unknown;
+     premises?: unknown;
+     reason?: unknown;
+   } | null;
+
+   if (!parsed || typeof parsed.accepted !== "boolean") {
+     // Malformed → deterministic accept
+     return {
+       accepted: true,
+       conclusion: draft.conclusion,
+       premises: draft.premises,
+       fallbackAccepted: true,
+     };
+   }
+
+   if (!parsed.accepted) {
+     return {
+       accepted: false,
+       reason: "unsupported",
+       conclusion:
+         typeof parsed.conclusion === "string" ? parsed.conclusion : draft.conclusion,
+     };
+   }
+
+   // Accepted, possibly with LLM refinement
+   return {
+     accepted: true,
+     conclusion:
+       typeof parsed.conclusion === "string" && parsed.conclusion.trim()
+         ? parsed.conclusion
+         : draft.conclusion,
+     premises:
+       Array.isArray(parsed.premises) &&
+       parsed.premises.every((p) => typeof p === "string")
+         ? (parsed.premises as string[])
+         : draft.premises,
+   };
+ }
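+
+ // Hypothetical end-to-end sketch (illustration only; `store` and `llm` are
+ // assumed to come from the caller, and the docs/draft are invented):
+ async function exampleValidateDraft(store: Store, llm: LLM): Promise<void> {
+   const pool: DocLike[] = [
+     { id: 12, title: "Deploy log", facts: "Canary deploy finished at 14:02." },
+     { id: 15, title: "Incident", facts: "Auth errors spiked at 14:05." },
+     { id: 9, title: "Cache notes", facts: "Redis eviction was tuned last week." },
+   ];
+   const draft: DeductiveDraft = {
+     conclusion: "The 14:02 canary deploy likely caused the 14:05 auth error spike",
+     premises: ["canary deploy finished at 14:02", "auth errors spiked at 14:05"],
+     sourceIndices: [0, 1],
+   };
+   const verdict = await validateDeductiveDraft(
+     store,
+     llm,
+     draft,
+     pool.filter((d) => d.id !== 9), // the two cited sources
+     pool // the full candidate pool, including the uncited doc#9
+   );
+   // A Redis mention in the conclusion would be rejected as "contamination";
+   // here the verdict depends on the LLM, or on the deterministic fallback
+   // when the LLM path is unavailable.
+   void verdict;
+ }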