clawmem 0.6.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,364 @@
1
+ /**
2
+ * Text similarity + merge safety gate for consolidation.
3
+ *
4
+ * Prevents semantic collision between topics that share vocabulary but
5
+ * refer to different subjects (e.g., two observations about "Dan" vs
6
+ * "Dad" or "Bob" vs "Rob"). Adds a dual-threshold safety check after
7
+ * the cheap Jaccard candidate-generation step.
8
+ *
9
+ * Entity-aware first: uses `entity_mentions` when both sides have canonical
10
+ * entities resolved. Lexical fallback via proper-noun anchor regex when
11
+ * either side lacks entity state. Strictest default when both sides are
12
+ * empty (no anchors at all).
13
+ *
14
+ * Adapted from Thoth `dream_cycle.py:218-272` subject-name guard
15
+ * (THOTH_EXTRACTION_PLAN.md Extraction 3).
16
+ */
17
+
18
+ import type { Store } from "./store.ts";
19
+
20
+ // =============================================================================
21
+ // Config — dual-threshold merge safety
22
+ // =============================================================================
23
+
24
+ /**
25
+ * NORMAL threshold: applies when anchor sets are compatible (subset or
26
+ * high overlap — same primary subject) AND the gate is gating on text
27
+ * similarity alone. Overridable via `CLAWMEM_MERGE_SCORE_NORMAL` env var
28
+ * for operator calibration during rollout.
29
+ *
30
+ * ⚠ Threshold is inherited from Thoth's `dream_cycle.py:218-272` guard,
31
+ * which uses Python's `difflib.SequenceMatcher` (character-level LCS).
32
+ * ClawMem uses normalized character 3-gram cosine, which is systematically
33
+ * harsher on benign rephrasings (word-order changes, synonym swaps). A
34
+ * same-meaning paraphrase like "The team migrated auth to OAuth2 last
35
+ * Friday" vs "Last Friday the team completed the auth migration to
36
+ * OAuth2" lands around 0.5 in 3-gram cosine but near 0.85 in
37
+ * SequenceMatcher. Consequence: merges will fragment more than Thoth
38
+ * did. This is the SAFE trade-off — fragmentation > false merges — but
39
+ * operators should tune via env var once they have real data.
40
+ */
41
+ export const MERGE_SCORE_NORMAL = parseEnvFloat(
42
+ "CLAWMEM_MERGE_SCORE_NORMAL",
43
+ 0.93
44
+ );
45
+
46
+ /**
47
+ * STRICT threshold: applies in the strictest-default path (both sides
48
+ * have zero anchors — no canonical entities, no proper-noun anchors).
49
+ * Overridable via `CLAWMEM_MERGE_SCORE_STRICT`.
50
+ *
51
+ * Non-strictest-default paths use the hard-reject rule on materially
52
+ * different anchors, not this threshold.
53
+ */
54
+ export const MERGE_SCORE_STRICT = parseEnvFloat(
55
+ "CLAWMEM_MERGE_SCORE_STRICT",
56
+ 0.98
57
+ );
58
+
59
+ function parseEnvFloat(name: string, fallback: number): number {
60
+ const raw = process.env[name];
61
+ if (raw === undefined) return fallback;
62
+ const n = Number.parseFloat(raw);
63
+ if (!Number.isFinite(n) || n < 0 || n > 1) return fallback;
64
+ return n;
65
+ }
66
+
67
+ // =============================================================================
68
+ // Anchor extraction (entity-first, lexical fallback)
69
+ // =============================================================================
70
+
71
/** How the anchor set for a group of source documents was obtained. */
export type AnchorSource = "entity_mentions" | "lexical_fallback";

/** Result of entity-anchor extraction over a set of source docs. */
export interface ExtractedAnchors {
  // Canonical entity IDs from `entity_mentions`; always empty when
  // `method` is "lexical_fallback" — the caller then extracts lexical
  // anchors from the raw text itself.
  entities: string[];
  // Extraction path taken; callers branch on this so both sides of a
  // comparison use the same kind of anchors.
  method: AnchorSource;
}
77
+
78
+ /**
79
+ * Get canonical entity IDs referenced by a set of source documents.
80
+ * Returns `{ entities: [], method: 'lexical_fallback' }` when no entity
81
+ * mentions exist for any of the given docs — the caller should then
82
+ * fall back to lexical anchor extraction over the raw text.
83
+ */
84
+ export function extractSourceDocEntities(
85
+ store: Store,
86
+ sourceDocIds: number[]
87
+ ): ExtractedAnchors {
88
+ if (sourceDocIds.length === 0) {
89
+ return { entities: [], method: "lexical_fallback" };
90
+ }
91
+
92
+ const placeholders = sourceDocIds.map(() => "?").join(",");
93
+ let rows: { entity_id: string }[];
94
+ try {
95
+ rows = store.db
96
+ .prepare(
97
+ `SELECT DISTINCT entity_id FROM entity_mentions WHERE doc_id IN (${placeholders})`
98
+ )
99
+ .all(...sourceDocIds) as { entity_id: string }[];
100
+ } catch {
101
+ return { entities: [], method: "lexical_fallback" };
102
+ }
103
+
104
+ if (rows.length === 0) {
105
+ return { entities: [], method: "lexical_fallback" };
106
+ }
107
+
108
+ return {
109
+ entities: rows.map((r) => r.entity_id),
110
+ method: "entity_mentions",
111
+ };
112
+ }
113
+
114
+ /**
115
+ * Extract lexical subject anchors from raw text.
116
+ *
117
+ * Heuristic: capitalized tokens that are not common sentence-start words.
118
+ * This is the fallback when `entity_mentions` is empty (the doc has not
119
+ * been through entity enrichment yet, or is from the pre-entity era).
120
+ */
121
+ export function extractSubjectAnchorsLexical(text: string): string[] {
122
+ if (!text) return [];
123
+
124
+ // Match capitalized tokens: CamelCase, UPPERCASE, Capitalized.
125
+ // Minimum 2 chars to avoid matching stray initials at sentence start.
126
+ const matches = text.match(/\b[A-Z][a-zA-Z0-9]{1,}\b/g) || [];
127
+
128
+ // Filter common sentence-start capitalized words that aren't proper nouns
129
+ const stopwords = new Set<string>([
130
+ "the", "a", "an", "this", "that", "these", "those",
131
+ "it", "we", "i", "he", "she", "they", "you", "me", "him", "her", "us", "them",
132
+ "and", "but", "or", "not", "is", "was", "are", "were",
133
+ "be", "been", "being", "have", "has", "had",
134
+ "do", "does", "did", "will", "would", "should", "could", "can",
135
+ "may", "might", "must", "shall",
136
+ "in", "on", "at", "to", "for", "of", "with", "by", "from",
137
+ "if", "then", "else", "when", "while", "where", "how", "why",
138
+ "all", "any", "some", "no", "one", "two",
139
+ ]);
140
+
141
+ const normalized = new Set<string>();
142
+ for (const token of matches) {
143
+ const lower = token.toLowerCase();
144
+ if (lower.length >= 2 && !stopwords.has(lower)) {
145
+ normalized.add(lower);
146
+ }
147
+ }
148
+
149
+ return [...normalized];
150
+ }
151
+
152
+ // =============================================================================
153
+ // Normalized character 3-gram cosine similarity
154
+ // =============================================================================
155
+
156
+ /**
157
+ * Character 3-gram cosine similarity.
158
+ *
159
+ * Robust to word-level permutation and punctuation; catches near-duplicate
160
+ * statements that differ only in wording or whitespace. Returns 0.0..1.0.
161
+ *
162
+ * Chosen over Jaccard (used as the cheap first-stage filter) because
163
+ * 3-gram cosine is tighter on paraphrase detection — it distinguishes
164
+ * "Dan visited Paris" from "Dad visited Paris" while the Jaccard over
165
+ * long-word sets would treat both as near-duplicates.
166
+ */
167
+ export function normalizedCosine3Gram(a: string, b: string): number {
168
+ const na = normalizeForTrigram(a);
169
+ const nb = normalizeForTrigram(b);
170
+
171
+ if (na.length === 0 || nb.length === 0) return 0;
172
+ if (na === nb) return 1.0;
173
+
174
+ const ta = trigramCounts(na);
175
+ const tb = trigramCounts(nb);
176
+
177
+ let dot = 0;
178
+ for (const [gram, count] of ta) {
179
+ const other = tb.get(gram);
180
+ if (other) dot += count * other;
181
+ }
182
+
183
+ const ma = magnitude(ta);
184
+ const mb = magnitude(tb);
185
+ if (ma === 0 || mb === 0) return 0;
186
+
187
+ return dot / (ma * mb);
188
+ }
189
+
190
+ function normalizeForTrigram(s: string): string {
191
+ return s
192
+ .toLowerCase()
193
+ .replace(/[^a-z0-9 ]+/g, " ")
194
+ .replace(/\s+/g, " ")
195
+ .trim();
196
+ }
197
+
198
+ function trigramCounts(s: string): Map<string, number> {
199
+ const out = new Map<string, number>();
200
+ if (s.length < 3) {
201
+ out.set(s, 1);
202
+ return out;
203
+ }
204
+ for (let i = 0; i <= s.length - 3; i++) {
205
+ const gram = s.slice(i, i + 3);
206
+ out.set(gram, (out.get(gram) || 0) + 1);
207
+ }
208
+ return out;
209
+ }
210
+
211
+ function magnitude(m: Map<string, number>): number {
212
+ let sum = 0;
213
+ for (const v of m.values()) sum += v * v;
214
+ return Math.sqrt(sum);
215
+ }
216
+
217
+ // =============================================================================
218
+ // Anchor set comparison
219
+ // =============================================================================
220
+
221
+ /**
222
+ * Determine whether two anchor sets "materially differ".
223
+ *
224
+ * Rules (all case-insensitive):
225
+ * 1. Either side empty → NOT materially different (caller handles via
226
+ * strictest-default path).
227
+ * 2. One set is a subset of the other → NOT materially different
228
+ * (allows `"Bob"` ↔ `"Bob Smith"`).
229
+ * 3. Intersection empty → materially different (`"Dan"` vs `"Dad"`).
230
+ * 4. Partial overlap → materially different when AT MOST half of the
231
+ * smaller set is shared (boundary `≤ 0.5` treated as material to
232
+ * fence off primary-subject mismatches like
233
+ * `[alice, auth-service]` vs `[bob, auth-service]` where the only
234
+ * shared anchor is the context, not the subject).
235
+ */
236
+ export function anchorSetsMateriallyDiffer(a: string[], b: string[]): boolean {
237
+ if (a.length === 0 || b.length === 0) return false;
238
+
239
+ const setA = new Set(a.map((x) => x.toLowerCase()));
240
+ const setB = new Set(b.map((x) => x.toLowerCase()));
241
+
242
+ const aSubB = [...setA].every((x) => setB.has(x));
243
+ const bSubA = [...setB].every((x) => setA.has(x));
244
+ if (aSubB || bSubA) return false;
245
+
246
+ const intersect = [...setA].filter((x) => setB.has(x));
247
+ if (intersect.length === 0) return true;
248
+
249
+ const smaller = Math.min(setA.size, setB.size);
250
+ return intersect.length / smaller <= 0.5;
251
+ }
252
+
253
+ // =============================================================================
254
+ // Merge safety gate
255
+ // =============================================================================
256
+
257
/** Which anchor-comparison path the gate took for a given decision. */
export type MergeSafetyMethod = "entity_aware" | "lexical_only" | "strictest_default";

/** Outcome of the merge safety gate, detailed enough for operator logs. */
export interface MergeSafetyResult {
  // Whether the merge may proceed.
  accepted: boolean;
  // Normalized character 3-gram cosine of the two observation texts.
  score: number;
  // Threshold the decision was reported against. NOTE: on a hard reject
  // this is logged as the STRICT value for operator visibility only — the
  // decision there was not threshold-gated.
  threshold: number;
  // Human-readable explanation of the decision.
  reason: string;
  // Anchor-comparison path that produced the decision.
  method: MergeSafetyMethod;
}
266
+
267
+ /**
268
+ * Merge safety gate.
269
+ *
270
+ * Flow:
271
+ * 1. Compute normalized character 3-gram cosine similarity between the
272
+ * candidate and existing observation texts.
273
+ * 2. Extract anchor sets for both sides. Entity-aware first
274
+ * (`entity_mentions`), lexical fallback otherwise. If EITHER side
275
+ * lacks `entity_mentions` coverage, both sides fall back to lexical
276
+ * so the comparison is apples-to-apples.
277
+ * 3. Decide:
278
+ * - Both anchor sets empty (strictest default) → accept iff
279
+ * `score >= MERGE_SCORE_STRICT`.
280
+ * - Anchors materially differ → **HARD REJECT regardless of text
281
+ * similarity**. This is the primary safety goal: two observations
282
+ * whose canonical subjects differ are never the same observation,
283
+ * even if the LLM emits identical wording. Historically the gate
284
+ * upgraded to a stricter threshold instead of hard-rejecting, but
285
+ * that allowed merges at score 1.0 when the LLM emitted templated
286
+ * text with no subject name.
287
+ * - Anchors compatible (subset or high overlap) → accept iff
288
+ * `score >= MERGE_SCORE_NORMAL`.
289
+ */
290
+ export function passesMergeSafety(
291
+ store: Store,
292
+ candidateText: string,
293
+ candidateSourceDocIds: number[],
294
+ existingText: string,
295
+ existingSourceDocIds: number[]
296
+ ): MergeSafetyResult {
297
+ const score = normalizedCosine3Gram(candidateText, existingText);
298
+
299
+ const candEnt = extractSourceDocEntities(store, candidateSourceDocIds);
300
+ const existEnt = extractSourceDocEntities(store, existingSourceDocIds);
301
+
302
+ // Use entity-aware path only when BOTH sides have entity mentions —
303
+ // otherwise the comparison is apples-to-oranges (one side is a set of
304
+ // canonical IDs, the other is a set of lexical tokens).
305
+ const bothEntity =
306
+ candEnt.method === "entity_mentions" && existEnt.method === "entity_mentions";
307
+
308
+ let anchorsA: string[];
309
+ let anchorsB: string[];
310
+ let method: MergeSafetyMethod;
311
+
312
+ if (bothEntity) {
313
+ anchorsA = candEnt.entities;
314
+ anchorsB = existEnt.entities;
315
+ method = "entity_aware";
316
+ } else {
317
+ anchorsA = extractSubjectAnchorsLexical(candidateText);
318
+ anchorsB = extractSubjectAnchorsLexical(existingText);
319
+ method = "lexical_only";
320
+ }
321
+
322
+ // Strictest default: both sides empty → no subject signal at all
323
+ if (anchorsA.length === 0 && anchorsB.length === 0) {
324
+ const threshold = MERGE_SCORE_STRICT;
325
+ const accepted = score >= threshold;
326
+ return {
327
+ accepted,
328
+ score,
329
+ threshold,
330
+ reason: accepted
331
+ ? `strictest-default met (${score.toFixed(3)} >= ${threshold})`
332
+ : `strictest-default unmet (${score.toFixed(3)} < ${threshold})`,
333
+ method: "strictest_default",
334
+ };
335
+ }
336
+
337
+ // Hard reject on materially different anchors — this is the primary
338
+ // safety goal of the extraction. Applies to BOTH entity_aware and
339
+ // lexical_only modes so the policy is uniform.
340
+ if (anchorSetsMateriallyDiffer(anchorsA, anchorsB)) {
341
+ return {
342
+ accepted: false,
343
+ score,
344
+ // Reported threshold is STRICT only for operator logging; the
345
+ // decision was hard-reject, not threshold-gated.
346
+ threshold: MERGE_SCORE_STRICT,
347
+ reason: `${method} materially different anchors — hard reject (score=${score.toFixed(3)})`,
348
+ method,
349
+ };
350
+ }
351
+
352
+ // Compatible anchors (subset or high overlap): gate on text similarity
353
+ const threshold = MERGE_SCORE_NORMAL;
354
+ const accepted = score >= threshold;
355
+ return {
356
+ accepted,
357
+ score,
358
+ threshold,
359
+ reason: accepted
360
+ ? `${method} aligned anchors — ${score.toFixed(3)} >= ${threshold}`
361
+ : `${method} aligned anchors — ${score.toFixed(3)} < ${threshold}`,
362
+ method,
363
+ };
364
+ }