clawmem 0.7.0 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,266 @@
1
+ /**
2
+ * Contradiction-aware merge gate (Ext 2).
3
+ *
4
+ * LLM-first contradiction check with heuristic fallback. Returns a
5
+ * structured `ContradictionResult` that downstream merge code uses to
6
+ * decide whether to merge, supersede, or link two observations.
7
+ *
8
+ * Flow:
9
+ * 1. `llmContradictionCheck` — structured LLM classification; returns
10
+ * null on LLM cooldown, network failure, malformed JSON, or missing
11
+ * `contradictory` field.
12
+ * 2. `heuristicContradictionCheck` — deterministic signal on
13
+ * negation asymmetry or number/date mismatch. Used as fallback when
14
+ * the LLM path returns null.
15
+ * 3. `checkContradiction` — orchestrator. Runs LLM first, falls back
16
+ * to heuristic on null. Never throws. Always returns a usable
17
+ * `ContradictionResult`.
18
+ *
19
+ * Adapted from Thoth `tools/memory_tool.py:111-184` contradiction-check
20
+ * pattern (THOTH_EXTRACTION_PLAN.md Extraction 2).
21
+ *
22
+ * Reuses the A-MEM convention relation type `'contradicts'` (plural) —
23
+ * see P0 taxonomy guard at `tests/unit/contradict-taxonomy.test.ts`.
24
+ */
25
+
26
+ import type { LLM } from "./llm.ts";
27
+ import { extractJsonFromLLM } from "./amem.ts";
28
+
29
// =============================================================================
// Types
// =============================================================================

/** Which detector produced a `ContradictionResult`. */
export type ContradictionSource = "llm" | "heuristic" | "unknown";

/** Structured verdict returned by every contradiction-check path. */
export interface ContradictionResult {
  /** True when the two statements are judged mutually exclusive. */
  contradictory: boolean;
  /** Detector confidence in the verdict. */
  confidence: number; // 0.0 - 1.0
  /** Optional human-readable justification, for logs and operators. */
  reason?: string;
  /** Which path produced this verdict. */
  source: ContradictionSource;
}

/**
 * Phase-2 contradiction handling policy. `link` (default) preserves
 * both rows as active and sets `invalidated_by` as a backlink for
 * operator queries. `supersede` additionally sets `invalidated_at` on
 * the old row so it stops surfacing in active recalls.
 */
export type ContradictionPolicy = "link" | "supersede";
49
+
50
+ export function resolveContradictionPolicy(): ContradictionPolicy {
51
+ const raw = process.env.CLAWMEM_CONTRADICTION_POLICY;
52
+ if (raw === "supersede") return "supersede";
53
+ return "link"; // default
54
+ }
55
+
56
+ /**
57
+ * Minimum LLM contradiction confidence to act on. Lower scores are
58
+ * treated as inconclusive and the merge proceeds (conservative: only
59
+ * block merges on clear contradictions). Overridable via
60
+ * `CLAWMEM_CONTRADICTION_MIN_CONFIDENCE` env var (0.0 - 1.0).
61
+ */
62
+ export const CONTRADICTION_MIN_CONFIDENCE = parseEnvFloat(
63
+ "CLAWMEM_CONTRADICTION_MIN_CONFIDENCE",
64
+ 0.5
65
+ );
66
+
67
+ function parseEnvFloat(name: string, fallback: number): number {
68
+ const raw = process.env[name];
69
+ if (raw === undefined) return fallback;
70
+ const n = Number.parseFloat(raw);
71
+ if (!Number.isFinite(n) || n < 0 || n > 1) return fallback;
72
+ return n;
73
+ }
74
+
75
+ // =============================================================================
76
+ // Heuristic contradiction detection (deterministic, no LLM)
77
+ // =============================================================================
78
+
79
+ /**
80
+ * Deterministic heuristic contradiction check.
81
+ *
82
+ * Signals:
83
+ * - **Negation asymmetry:** one side has an explicit negation token
84
+ * (`not`, `never`, `no`, `didn't`, etc.) and the other doesn't.
85
+ * - **Number/date mismatch:** both sides cite numbers or dates but the
86
+ * sets have no shared values.
87
+ *
88
+ * Intentionally conservative: returns `contradictory=false,
89
+ * confidence=0` when no signal is found, leaving the decision to the
90
+ * LLM or the caller's default.
91
+ */
92
+ export function heuristicContradictionCheck(
93
+ a: string,
94
+ b: string
95
+ ): ContradictionResult {
96
+ const negA = hasNegation(a);
97
+ const negB = hasNegation(b);
98
+
99
+ // Negation asymmetry: one side explicitly negates, the other doesn't
100
+ if (negA !== negB) {
101
+ return {
102
+ contradictory: true,
103
+ confidence: 0.6,
104
+ reason: "negation asymmetry — one statement has explicit negation",
105
+ source: "heuristic",
106
+ };
107
+ }
108
+
109
+ const numsA = extractNumbers(a);
110
+ const numsB = extractNumbers(b);
111
+
112
+ // Number/date mismatch: both cite numbers but no shared values
113
+ if (numsA.length > 0 && numsB.length > 0) {
114
+ const setA = new Set(numsA);
115
+ const setB = new Set(numsB);
116
+ const shared = [...setA].filter((n) => setB.has(n));
117
+ if (shared.length === 0) {
118
+ return {
119
+ contradictory: true,
120
+ confidence: 0.5,
121
+ reason: `number/date mismatch (A=${numsA.join(",")} B=${numsB.join(",")})`,
122
+ source: "heuristic",
123
+ };
124
+ }
125
+ }
126
+
127
+ // No heuristic signal
128
+ return {
129
+ contradictory: false,
130
+ confidence: 0.0,
131
+ reason: "no heuristic signal",
132
+ source: "heuristic",
133
+ };
134
+ }
135
+
136
+ /**
137
+ * Extract standalone integers, decimals, and ISO-ish dates from a
138
+ * string as a normalized set of numeric tokens.
139
+ */
140
+ function extractNumbers(s: string): string[] {
141
+ // Matches: integers, decimals (1.5, 1,000), ISO dates (2026-04-10),
142
+ // US dates (04/10/2026), version strings (v0.7.1 → 0.7.1)
143
+ const matches = s.match(/\b\d{1,5}(?:[.,/-]\d{1,5}){0,2}\b/g) || [];
144
+ return matches.map((m) => m.replace(/,/g, ""));
145
+ }
146
+
147
+ /**
148
+ * Return true if the string contains an explicit negation token.
149
+ * Matches English contractions (didn't, won't, cannot, etc.) plus
150
+ * bare negations (not, never, no).
151
+ */
152
+ function hasNegation(s: string): boolean {
153
+ return /\b(not|never|no|don['\u2019]t|didn['\u2019]t|won['\u2019]t|cannot|can['\u2019]t|wasn['\u2019]t|isn['\u2019]t|aren['\u2019]t|weren['\u2019]t|shouldn['\u2019]t|couldn['\u2019]t|wouldn['\u2019]t)\b/i.test(
154
+ s
155
+ );
156
+ }
157
+
158
// =============================================================================
// LLM-based contradiction detection
// =============================================================================

// Prompt template for structured contradiction classification.
// Placeholders: {A} and {B} are the two statements; {CONTEXT} is replaced by
// either an empty string or a "\n\nContext:\n..." suffix appended after
// statement B (see llmContradictionCheck). The template demands a bare JSON
// object so the reply can be parsed by `extractJsonFromLLM`. The trailing
// "/no_think" appears to be a model-side directive to skip chain-of-thought —
// NOTE(review): confirm against the target model's prompt conventions.
const CONTRADICTION_PROMPT_TEMPLATE = `You are a logic checker. Determine whether two statements contradict each other.

Statement A: {A}

Statement B: {B}{CONTEXT}

A contradiction exists if one statement directly denies the other, or if both cannot be true at the same time. Subtle differences in specificity (e.g. "Bob" vs "Bob Smith") are NOT contradictions. Different dates, counts, outcomes, or decisions on the same subject ARE contradictions.

Respond with ONLY a JSON object:
{"contradictory": true|false, "confidence": 0.0-1.0, "reason": "brief explanation"}

Do not include any other text. /no_think`;
174
+
175
+ /**
176
+ * LLM-based contradiction classifier.
177
+ *
178
+ * Returns `null` on any of:
179
+ * - LLM generate call throws
180
+ * - LLM returns null (cooldown, timeout, remote LLM down)
181
+ * - LLM returns text but JSON extraction fails
182
+ * - Parsed JSON is missing a boolean `contradictory` field
183
+ *
184
+ * Callers should fall back to the heuristic path on null.
185
+ */
186
+ export async function llmContradictionCheck(
187
+ llm: LLM,
188
+ a: string,
189
+ b: string,
190
+ context?: string
191
+ ): Promise<ContradictionResult | null> {
192
+ const prompt = CONTRADICTION_PROMPT_TEMPLATE.replace("{A}", a)
193
+ .replace("{B}", b)
194
+ .replace("{CONTEXT}", context ? `\n\nContext:\n${context}` : "");
195
+
196
+ let result;
197
+ try {
198
+ result = await llm.generate(prompt, { temperature: 0.2, maxTokens: 150 });
199
+ } catch {
200
+ return null;
201
+ }
202
+
203
+ if (!result?.text) return null;
204
+
205
+ const parsed = extractJsonFromLLM(result.text) as {
206
+ contradictory?: unknown;
207
+ confidence?: unknown;
208
+ reason?: unknown;
209
+ } | null;
210
+
211
+ if (!parsed || typeof parsed.contradictory !== "boolean") return null;
212
+
213
+ const confidence =
214
+ typeof parsed.confidence === "number" && Number.isFinite(parsed.confidence)
215
+ ? Math.max(0, Math.min(1, parsed.confidence))
216
+ : 0.5;
217
+
218
+ return {
219
+ contradictory: parsed.contradictory,
220
+ confidence,
221
+ reason: typeof parsed.reason === "string" ? parsed.reason : undefined,
222
+ source: "llm",
223
+ };
224
+ }
225
+
226
+ // =============================================================================
227
+ // Orchestrator
228
+ // =============================================================================
229
+
230
+ /**
231
+ * Orchestrated contradiction check.
232
+ *
233
+ * 1. Try LLM path; if it returns a usable result, use it.
234
+ * 2. Otherwise fall back to the deterministic heuristic.
235
+ *
236
+ * Never throws. Always returns a `ContradictionResult`. When the
237
+ * result's `source` is `heuristic` and `contradictory=false`, the
238
+ * caller knows the check is inconclusive and should proceed with the
239
+ * default merge path.
240
+ */
241
+ export async function checkContradiction(
242
+ llm: LLM,
243
+ a: string,
244
+ b: string,
245
+ context?: string
246
+ ): Promise<ContradictionResult> {
247
+ const llmResult = await llmContradictionCheck(llm, a, b, context);
248
+ if (llmResult) return llmResult;
249
+ return heuristicContradictionCheck(a, b);
250
+ }
251
+
252
+ /**
253
+ * Apply the `CONTRADICTION_MIN_CONFIDENCE` threshold to a
254
+ * `ContradictionResult` — returns true iff the result claims a
255
+ * contradiction AND meets the confidence floor.
256
+ *
257
+ * Callers use this to decide whether to block a merge. Keeping the
258
+ * threshold check centralized means operators can tune via env var
259
+ * without touching the merge code.
260
+ */
261
+ export function isActionableContradiction(result: ContradictionResult): boolean {
262
+ return (
263
+ result.contradictory === true &&
264
+ result.confidence >= CONTRADICTION_MIN_CONFIDENCE
265
+ );
266
+ }
@@ -0,0 +1,364 @@
1
+ /**
2
+ * Text similarity + merge safety gate for consolidation.
3
+ *
4
+ * Prevents semantic collision between topics that share vocabulary but
5
+ * refer to different subjects (e.g., two observations about "Dan" vs
6
+ * "Dad" or "Bob" vs "Rob"). Adds a dual-threshold safety check after
7
+ * the cheap Jaccard candidate-generation step.
8
+ *
9
+ * Entity-aware first: uses `entity_mentions` when both sides have canonical
10
+ * entities resolved. Lexical fallback via proper-noun anchor regex when
11
+ * either side lacks entity state. Strictest default when both sides are
12
+ * empty (no anchors at all).
13
+ *
14
+ * Adapted from Thoth `dream_cycle.py:218-272` subject-name guard
15
+ * (THOTH_EXTRACTION_PLAN.md Extraction 3).
16
+ */
17
+
18
+ import type { Store } from "./store.ts";
19
+
20
+ // =============================================================================
21
+ // Config — dual-threshold merge safety
22
+ // =============================================================================
23
+
24
+ /**
25
+ * NORMAL threshold: applies when anchor sets are compatible (subset or
26
+ * high overlap — same primary subject) AND the gate is gating on text
27
+ * similarity alone. Overridable via `CLAWMEM_MERGE_SCORE_NORMAL` env var
28
+ * for operator calibration during rollout.
29
+ *
30
+ * ⚠ Threshold is inherited from Thoth's `dream_cycle.py:218-272` guard,
31
+ * which uses Python's `difflib.SequenceMatcher` (character-level LCS).
32
+ * ClawMem uses normalized character 3-gram cosine, which is systematically
33
+ * harsher on benign rephrasings (word-order changes, synonym swaps). A
34
+ * same-meaning paraphrase like "The team migrated auth to OAuth2 last
35
+ * Friday" vs "Last Friday the team completed the auth migration to
36
+ * OAuth2" lands around 0.5 in 3-gram cosine but near 0.85 in
37
+ * SequenceMatcher. Consequence: merges will fragment more than Thoth
38
+ * did. This is the SAFE trade-off — fragmentation > false merges — but
39
+ * operators should tune via env var once they have real data.
40
+ */
41
+ export const MERGE_SCORE_NORMAL = parseEnvFloat(
42
+ "CLAWMEM_MERGE_SCORE_NORMAL",
43
+ 0.93
44
+ );
45
+
46
+ /**
47
+ * STRICT threshold: applies in the strictest-default path (both sides
48
+ * have zero anchors — no canonical entities, no proper-noun anchors).
49
+ * Overridable via `CLAWMEM_MERGE_SCORE_STRICT`.
50
+ *
51
+ * Non-strictest-default paths use the hard-reject rule on materially
52
+ * different anchors, not this threshold.
53
+ */
54
+ export const MERGE_SCORE_STRICT = parseEnvFloat(
55
+ "CLAWMEM_MERGE_SCORE_STRICT",
56
+ 0.98
57
+ );
58
+
59
+ function parseEnvFloat(name: string, fallback: number): number {
60
+ const raw = process.env[name];
61
+ if (raw === undefined) return fallback;
62
+ const n = Number.parseFloat(raw);
63
+ if (!Number.isFinite(n) || n < 0 || n > 1) return fallback;
64
+ return n;
65
+ }
66
+
67
// =============================================================================
// Anchor extraction (entity-first, lexical fallback)
// =============================================================================

/** How a set of subject anchors was obtained. */
export type AnchorSource = "entity_mentions" | "lexical_fallback";

/** Result of anchor extraction: the anchors plus the method that produced them. */
export interface ExtractedAnchors {
  /** Canonical entity IDs from `entity_mentions`; empty when none resolved. */
  entities: string[];
  /** `entity_mentions` when IDs came from the DB; `lexical_fallback` otherwise. */
  method: AnchorSource;
}
77
+
78
+ /**
79
+ * Get canonical entity IDs referenced by a set of source documents.
80
+ * Returns `{ entities: [], method: 'lexical_fallback' }` when no entity
81
+ * mentions exist for any of the given docs — the caller should then
82
+ * fall back to lexical anchor extraction over the raw text.
83
+ */
84
+ export function extractSourceDocEntities(
85
+ store: Store,
86
+ sourceDocIds: number[]
87
+ ): ExtractedAnchors {
88
+ if (sourceDocIds.length === 0) {
89
+ return { entities: [], method: "lexical_fallback" };
90
+ }
91
+
92
+ const placeholders = sourceDocIds.map(() => "?").join(",");
93
+ let rows: { entity_id: string }[];
94
+ try {
95
+ rows = store.db
96
+ .prepare(
97
+ `SELECT DISTINCT entity_id FROM entity_mentions WHERE doc_id IN (${placeholders})`
98
+ )
99
+ .all(...sourceDocIds) as { entity_id: string }[];
100
+ } catch {
101
+ return { entities: [], method: "lexical_fallback" };
102
+ }
103
+
104
+ if (rows.length === 0) {
105
+ return { entities: [], method: "lexical_fallback" };
106
+ }
107
+
108
+ return {
109
+ entities: rows.map((r) => r.entity_id),
110
+ method: "entity_mentions",
111
+ };
112
+ }
113
+
114
+ /**
115
+ * Extract lexical subject anchors from raw text.
116
+ *
117
+ * Heuristic: capitalized tokens that are not common sentence-start words.
118
+ * This is the fallback when `entity_mentions` is empty (the doc has not
119
+ * been through entity enrichment yet, or is from the pre-entity era).
120
+ */
121
+ export function extractSubjectAnchorsLexical(text: string): string[] {
122
+ if (!text) return [];
123
+
124
+ // Match capitalized tokens: CamelCase, UPPERCASE, Capitalized.
125
+ // Minimum 2 chars to avoid matching stray initials at sentence start.
126
+ const matches = text.match(/\b[A-Z][a-zA-Z0-9]{1,}\b/g) || [];
127
+
128
+ // Filter common sentence-start capitalized words that aren't proper nouns
129
+ const stopwords = new Set<string>([
130
+ "the", "a", "an", "this", "that", "these", "those",
131
+ "it", "we", "i", "he", "she", "they", "you", "me", "him", "her", "us", "them",
132
+ "and", "but", "or", "not", "is", "was", "are", "were",
133
+ "be", "been", "being", "have", "has", "had",
134
+ "do", "does", "did", "will", "would", "should", "could", "can",
135
+ "may", "might", "must", "shall",
136
+ "in", "on", "at", "to", "for", "of", "with", "by", "from",
137
+ "if", "then", "else", "when", "while", "where", "how", "why",
138
+ "all", "any", "some", "no", "one", "two",
139
+ ]);
140
+
141
+ const normalized = new Set<string>();
142
+ for (const token of matches) {
143
+ const lower = token.toLowerCase();
144
+ if (lower.length >= 2 && !stopwords.has(lower)) {
145
+ normalized.add(lower);
146
+ }
147
+ }
148
+
149
+ return [...normalized];
150
+ }
151
+
152
+ // =============================================================================
153
+ // Normalized character 3-gram cosine similarity
154
+ // =============================================================================
155
+
156
+ /**
157
+ * Character 3-gram cosine similarity.
158
+ *
159
+ * Robust to word-level permutation and punctuation; catches near-duplicate
160
+ * statements that differ only in wording or whitespace. Returns 0.0..1.0.
161
+ *
162
+ * Chosen over Jaccard (used as the cheap first-stage filter) because
163
+ * 3-gram cosine is tighter on paraphrase detection — it distinguishes
164
+ * "Dan visited Paris" from "Dad visited Paris" while the Jaccard over
165
+ * long-word sets would treat both as near-duplicates.
166
+ */
167
+ export function normalizedCosine3Gram(a: string, b: string): number {
168
+ const na = normalizeForTrigram(a);
169
+ const nb = normalizeForTrigram(b);
170
+
171
+ if (na.length === 0 || nb.length === 0) return 0;
172
+ if (na === nb) return 1.0;
173
+
174
+ const ta = trigramCounts(na);
175
+ const tb = trigramCounts(nb);
176
+
177
+ let dot = 0;
178
+ for (const [gram, count] of ta) {
179
+ const other = tb.get(gram);
180
+ if (other) dot += count * other;
181
+ }
182
+
183
+ const ma = magnitude(ta);
184
+ const mb = magnitude(tb);
185
+ if (ma === 0 || mb === 0) return 0;
186
+
187
+ return dot / (ma * mb);
188
+ }
189
+
190
+ function normalizeForTrigram(s: string): string {
191
+ return s
192
+ .toLowerCase()
193
+ .replace(/[^a-z0-9 ]+/g, " ")
194
+ .replace(/\s+/g, " ")
195
+ .trim();
196
+ }
197
+
198
+ function trigramCounts(s: string): Map<string, number> {
199
+ const out = new Map<string, number>();
200
+ if (s.length < 3) {
201
+ out.set(s, 1);
202
+ return out;
203
+ }
204
+ for (let i = 0; i <= s.length - 3; i++) {
205
+ const gram = s.slice(i, i + 3);
206
+ out.set(gram, (out.get(gram) || 0) + 1);
207
+ }
208
+ return out;
209
+ }
210
+
211
+ function magnitude(m: Map<string, number>): number {
212
+ let sum = 0;
213
+ for (const v of m.values()) sum += v * v;
214
+ return Math.sqrt(sum);
215
+ }
216
+
217
+ // =============================================================================
218
+ // Anchor set comparison
219
+ // =============================================================================
220
+
221
+ /**
222
+ * Determine whether two anchor sets "materially differ".
223
+ *
224
+ * Rules (all case-insensitive):
225
+ * 1. Either side empty → NOT materially different (caller handles via
226
+ * strictest-default path).
227
+ * 2. One set is a subset of the other → NOT materially different
228
+ * (allows `"Bob"` ↔ `"Bob Smith"`).
229
+ * 3. Intersection empty → materially different (`"Dan"` vs `"Dad"`).
230
+ * 4. Partial overlap → materially different when AT MOST half of the
231
+ * smaller set is shared (boundary `≤ 0.5` treated as material to
232
+ * fence off primary-subject mismatches like
233
+ * `[alice, auth-service]` vs `[bob, auth-service]` where the only
234
+ * shared anchor is the context, not the subject).
235
+ */
236
+ export function anchorSetsMateriallyDiffer(a: string[], b: string[]): boolean {
237
+ if (a.length === 0 || b.length === 0) return false;
238
+
239
+ const setA = new Set(a.map((x) => x.toLowerCase()));
240
+ const setB = new Set(b.map((x) => x.toLowerCase()));
241
+
242
+ const aSubB = [...setA].every((x) => setB.has(x));
243
+ const bSubA = [...setB].every((x) => setA.has(x));
244
+ if (aSubB || bSubA) return false;
245
+
246
+ const intersect = [...setA].filter((x) => setB.has(x));
247
+ if (intersect.length === 0) return true;
248
+
249
+ const smaller = Math.min(setA.size, setB.size);
250
+ return intersect.length / smaller <= 0.5;
251
+ }
252
+
253
// =============================================================================
// Merge safety gate
// =============================================================================

/** Which comparison path the gate used for its decision. */
export type MergeSafetyMethod = "entity_aware" | "lexical_only" | "strictest_default";

/** Outcome of the merge safety gate, with detail for operator logging. */
export interface MergeSafetyResult {
  /** True when the merge may proceed. */
  accepted: boolean;
  /** Normalized character 3-gram cosine similarity of the two texts (0..1). */
  score: number;
  /** Threshold the score was gated on (reporting-only on hard rejects). */
  threshold: number;
  /** Human-readable explanation of the decision. */
  reason: string;
  /** Comparison path used. */
  method: MergeSafetyMethod;
}
266
+
267
+ /**
268
+ * Merge safety gate.
269
+ *
270
+ * Flow:
271
+ * 1. Compute normalized character 3-gram cosine similarity between the
272
+ * candidate and existing observation texts.
273
+ * 2. Extract anchor sets for both sides. Entity-aware first
274
+ * (`entity_mentions`), lexical fallback otherwise. If EITHER side
275
+ * lacks `entity_mentions` coverage, both sides fall back to lexical
276
+ * so the comparison is apples-to-apples.
277
+ * 3. Decide:
278
+ * - Both anchor sets empty (strictest default) → accept iff
279
+ * `score >= MERGE_SCORE_STRICT`.
280
+ * - Anchors materially differ → **HARD REJECT regardless of text
281
+ * similarity**. This is the primary safety goal: two observations
282
+ * whose canonical subjects differ are never the same observation,
283
+ * even if the LLM emits identical wording. Historically the gate
284
+ * upgraded to a stricter threshold instead of hard-rejecting, but
285
+ * that allowed merges at score 1.0 when the LLM emitted templated
286
+ * text with no subject name.
287
+ * - Anchors compatible (subset or high overlap) → accept iff
288
+ * `score >= MERGE_SCORE_NORMAL`.
289
+ */
290
+ export function passesMergeSafety(
291
+ store: Store,
292
+ candidateText: string,
293
+ candidateSourceDocIds: number[],
294
+ existingText: string,
295
+ existingSourceDocIds: number[]
296
+ ): MergeSafetyResult {
297
+ const score = normalizedCosine3Gram(candidateText, existingText);
298
+
299
+ const candEnt = extractSourceDocEntities(store, candidateSourceDocIds);
300
+ const existEnt = extractSourceDocEntities(store, existingSourceDocIds);
301
+
302
+ // Use entity-aware path only when BOTH sides have entity mentions —
303
+ // otherwise the comparison is apples-to-oranges (one side is a set of
304
+ // canonical IDs, the other is a set of lexical tokens).
305
+ const bothEntity =
306
+ candEnt.method === "entity_mentions" && existEnt.method === "entity_mentions";
307
+
308
+ let anchorsA: string[];
309
+ let anchorsB: string[];
310
+ let method: MergeSafetyMethod;
311
+
312
+ if (bothEntity) {
313
+ anchorsA = candEnt.entities;
314
+ anchorsB = existEnt.entities;
315
+ method = "entity_aware";
316
+ } else {
317
+ anchorsA = extractSubjectAnchorsLexical(candidateText);
318
+ anchorsB = extractSubjectAnchorsLexical(existingText);
319
+ method = "lexical_only";
320
+ }
321
+
322
+ // Strictest default: both sides empty → no subject signal at all
323
+ if (anchorsA.length === 0 && anchorsB.length === 0) {
324
+ const threshold = MERGE_SCORE_STRICT;
325
+ const accepted = score >= threshold;
326
+ return {
327
+ accepted,
328
+ score,
329
+ threshold,
330
+ reason: accepted
331
+ ? `strictest-default met (${score.toFixed(3)} >= ${threshold})`
332
+ : `strictest-default unmet (${score.toFixed(3)} < ${threshold})`,
333
+ method: "strictest_default",
334
+ };
335
+ }
336
+
337
+ // Hard reject on materially different anchors — this is the primary
338
+ // safety goal of the extraction. Applies to BOTH entity_aware and
339
+ // lexical_only modes so the policy is uniform.
340
+ if (anchorSetsMateriallyDiffer(anchorsA, anchorsB)) {
341
+ return {
342
+ accepted: false,
343
+ score,
344
+ // Reported threshold is STRICT only for operator logging; the
345
+ // decision was hard-reject, not threshold-gated.
346
+ threshold: MERGE_SCORE_STRICT,
347
+ reason: `${method} materially different anchors — hard reject (score=${score.toFixed(3)})`,
348
+ method,
349
+ };
350
+ }
351
+
352
+ // Compatible anchors (subset or high overlap): gate on text similarity
353
+ const threshold = MERGE_SCORE_NORMAL;
354
+ const accepted = score >= threshold;
355
+ return {
356
+ accepted,
357
+ score,
358
+ threshold,
359
+ reason: accepted
360
+ ? `${method} aligned anchors — ${score.toFixed(3)} >= ${threshold}`
361
+ : `${method} aligned anchors — ${score.toFixed(3)} < ${threshold}`,
362
+ method,
363
+ };
364
+ }