clawmem 0.7.0 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +11 -5
- package/CLAUDE.md +11 -5
- package/README.md +37 -5
- package/SKILL.md +17 -2
- package/package.json +1 -1
- package/src/clawmem.ts +30 -2
- package/src/consolidation.ts +514 -40
- package/src/conversation-synthesis.ts +637 -0
- package/src/deductive-guardrails.ts +481 -0
- package/src/hooks/context-surfacing.ts +181 -3
- package/src/merge-guards.ts +266 -0
- package/src/text-similarity.ts +364 -0
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Contradiction-aware merge gate (Ext 2).
|
|
3
|
+
*
|
|
4
|
+
* LLM-first contradiction check with heuristic fallback. Returns a
|
|
5
|
+
* structured `ContradictionResult` that downstream merge code uses to
|
|
6
|
+
* decide whether to merge, supersede, or link two observations.
|
|
7
|
+
*
|
|
8
|
+
* Flow:
|
|
9
|
+
* 1. `llmContradictionCheck` — structured LLM classification; returns
|
|
10
|
+
* null on LLM cooldown, network failure, malformed JSON, or missing
|
|
11
|
+
* `contradictory` field.
|
|
12
|
+
* 2. `heuristicContradictionCheck` — deterministic signal on
|
|
13
|
+
* negation asymmetry or number/date mismatch. Used as fallback when
|
|
14
|
+
* the LLM path returns null.
|
|
15
|
+
* 3. `checkContradiction` — orchestrator. Runs LLM first, falls back
|
|
16
|
+
* to heuristic on null. Never throws. Always returns a usable
|
|
17
|
+
* `ContradictionResult`.
|
|
18
|
+
*
|
|
19
|
+
* Adapted from Thoth `tools/memory_tool.py:111-184` contradiction-check
|
|
20
|
+
* pattern (THOTH_EXTRACTION_PLAN.md Extraction 2).
|
|
21
|
+
*
|
|
22
|
+
* Reuses the A-MEM convention relation type `'contradicts'` (plural) —
|
|
23
|
+
* see P0 taxonomy guard at `tests/unit/contradict-taxonomy.test.ts`.
|
|
24
|
+
*/
|
|
25
|
+
|
|
26
|
+
import type { LLM } from "./llm.ts";
|
|
27
|
+
import { extractJsonFromLLM } from "./amem.ts";
|
|
28
|
+
|
|
29
|
+
// =============================================================================
|
|
30
|
+
// Types
|
|
31
|
+
// =============================================================================
|
|
32
|
+
|
|
33
|
+
// Provenance of a contradiction verdict: "llm" = structured LLM
// classification, "heuristic" = deterministic fallback signal,
// "unknown" = neither path produced a verdict.
export type ContradictionSource = "llm" | "heuristic" | "unknown";

// Verdict returned by the contradiction checkers. Downstream merge code
// uses this to decide whether to merge, supersede, or link observations.
export interface ContradictionResult {
  contradictory: boolean;
  confidence: number; // 0.0 - 1.0
  reason?: string; // brief human-readable explanation, when available
  source: ContradictionSource;
}

/**
 * Phase-2 contradiction handling policy. `link` (default) preserves
 * both rows as active and sets `invalidated_by` as a backlink for
 * operator queries. `supersede` additionally sets `invalidated_at` on
 * the old row so it stops surfacing in active recalls.
 */
export type ContradictionPolicy = "link" | "supersede";
|
|
49
|
+
|
|
50
|
+
export function resolveContradictionPolicy(): ContradictionPolicy {
|
|
51
|
+
const raw = process.env.CLAWMEM_CONTRADICTION_POLICY;
|
|
52
|
+
if (raw === "supersede") return "supersede";
|
|
53
|
+
return "link"; // default
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
/**
|
|
57
|
+
* Minimum LLM contradiction confidence to act on. Lower scores are
|
|
58
|
+
* treated as inconclusive and the merge proceeds (conservative: only
|
|
59
|
+
* block merges on clear contradictions). Overridable via
|
|
60
|
+
* `CLAWMEM_CONTRADICTION_MIN_CONFIDENCE` env var (0.0 - 1.0).
|
|
61
|
+
*/
|
|
62
|
+
export const CONTRADICTION_MIN_CONFIDENCE = parseEnvFloat(
|
|
63
|
+
"CLAWMEM_CONTRADICTION_MIN_CONFIDENCE",
|
|
64
|
+
0.5
|
|
65
|
+
);
|
|
66
|
+
|
|
67
|
+
function parseEnvFloat(name: string, fallback: number): number {
|
|
68
|
+
const raw = process.env[name];
|
|
69
|
+
if (raw === undefined) return fallback;
|
|
70
|
+
const n = Number.parseFloat(raw);
|
|
71
|
+
if (!Number.isFinite(n) || n < 0 || n > 1) return fallback;
|
|
72
|
+
return n;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
// =============================================================================
|
|
76
|
+
// Heuristic contradiction detection (deterministic, no LLM)
|
|
77
|
+
// =============================================================================
|
|
78
|
+
|
|
79
|
+
/**
|
|
80
|
+
* Deterministic heuristic contradiction check.
|
|
81
|
+
*
|
|
82
|
+
* Signals:
|
|
83
|
+
* - **Negation asymmetry:** one side has an explicit negation token
|
|
84
|
+
* (`not`, `never`, `no`, `didn't`, etc.) and the other doesn't.
|
|
85
|
+
* - **Number/date mismatch:** both sides cite numbers or dates but the
|
|
86
|
+
* sets have no shared values.
|
|
87
|
+
*
|
|
88
|
+
* Intentionally conservative: returns `contradictory=false,
|
|
89
|
+
* confidence=0` when no signal is found, leaving the decision to the
|
|
90
|
+
* LLM or the caller's default.
|
|
91
|
+
*/
|
|
92
|
+
export function heuristicContradictionCheck(
|
|
93
|
+
a: string,
|
|
94
|
+
b: string
|
|
95
|
+
): ContradictionResult {
|
|
96
|
+
const negA = hasNegation(a);
|
|
97
|
+
const negB = hasNegation(b);
|
|
98
|
+
|
|
99
|
+
// Negation asymmetry: one side explicitly negates, the other doesn't
|
|
100
|
+
if (negA !== negB) {
|
|
101
|
+
return {
|
|
102
|
+
contradictory: true,
|
|
103
|
+
confidence: 0.6,
|
|
104
|
+
reason: "negation asymmetry — one statement has explicit negation",
|
|
105
|
+
source: "heuristic",
|
|
106
|
+
};
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
const numsA = extractNumbers(a);
|
|
110
|
+
const numsB = extractNumbers(b);
|
|
111
|
+
|
|
112
|
+
// Number/date mismatch: both cite numbers but no shared values
|
|
113
|
+
if (numsA.length > 0 && numsB.length > 0) {
|
|
114
|
+
const setA = new Set(numsA);
|
|
115
|
+
const setB = new Set(numsB);
|
|
116
|
+
const shared = [...setA].filter((n) => setB.has(n));
|
|
117
|
+
if (shared.length === 0) {
|
|
118
|
+
return {
|
|
119
|
+
contradictory: true,
|
|
120
|
+
confidence: 0.5,
|
|
121
|
+
reason: `number/date mismatch (A=${numsA.join(",")} B=${numsB.join(",")})`,
|
|
122
|
+
source: "heuristic",
|
|
123
|
+
};
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
// No heuristic signal
|
|
128
|
+
return {
|
|
129
|
+
contradictory: false,
|
|
130
|
+
confidence: 0.0,
|
|
131
|
+
reason: "no heuristic signal",
|
|
132
|
+
source: "heuristic",
|
|
133
|
+
};
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
/**
|
|
137
|
+
* Extract standalone integers, decimals, and ISO-ish dates from a
|
|
138
|
+
* string as a normalized set of numeric tokens.
|
|
139
|
+
*/
|
|
140
|
+
function extractNumbers(s: string): string[] {
|
|
141
|
+
// Matches: integers, decimals (1.5, 1,000), ISO dates (2026-04-10),
|
|
142
|
+
// US dates (04/10/2026), version strings (v0.7.1 → 0.7.1)
|
|
143
|
+
const matches = s.match(/\b\d{1,5}(?:[.,/-]\d{1,5}){0,2}\b/g) || [];
|
|
144
|
+
return matches.map((m) => m.replace(/,/g, ""));
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
/**
|
|
148
|
+
* Return true if the string contains an explicit negation token.
|
|
149
|
+
* Matches English contractions (didn't, won't, cannot, etc.) plus
|
|
150
|
+
* bare negations (not, never, no).
|
|
151
|
+
*/
|
|
152
|
+
function hasNegation(s: string): boolean {
|
|
153
|
+
return /\b(not|never|no|don['\u2019]t|didn['\u2019]t|won['\u2019]t|cannot|can['\u2019]t|wasn['\u2019]t|isn['\u2019]t|aren['\u2019]t|weren['\u2019]t|shouldn['\u2019]t|couldn['\u2019]t|wouldn['\u2019]t)\b/i.test(
|
|
154
|
+
s
|
|
155
|
+
);
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
// =============================================================================
|
|
159
|
+
// LLM-based contradiction detection
|
|
160
|
+
// =============================================================================
|
|
161
|
+
|
|
162
|
+
// Prompt for the LLM contradiction classifier. `{A}`, `{B}` and
// `{CONTEXT}` are substituted by `llmContradictionCheck`; the model is
// asked for a bare JSON verdict so `extractJsonFromLLM` can parse it.
// NOTE(review): the trailing "/no_think" appears intended to suppress
// chain-of-thought on models honoring that directive — confirm which
// backends respect it.
const CONTRADICTION_PROMPT_TEMPLATE = `You are a logic checker. Determine whether two statements contradict each other.

Statement A: {A}

Statement B: {B}{CONTEXT}

A contradiction exists if one statement directly denies the other, or if both cannot be true at the same time. Subtle differences in specificity (e.g. "Bob" vs "Bob Smith") are NOT contradictions. Different dates, counts, outcomes, or decisions on the same subject ARE contradictions.

Respond with ONLY a JSON object:
{"contradictory": true|false, "confidence": 0.0-1.0, "reason": "brief explanation"}

Do not include any other text. /no_think`;
|
|
174
|
+
|
|
175
|
+
/**
|
|
176
|
+
* LLM-based contradiction classifier.
|
|
177
|
+
*
|
|
178
|
+
* Returns `null` on any of:
|
|
179
|
+
* - LLM generate call throws
|
|
180
|
+
* - LLM returns null (cooldown, timeout, remote LLM down)
|
|
181
|
+
* - LLM returns text but JSON extraction fails
|
|
182
|
+
* - Parsed JSON is missing a boolean `contradictory` field
|
|
183
|
+
*
|
|
184
|
+
* Callers should fall back to the heuristic path on null.
|
|
185
|
+
*/
|
|
186
|
+
export async function llmContradictionCheck(
|
|
187
|
+
llm: LLM,
|
|
188
|
+
a: string,
|
|
189
|
+
b: string,
|
|
190
|
+
context?: string
|
|
191
|
+
): Promise<ContradictionResult | null> {
|
|
192
|
+
const prompt = CONTRADICTION_PROMPT_TEMPLATE.replace("{A}", a)
|
|
193
|
+
.replace("{B}", b)
|
|
194
|
+
.replace("{CONTEXT}", context ? `\n\nContext:\n${context}` : "");
|
|
195
|
+
|
|
196
|
+
let result;
|
|
197
|
+
try {
|
|
198
|
+
result = await llm.generate(prompt, { temperature: 0.2, maxTokens: 150 });
|
|
199
|
+
} catch {
|
|
200
|
+
return null;
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
if (!result?.text) return null;
|
|
204
|
+
|
|
205
|
+
const parsed = extractJsonFromLLM(result.text) as {
|
|
206
|
+
contradictory?: unknown;
|
|
207
|
+
confidence?: unknown;
|
|
208
|
+
reason?: unknown;
|
|
209
|
+
} | null;
|
|
210
|
+
|
|
211
|
+
if (!parsed || typeof parsed.contradictory !== "boolean") return null;
|
|
212
|
+
|
|
213
|
+
const confidence =
|
|
214
|
+
typeof parsed.confidence === "number" && Number.isFinite(parsed.confidence)
|
|
215
|
+
? Math.max(0, Math.min(1, parsed.confidence))
|
|
216
|
+
: 0.5;
|
|
217
|
+
|
|
218
|
+
return {
|
|
219
|
+
contradictory: parsed.contradictory,
|
|
220
|
+
confidence,
|
|
221
|
+
reason: typeof parsed.reason === "string" ? parsed.reason : undefined,
|
|
222
|
+
source: "llm",
|
|
223
|
+
};
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
// =============================================================================
|
|
227
|
+
// Orchestrator
|
|
228
|
+
// =============================================================================
|
|
229
|
+
|
|
230
|
+
/**
|
|
231
|
+
* Orchestrated contradiction check.
|
|
232
|
+
*
|
|
233
|
+
* 1. Try LLM path; if it returns a usable result, use it.
|
|
234
|
+
* 2. Otherwise fall back to the deterministic heuristic.
|
|
235
|
+
*
|
|
236
|
+
* Never throws. Always returns a `ContradictionResult`. When the
|
|
237
|
+
* result's `source` is `heuristic` and `contradictory=false`, the
|
|
238
|
+
* caller knows the check is inconclusive and should proceed with the
|
|
239
|
+
* default merge path.
|
|
240
|
+
*/
|
|
241
|
+
export async function checkContradiction(
|
|
242
|
+
llm: LLM,
|
|
243
|
+
a: string,
|
|
244
|
+
b: string,
|
|
245
|
+
context?: string
|
|
246
|
+
): Promise<ContradictionResult> {
|
|
247
|
+
const llmResult = await llmContradictionCheck(llm, a, b, context);
|
|
248
|
+
if (llmResult) return llmResult;
|
|
249
|
+
return heuristicContradictionCheck(a, b);
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
/**
|
|
253
|
+
* Apply the `CONTRADICTION_MIN_CONFIDENCE` threshold to a
|
|
254
|
+
* `ContradictionResult` — returns true iff the result claims a
|
|
255
|
+
* contradiction AND meets the confidence floor.
|
|
256
|
+
*
|
|
257
|
+
* Callers use this to decide whether to block a merge. Keeping the
|
|
258
|
+
* threshold check centralized means operators can tune via env var
|
|
259
|
+
* without touching the merge code.
|
|
260
|
+
*/
|
|
261
|
+
export function isActionableContradiction(result: ContradictionResult): boolean {
|
|
262
|
+
return (
|
|
263
|
+
result.contradictory === true &&
|
|
264
|
+
result.confidence >= CONTRADICTION_MIN_CONFIDENCE
|
|
265
|
+
);
|
|
266
|
+
}
|
|
@@ -0,0 +1,364 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Text similarity + merge safety gate for consolidation.
|
|
3
|
+
*
|
|
4
|
+
* Prevents semantic collision between topics that share vocabulary but
|
|
5
|
+
* refer to different subjects (e.g., two observations about "Dan" vs
|
|
6
|
+
* "Dad" or "Bob" vs "Rob"). Adds a dual-threshold safety check after
|
|
7
|
+
* the cheap Jaccard candidate-generation step.
|
|
8
|
+
*
|
|
9
|
+
* Entity-aware first: uses `entity_mentions` when both sides have canonical
|
|
10
|
+
* entities resolved. Lexical fallback via proper-noun anchor regex when
|
|
11
|
+
* either side lacks entity state. Strictest default when both sides are
|
|
12
|
+
* empty (no anchors at all).
|
|
13
|
+
*
|
|
14
|
+
* Adapted from Thoth `dream_cycle.py:218-272` subject-name guard
|
|
15
|
+
* (THOTH_EXTRACTION_PLAN.md Extraction 3).
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
import type { Store } from "./store.ts";
|
|
19
|
+
|
|
20
|
+
// =============================================================================
|
|
21
|
+
// Config — dual-threshold merge safety
|
|
22
|
+
// =============================================================================
|
|
23
|
+
|
|
24
|
+
/**
|
|
25
|
+
* NORMAL threshold: applies when anchor sets are compatible (subset or
|
|
26
|
+
* high overlap — same primary subject) AND the gate is gating on text
|
|
27
|
+
* similarity alone. Overridable via `CLAWMEM_MERGE_SCORE_NORMAL` env var
|
|
28
|
+
* for operator calibration during rollout.
|
|
29
|
+
*
|
|
30
|
+
* ⚠ Threshold is inherited from Thoth's `dream_cycle.py:218-272` guard,
|
|
31
|
+
* which uses Python's `difflib.SequenceMatcher` (character-level LCS).
|
|
32
|
+
* ClawMem uses normalized character 3-gram cosine, which is systematically
|
|
33
|
+
* harsher on benign rephrasings (word-order changes, synonym swaps). A
|
|
34
|
+
* same-meaning paraphrase like "The team migrated auth to OAuth2 last
|
|
35
|
+
* Friday" vs "Last Friday the team completed the auth migration to
|
|
36
|
+
* OAuth2" lands around 0.5 in 3-gram cosine but near 0.85 in
|
|
37
|
+
* SequenceMatcher. Consequence: merges will fragment more than Thoth
|
|
38
|
+
* did. This is the SAFE trade-off — fragmentation > false merges — but
|
|
39
|
+
* operators should tune via env var once they have real data.
|
|
40
|
+
*/
|
|
41
|
+
export const MERGE_SCORE_NORMAL = parseEnvFloat(
|
|
42
|
+
"CLAWMEM_MERGE_SCORE_NORMAL",
|
|
43
|
+
0.93
|
|
44
|
+
);
|
|
45
|
+
|
|
46
|
+
/**
|
|
47
|
+
* STRICT threshold: applies in the strictest-default path (both sides
|
|
48
|
+
* have zero anchors — no canonical entities, no proper-noun anchors).
|
|
49
|
+
* Overridable via `CLAWMEM_MERGE_SCORE_STRICT`.
|
|
50
|
+
*
|
|
51
|
+
* Non-strictest-default paths use the hard-reject rule on materially
|
|
52
|
+
* different anchors, not this threshold.
|
|
53
|
+
*/
|
|
54
|
+
export const MERGE_SCORE_STRICT = parseEnvFloat(
|
|
55
|
+
"CLAWMEM_MERGE_SCORE_STRICT",
|
|
56
|
+
0.98
|
|
57
|
+
);
|
|
58
|
+
|
|
59
|
+
function parseEnvFloat(name: string, fallback: number): number {
|
|
60
|
+
const raw = process.env[name];
|
|
61
|
+
if (raw === undefined) return fallback;
|
|
62
|
+
const n = Number.parseFloat(raw);
|
|
63
|
+
if (!Number.isFinite(n) || n < 0 || n > 1) return fallback;
|
|
64
|
+
return n;
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
// =============================================================================
|
|
68
|
+
// Anchor extraction (entity-first, lexical fallback)
|
|
69
|
+
// =============================================================================
|
|
70
|
+
|
|
71
|
+
// How subject anchors were derived: canonical entity IDs resolved from
// the `entity_mentions` table, or capitalized-token extraction straight
// from the raw text.
export type AnchorSource = "entity_mentions" | "lexical_fallback";

// Anchor set plus the method that produced it. When `method` is
// "lexical_fallback" with empty `entities`, the caller is expected to
// run lexical extraction over the raw text itself.
export interface ExtractedAnchors {
  entities: string[]; // canonical entity IDs or lowercased lexical tokens
  method: AnchorSource;
}
|
|
77
|
+
|
|
78
|
+
/**
|
|
79
|
+
* Get canonical entity IDs referenced by a set of source documents.
|
|
80
|
+
* Returns `{ entities: [], method: 'lexical_fallback' }` when no entity
|
|
81
|
+
* mentions exist for any of the given docs — the caller should then
|
|
82
|
+
* fall back to lexical anchor extraction over the raw text.
|
|
83
|
+
*/
|
|
84
|
+
export function extractSourceDocEntities(
|
|
85
|
+
store: Store,
|
|
86
|
+
sourceDocIds: number[]
|
|
87
|
+
): ExtractedAnchors {
|
|
88
|
+
if (sourceDocIds.length === 0) {
|
|
89
|
+
return { entities: [], method: "lexical_fallback" };
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
const placeholders = sourceDocIds.map(() => "?").join(",");
|
|
93
|
+
let rows: { entity_id: string }[];
|
|
94
|
+
try {
|
|
95
|
+
rows = store.db
|
|
96
|
+
.prepare(
|
|
97
|
+
`SELECT DISTINCT entity_id FROM entity_mentions WHERE doc_id IN (${placeholders})`
|
|
98
|
+
)
|
|
99
|
+
.all(...sourceDocIds) as { entity_id: string }[];
|
|
100
|
+
} catch {
|
|
101
|
+
return { entities: [], method: "lexical_fallback" };
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
if (rows.length === 0) {
|
|
105
|
+
return { entities: [], method: "lexical_fallback" };
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
return {
|
|
109
|
+
entities: rows.map((r) => r.entity_id),
|
|
110
|
+
method: "entity_mentions",
|
|
111
|
+
};
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
/**
|
|
115
|
+
* Extract lexical subject anchors from raw text.
|
|
116
|
+
*
|
|
117
|
+
* Heuristic: capitalized tokens that are not common sentence-start words.
|
|
118
|
+
* This is the fallback when `entity_mentions` is empty (the doc has not
|
|
119
|
+
* been through entity enrichment yet, or is from the pre-entity era).
|
|
120
|
+
*/
|
|
121
|
+
export function extractSubjectAnchorsLexical(text: string): string[] {
|
|
122
|
+
if (!text) return [];
|
|
123
|
+
|
|
124
|
+
// Match capitalized tokens: CamelCase, UPPERCASE, Capitalized.
|
|
125
|
+
// Minimum 2 chars to avoid matching stray initials at sentence start.
|
|
126
|
+
const matches = text.match(/\b[A-Z][a-zA-Z0-9]{1,}\b/g) || [];
|
|
127
|
+
|
|
128
|
+
// Filter common sentence-start capitalized words that aren't proper nouns
|
|
129
|
+
const stopwords = new Set<string>([
|
|
130
|
+
"the", "a", "an", "this", "that", "these", "those",
|
|
131
|
+
"it", "we", "i", "he", "she", "they", "you", "me", "him", "her", "us", "them",
|
|
132
|
+
"and", "but", "or", "not", "is", "was", "are", "were",
|
|
133
|
+
"be", "been", "being", "have", "has", "had",
|
|
134
|
+
"do", "does", "did", "will", "would", "should", "could", "can",
|
|
135
|
+
"may", "might", "must", "shall",
|
|
136
|
+
"in", "on", "at", "to", "for", "of", "with", "by", "from",
|
|
137
|
+
"if", "then", "else", "when", "while", "where", "how", "why",
|
|
138
|
+
"all", "any", "some", "no", "one", "two",
|
|
139
|
+
]);
|
|
140
|
+
|
|
141
|
+
const normalized = new Set<string>();
|
|
142
|
+
for (const token of matches) {
|
|
143
|
+
const lower = token.toLowerCase();
|
|
144
|
+
if (lower.length >= 2 && !stopwords.has(lower)) {
|
|
145
|
+
normalized.add(lower);
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
return [...normalized];
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
// =============================================================================
|
|
153
|
+
// Normalized character 3-gram cosine similarity
|
|
154
|
+
// =============================================================================
|
|
155
|
+
|
|
156
|
+
/**
|
|
157
|
+
* Character 3-gram cosine similarity.
|
|
158
|
+
*
|
|
159
|
+
* Robust to word-level permutation and punctuation; catches near-duplicate
|
|
160
|
+
* statements that differ only in wording or whitespace. Returns 0.0..1.0.
|
|
161
|
+
*
|
|
162
|
+
* Chosen over Jaccard (used as the cheap first-stage filter) because
|
|
163
|
+
* 3-gram cosine is tighter on paraphrase detection — it distinguishes
|
|
164
|
+
* "Dan visited Paris" from "Dad visited Paris" while the Jaccard over
|
|
165
|
+
* long-word sets would treat both as near-duplicates.
|
|
166
|
+
*/
|
|
167
|
+
export function normalizedCosine3Gram(a: string, b: string): number {
|
|
168
|
+
const na = normalizeForTrigram(a);
|
|
169
|
+
const nb = normalizeForTrigram(b);
|
|
170
|
+
|
|
171
|
+
if (na.length === 0 || nb.length === 0) return 0;
|
|
172
|
+
if (na === nb) return 1.0;
|
|
173
|
+
|
|
174
|
+
const ta = trigramCounts(na);
|
|
175
|
+
const tb = trigramCounts(nb);
|
|
176
|
+
|
|
177
|
+
let dot = 0;
|
|
178
|
+
for (const [gram, count] of ta) {
|
|
179
|
+
const other = tb.get(gram);
|
|
180
|
+
if (other) dot += count * other;
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
const ma = magnitude(ta);
|
|
184
|
+
const mb = magnitude(tb);
|
|
185
|
+
if (ma === 0 || mb === 0) return 0;
|
|
186
|
+
|
|
187
|
+
return dot / (ma * mb);
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
function normalizeForTrigram(s: string): string {
|
|
191
|
+
return s
|
|
192
|
+
.toLowerCase()
|
|
193
|
+
.replace(/[^a-z0-9 ]+/g, " ")
|
|
194
|
+
.replace(/\s+/g, " ")
|
|
195
|
+
.trim();
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
function trigramCounts(s: string): Map<string, number> {
|
|
199
|
+
const out = new Map<string, number>();
|
|
200
|
+
if (s.length < 3) {
|
|
201
|
+
out.set(s, 1);
|
|
202
|
+
return out;
|
|
203
|
+
}
|
|
204
|
+
for (let i = 0; i <= s.length - 3; i++) {
|
|
205
|
+
const gram = s.slice(i, i + 3);
|
|
206
|
+
out.set(gram, (out.get(gram) || 0) + 1);
|
|
207
|
+
}
|
|
208
|
+
return out;
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
function magnitude(m: Map<string, number>): number {
|
|
212
|
+
let sum = 0;
|
|
213
|
+
for (const v of m.values()) sum += v * v;
|
|
214
|
+
return Math.sqrt(sum);
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
// =============================================================================
|
|
218
|
+
// Anchor set comparison
|
|
219
|
+
// =============================================================================
|
|
220
|
+
|
|
221
|
+
/**
|
|
222
|
+
* Determine whether two anchor sets "materially differ".
|
|
223
|
+
*
|
|
224
|
+
* Rules (all case-insensitive):
|
|
225
|
+
* 1. Either side empty → NOT materially different (caller handles via
|
|
226
|
+
* strictest-default path).
|
|
227
|
+
* 2. One set is a subset of the other → NOT materially different
|
|
228
|
+
* (allows `"Bob"` ↔ `"Bob Smith"`).
|
|
229
|
+
* 3. Intersection empty → materially different (`"Dan"` vs `"Dad"`).
|
|
230
|
+
* 4. Partial overlap → materially different when AT MOST half of the
|
|
231
|
+
* smaller set is shared (boundary `≤ 0.5` treated as material to
|
|
232
|
+
* fence off primary-subject mismatches like
|
|
233
|
+
* `[alice, auth-service]` vs `[bob, auth-service]` where the only
|
|
234
|
+
* shared anchor is the context, not the subject).
|
|
235
|
+
*/
|
|
236
|
+
export function anchorSetsMateriallyDiffer(a: string[], b: string[]): boolean {
|
|
237
|
+
if (a.length === 0 || b.length === 0) return false;
|
|
238
|
+
|
|
239
|
+
const setA = new Set(a.map((x) => x.toLowerCase()));
|
|
240
|
+
const setB = new Set(b.map((x) => x.toLowerCase()));
|
|
241
|
+
|
|
242
|
+
const aSubB = [...setA].every((x) => setB.has(x));
|
|
243
|
+
const bSubA = [...setB].every((x) => setA.has(x));
|
|
244
|
+
if (aSubB || bSubA) return false;
|
|
245
|
+
|
|
246
|
+
const intersect = [...setA].filter((x) => setB.has(x));
|
|
247
|
+
if (intersect.length === 0) return true;
|
|
248
|
+
|
|
249
|
+
const smaller = Math.min(setA.size, setB.size);
|
|
250
|
+
return intersect.length / smaller <= 0.5;
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
// =============================================================================
|
|
254
|
+
// Merge safety gate
|
|
255
|
+
// =============================================================================
|
|
256
|
+
|
|
257
|
+
// How the gate compared anchors: canonical entity IDs on both sides,
// lexical proper-noun tokens, or the strictest default when neither
// side had any anchors at all.
export type MergeSafetyMethod = "entity_aware" | "lexical_only" | "strictest_default";

// Decision record returned by `passesMergeSafety` — kept structured so
// operators can log score/threshold/reason for every gate decision.
export interface MergeSafetyResult {
  accepted: boolean;
  score: number; // normalized 3-gram cosine of the two texts (0..1)
  threshold: number; // threshold used (or reported, on hard reject)
  reason: string; // human-readable explanation for logs
  method: MergeSafetyMethod;
}
|
|
266
|
+
|
|
267
|
+
/**
|
|
268
|
+
* Merge safety gate.
|
|
269
|
+
*
|
|
270
|
+
* Flow:
|
|
271
|
+
* 1. Compute normalized character 3-gram cosine similarity between the
|
|
272
|
+
* candidate and existing observation texts.
|
|
273
|
+
* 2. Extract anchor sets for both sides. Entity-aware first
|
|
274
|
+
* (`entity_mentions`), lexical fallback otherwise. If EITHER side
|
|
275
|
+
* lacks `entity_mentions` coverage, both sides fall back to lexical
|
|
276
|
+
* so the comparison is apples-to-apples.
|
|
277
|
+
* 3. Decide:
|
|
278
|
+
* - Both anchor sets empty (strictest default) → accept iff
|
|
279
|
+
* `score >= MERGE_SCORE_STRICT`.
|
|
280
|
+
* - Anchors materially differ → **HARD REJECT regardless of text
|
|
281
|
+
* similarity**. This is the primary safety goal: two observations
|
|
282
|
+
* whose canonical subjects differ are never the same observation,
|
|
283
|
+
* even if the LLM emits identical wording. Historically the gate
|
|
284
|
+
* upgraded to a stricter threshold instead of hard-rejecting, but
|
|
285
|
+
* that allowed merges at score 1.0 when the LLM emitted templated
|
|
286
|
+
* text with no subject name.
|
|
287
|
+
* - Anchors compatible (subset or high overlap) → accept iff
|
|
288
|
+
* `score >= MERGE_SCORE_NORMAL`.
|
|
289
|
+
*/
|
|
290
|
+
export function passesMergeSafety(
|
|
291
|
+
store: Store,
|
|
292
|
+
candidateText: string,
|
|
293
|
+
candidateSourceDocIds: number[],
|
|
294
|
+
existingText: string,
|
|
295
|
+
existingSourceDocIds: number[]
|
|
296
|
+
): MergeSafetyResult {
|
|
297
|
+
const score = normalizedCosine3Gram(candidateText, existingText);
|
|
298
|
+
|
|
299
|
+
const candEnt = extractSourceDocEntities(store, candidateSourceDocIds);
|
|
300
|
+
const existEnt = extractSourceDocEntities(store, existingSourceDocIds);
|
|
301
|
+
|
|
302
|
+
// Use entity-aware path only when BOTH sides have entity mentions —
|
|
303
|
+
// otherwise the comparison is apples-to-oranges (one side is a set of
|
|
304
|
+
// canonical IDs, the other is a set of lexical tokens).
|
|
305
|
+
const bothEntity =
|
|
306
|
+
candEnt.method === "entity_mentions" && existEnt.method === "entity_mentions";
|
|
307
|
+
|
|
308
|
+
let anchorsA: string[];
|
|
309
|
+
let anchorsB: string[];
|
|
310
|
+
let method: MergeSafetyMethod;
|
|
311
|
+
|
|
312
|
+
if (bothEntity) {
|
|
313
|
+
anchorsA = candEnt.entities;
|
|
314
|
+
anchorsB = existEnt.entities;
|
|
315
|
+
method = "entity_aware";
|
|
316
|
+
} else {
|
|
317
|
+
anchorsA = extractSubjectAnchorsLexical(candidateText);
|
|
318
|
+
anchorsB = extractSubjectAnchorsLexical(existingText);
|
|
319
|
+
method = "lexical_only";
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
// Strictest default: both sides empty → no subject signal at all
|
|
323
|
+
if (anchorsA.length === 0 && anchorsB.length === 0) {
|
|
324
|
+
const threshold = MERGE_SCORE_STRICT;
|
|
325
|
+
const accepted = score >= threshold;
|
|
326
|
+
return {
|
|
327
|
+
accepted,
|
|
328
|
+
score,
|
|
329
|
+
threshold,
|
|
330
|
+
reason: accepted
|
|
331
|
+
? `strictest-default met (${score.toFixed(3)} >= ${threshold})`
|
|
332
|
+
: `strictest-default unmet (${score.toFixed(3)} < ${threshold})`,
|
|
333
|
+
method: "strictest_default",
|
|
334
|
+
};
|
|
335
|
+
}
|
|
336
|
+
|
|
337
|
+
// Hard reject on materially different anchors — this is the primary
|
|
338
|
+
// safety goal of the extraction. Applies to BOTH entity_aware and
|
|
339
|
+
// lexical_only modes so the policy is uniform.
|
|
340
|
+
if (anchorSetsMateriallyDiffer(anchorsA, anchorsB)) {
|
|
341
|
+
return {
|
|
342
|
+
accepted: false,
|
|
343
|
+
score,
|
|
344
|
+
// Reported threshold is STRICT only for operator logging; the
|
|
345
|
+
// decision was hard-reject, not threshold-gated.
|
|
346
|
+
threshold: MERGE_SCORE_STRICT,
|
|
347
|
+
reason: `${method} materially different anchors — hard reject (score=${score.toFixed(3)})`,
|
|
348
|
+
method,
|
|
349
|
+
};
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
// Compatible anchors (subset or high overlap): gate on text similarity
|
|
353
|
+
const threshold = MERGE_SCORE_NORMAL;
|
|
354
|
+
const accepted = score >= threshold;
|
|
355
|
+
return {
|
|
356
|
+
accepted,
|
|
357
|
+
score,
|
|
358
|
+
threshold,
|
|
359
|
+
reason: accepted
|
|
360
|
+
? `${method} aligned anchors — ${score.toFixed(3)} >= ${threshold}`
|
|
361
|
+
: `${method} aligned anchors — ${score.toFixed(3)} < ${threshold}`,
|
|
362
|
+
method,
|
|
363
|
+
};
|
|
364
|
+
}
|