@pentatonic-ai/ai-agent-sdk 0.10.5 → 0.10.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
- package/packages/memory-engine-v2/compat/requirements.txt +6 -0
- package/packages/memory-engine-v2/compat/server.py +258 -18
- package/packages/memory-engine-v2/docker-compose.aws.yml +62 -1
- package/packages/memory-engine-v2/docker-compose.yml +8 -1
- package/packages/memory-engine-v2/eval/recall_at_k.py +242 -0
- package/packages/memory-engine-v2/eval/retrieval_golden.seed.json +69 -0
- package/packages/memory-engine-v2/extractor-async/Dockerfile +1 -1
- package/packages/memory-engine-v2/extractor-async/extraction_schema.py +246 -0
- package/packages/memory-engine-v2/extractor-async/test_guided_json_parser.py +455 -0
- package/packages/memory-engine-v2/extractor-async/worker.py +391 -31
- package/packages/memory-engine-v2/extractor-sync/server.py +6 -2
- package/packages/memory-engine-v2/extractor-sync/test_paired_extraction.py +82 -1
- package/packages/memory-engine-v2/org-model/migrations/004_source_kind_code_reference.sql +12 -0
- package/packages/memory-engine-v2/org-model/migrations/005_fk_indexes.sql +20 -0
- package/packages/memory-engine-v2/resolution-queue-design.md +165 -0
- package/packages/memory-engine-v2/scripts/backfill_entity_reconciliation.py +11 -2
- package/packages/memory-engine-v2/scripts/backfill_sparse_vectors.py +369 -0
- package/packages/memory-engine-v2/scripts/bakeoff_guided_vs_kv.py +607 -0
- package/packages/memory-engine-v2/scripts/entity_resolution_v2.py +1041 -0
- package/packages/memory-engine-v2/tests/test_entity_resolution_v2.py +507 -0
- package/packages/memory-engine-v2/tests/test_hybrid_retrieval.py +810 -0
|
@@ -0,0 +1,1041 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Entity-resolution v2: blocking + embedding similarity + LLM
|
|
3
|
+
adjudication. Roadmap BET 1b — DRY-RUN TOOLING.
|
|
4
|
+
|
|
5
|
+
What v1 (backfill_entity_reconciliation.py) covers: variants that share
|
|
6
|
+
an exact normalized surface form (alias overlap) or were paired by a
|
|
7
|
+
name↔email co-occurrence in an event. What it does NOT cover — and
|
|
8
|
+
this script adds:
|
|
9
|
+
|
|
10
|
+
- "Johann_Boedecker" (underscore separator)
|
|
11
|
+
- "Bödecker, Johann" (diacritic + comma inversion)
|
|
12
|
+
- "Johann" (bare first name)
|
|
13
|
+
- "Johanna Phil" (near-miss that must NOT merge)
|
|
14
|
+
|
|
15
|
+
Pipeline (within ONE arena, person entities):
|
|
16
|
+
|
|
17
|
+
1. v1 signals first — co_occurrence + alias_overlap proposals are
|
|
18
|
+
IMPORTED from v1's machinery, unchanged. They outrank everything.
|
|
19
|
+
2. BLOCKING — candidate generation only, NEVER identity. Punctuation/
|
|
20
|
+
underscore strip, diacritic fold (NFD + strip combining), token
|
|
21
|
+
sort, char-trigram buckets. Block keys: first-token, last-token,
|
|
22
|
+
email local-part, trigram bucket — within (arena, entity_type).
|
|
23
|
+
3. EMBEDDING similarity within blocks (pluggable backend; the HTTP
|
|
24
|
+
backend hits an OpenAI-compatible /v1/embeddings endpoint given
|
|
25
|
+
via --embed-url — never hardcoded, never called from tests).
|
|
26
|
+
Cosine ≥ 0.92 → high-confidence; 0.75–0.92 → ambiguous band;
|
|
27
|
+
< 0.75 → drop.
|
|
28
|
+
4. LLM ADJUDICATION of the ambiguous band (Anthropic API, env
|
|
29
|
+
ANTHROPIC_API_KEY, model env-pinned, temperature 0). Strict JSON
|
|
30
|
+
{"same_person": "yes"|"no"|"unsure", "reason": ...}. "unsure"
|
|
31
|
+
NEVER merges — it lands in the human-review section. --no-llm
|
|
32
|
+
routes the whole band to human review.
|
|
33
|
+
5. BARE-FIRST-NAME POLICY — a single-token entity merges only if it
|
|
34
|
+
has exactly ONE candidate in its blocks AND adjudication == yes.
|
|
35
|
+
6. Tiered report (JSONL + markdown):
|
|
36
|
+
co_occurrence > alias_overlap > embedding_llm > heuristic.
|
|
37
|
+
|
|
38
|
+
SAFETY (the engine is shared multi-tenant infra; the pip-agents arena
|
|
39
|
+
is LEGACY/FROZEN and must be untouchable by construction):
|
|
40
|
+
|
|
41
|
+
- --arena is REQUIRED; every SQL statement this script issues is
|
|
42
|
+
arena-scoped (programmatically asserted — see ARENA_SCOPED_SQL +
|
|
43
|
+
assert_arena_scoped()).
|
|
44
|
+
- DRY-RUN BY DEFAULT. Without --apply the session is forced
|
|
45
|
+
READ ONLY at the Postgres level (SET default_transaction_read_only
|
|
46
|
+
= on) — structurally incapable of writing.
|
|
47
|
+
- --apply additionally REQUIRES --i-have-a-snapshot (operator
|
|
48
|
+
acknowledgment that a pg_dump snapshot exists). Refused otherwise,
|
|
49
|
+
before any connection is opened.
|
|
50
|
+
|
|
51
|
+
Usage:
|
|
52
|
+
|
|
53
|
+
python3 entity_resolution_v2.py \\
|
|
54
|
+
--arena <arena-id> \\
|
|
55
|
+
--pg-dsn postgresql://... \\
|
|
56
|
+
--embed-url http://<embed-gateway>/v1/embeddings \\
|
|
57
|
+
[--no-llm] # route ambiguous band to human review
|
|
58
|
+
[--out /tmp/resolution.jsonl] # tiered merge-candidate report
|
|
59
|
+
[--summary /tmp/resolution.md]
|
|
60
|
+
[--apply --i-have-a-snapshot] # write (gated; see above)
|
|
61
|
+
|
|
62
|
+
Exit codes: 0 success; 1 partial apply failure; 2 bad args / refused.
|
|
63
|
+
"""
|
|
64
|
+
|
|
65
|
+
from __future__ import annotations
|
|
66
|
+
|
|
67
|
+
import argparse
|
|
68
|
+
import importlib.util
|
|
69
|
+
import json
|
|
70
|
+
import math
|
|
71
|
+
import os
|
|
72
|
+
import re
|
|
73
|
+
import sys
|
|
74
|
+
import unicodedata
|
|
75
|
+
import urllib.error
|
|
76
|
+
import urllib.request
|
|
77
|
+
from collections import defaultdict
|
|
78
|
+
from dataclasses import dataclass, field
|
|
79
|
+
from datetime import datetime, timezone
|
|
80
|
+
from pathlib import Path
|
|
81
|
+
|
|
82
|
+
# ----------------------------------------------------------------------
|
|
83
|
+
# Import v1's load/apply/report machinery (same dir; scripts/ is not a
|
|
84
|
+
# package, so load by path). v1 stays the single owner of: entity
|
|
85
|
+
# loading, co-occurrence collection, union-find proposal building,
|
|
86
|
+
# richest-row-wins canonical selection, per-proposal transactional
|
|
87
|
+
# apply, entity_merges audit + rollback_payload.
|
|
88
|
+
# ----------------------------------------------------------------------
|
|
89
|
+
|
|
90
|
+
_V1_PATH = Path(__file__).resolve().parent / "backfill_entity_reconciliation.py"
if "backfill_entity_reconciliation" in sys.modules:
    # Something already imported v1 under this name: reuse it so there is
    # exactly one copy of its classes (Entity, MergeProposal) in the process.
    v1 = sys.modules["backfill_entity_reconciliation"]
else:
    _spec = importlib.util.spec_from_file_location(
        "backfill_entity_reconciliation", _V1_PATH)
    # Explicit raise instead of `assert`: asserts are stripped under -O and
    # the failure would otherwise surface later as an opaque AttributeError.
    if _spec is None or _spec.loader is None:
        raise ImportError(
            f"cannot load backfill_entity_reconciliation from {_V1_PATH}")
    v1 = importlib.util.module_from_spec(_spec)
    # Register BEFORE exec: dataclass processing (py3.13+) looks the
    # defining module up in sys.modules.
    sys.modules["backfill_entity_reconciliation"] = v1
    try:
        _spec.loader.exec_module(v1)
    except BaseException:
        # Do not leave a half-initialised module registered; a later import
        # of this file would otherwise silently reuse the broken object.
        sys.modules.pop("backfill_entity_reconciliation", None)
        raise

# Re-exported v1 names used throughout this module.
Entity = v1.Entity
MergeProposal = v1.MergeProposal
psycopg = v1.psycopg  # None when the driver isn't installed (tests)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# ----------------------------------------------------------------------
|
|
109
|
+
# Thresholds & constants
|
|
110
|
+
# ----------------------------------------------------------------------
|
|
111
|
+
|
|
112
|
+
HIGH_THRESHOLD = 0.92 # >= : high-confidence candidate
|
|
113
|
+
LOW_THRESHOLD = 0.75 # < : drop. [LOW, HIGH) : ambiguous band → LLM
|
|
114
|
+
DEFAULT_TOP_K_FACTS = 8
|
|
115
|
+
DEFAULT_MAX_BLOCK = 100 # blocks bigger than this are skipped for
|
|
116
|
+
# pair generation (candidate gen, not recall-
|
|
117
|
+
# critical: v1 signals still cover them)
|
|
118
|
+
ADJUDICATION_SAMPLE_FACTS = 5
|
|
119
|
+
|
|
120
|
+
LLM_MODEL_ENV = "ENTITY_RESOLUTION_LLM_MODEL"
|
|
121
|
+
LLM_MODEL_DEFAULT = "claude-haiku-4-5"
|
|
122
|
+
ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
|
|
123
|
+
ANTHROPIC_VERSION = "2023-06-01"
|
|
124
|
+
|
|
125
|
+
TIER_ORDER = ("co_occurrence", "alias_overlap", "embedding_llm", "heuristic")
|
|
126
|
+
|
|
127
|
+
# ----------------------------------------------------------------------
|
|
128
|
+
# Every SQL statement THIS script issues. All must be arena-scoped —
|
|
129
|
+
# assert_arena_scoped() enforces it and the unit tests assert it too.
|
|
130
|
+
# (v1's statements are arena-scoped at load + repoint; its by-id
|
|
131
|
+
# entity UPDATE/DELETE statements operate only on ids returned by the
|
|
132
|
+
# arena-scoped loads.)
|
|
133
|
+
# ----------------------------------------------------------------------
|
|
134
|
+
|
|
135
|
+
FACTS_FOR_ENTITY_SQL = """
|
|
136
|
+
SELECT statement FROM facts
|
|
137
|
+
WHERE arena = %s AND (subject_entity_id = %s OR object_entity_id = %s)
|
|
138
|
+
ORDER BY confidence DESC, asserted_at DESC
|
|
139
|
+
LIMIT %s
|
|
140
|
+
"""
|
|
141
|
+
|
|
142
|
+
ENTITY_COUNT_SQL = """
|
|
143
|
+
SELECT COUNT(*) AS n FROM entities
|
|
144
|
+
WHERE arena = %s AND entity_type = %s
|
|
145
|
+
"""
|
|
146
|
+
|
|
147
|
+
# Reads pg_catalog constraint metadata only — no tenant rows — so it
|
|
148
|
+
# is exempt from the arena-scoping rule below.
|
|
149
|
+
ENTITY_MERGES_CHECKDEF_SQL = """
|
|
150
|
+
SELECT pg_get_constraintdef(oid) AS def
|
|
151
|
+
FROM pg_constraint
|
|
152
|
+
WHERE conrelid = 'entity_merges'::regclass AND contype = 'c'
|
|
153
|
+
"""
|
|
154
|
+
|
|
155
|
+
ARENA_SCOPED_SQL: dict[str, str] = {
|
|
156
|
+
"facts_for_entity": FACTS_FOR_ENTITY_SQL,
|
|
157
|
+
"entity_count": ENTITY_COUNT_SQL,
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def assert_arena_scoped() -> None:
    """Refuse to run if any registered tenant-data statement lacks an
    arena predicate.

    Walks ARENA_SCOPED_SQL in insertion order and raises AssertionError
    naming the first statement whose text does not contain the literal
    ``arena = %s``. Returns None when every statement passes.
    """
    offender = next(
        (label for label, statement in ARENA_SCOPED_SQL.items()
         if "arena = %s" not in statement),
        None,
    )
    if offender is None:
        return
    raise AssertionError(
        f"SQL '{offender}' is not arena-scoped — refusing to run. "
        f"Every statement must carry 'arena = %s'."
    )
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
# ----------------------------------------------------------------------
|
|
173
|
+
# 1. Candidate-generation normalization — NEVER used for identity,
|
|
174
|
+
# ONLY for blocking. Identity stays with entity_id.py /
|
|
175
|
+
# v1._normalize_surface.
|
|
176
|
+
# ----------------------------------------------------------------------
|
|
177
|
+
|
|
178
|
+
_PUNCT_RE = re.compile(r"[^\w\s@]|_", re.UNICODE)
|
|
179
|
+
_WS_RE = re.compile(r"\s+")
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def fold_diacritics(s: str) -> str:
    """Strip accents: NFD-decompose *s* and drop every combining mark,
    e.g. 'Bödecker' becomes 'Bodecker'."""
    decomposed = unicodedata.normalize("NFD", s)
    # combining() returns 0 for base characters and non-zero for marks.
    base_chars = [c for c in decomposed if unicodedata.combining(c) == 0]
    return "".join(base_chars)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def block_normalize(s: str) -> str:
    """Return the blocking-only normal form of *s*.

    Steps, in order: NFKC normalize (None/empty treated as ''), fold
    diacritics, lowercase, replace punctuation and underscores with a
    space ('@' is kept), collapse whitespace runs and trim.

        'Johann_Boedecker' → 'johann boedecker'
        'Bödecker, Johann' → 'bodecker johann'
    """
    text = unicodedata.normalize("NFKC", s or "")
    text = fold_diacritics(text).lower()
    text = _PUNCT_RE.sub(" ", text)
    return _WS_RE.sub(" ", text).strip()
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def block_tokens(s: str) -> list[str]:
    """Split the blocking normal form of *s* into tokens; an empty normal
    form yields an empty list (not [''])."""
    normalized = block_normalize(s)
    if not normalized:
        return []
    return normalized.split(" ")
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def token_sort(s: str) -> str:
    """Order-insensitive form of *s*: its blocking tokens, sorted and
    re-joined with single spaces, so comma-inverted and natural-order
    spellings of the same tokens compare equal."""
    tokens = block_tokens(s)
    tokens.sort()
    return " ".join(tokens)
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def char_trigrams(s: str) -> set[str]:
    """Character trigrams of the token-sorted form with spaces removed.

    Strings shorter than three characters yield a single-element set
    holding the whole string; an empty string yields an empty set.
    Near-spellings such as 'boedecker' / 'bodecker' share trigrams.
    """
    compact = token_sort(s).replace(" ", "")
    if not compact:
        return set()
    if len(compact) < 3:
        return {compact}
    return {compact[start:start + 3] for start in range(len(compact) - 2)}
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _entity_surface_forms(e: Entity) -> list[str]:
    """All surface forms of *e*: canonical name, then aliases, then the
    local part of every email-looking form (a name-ish string in its own
    right, e.g. 'johann.boedecker@…' contributes 'johann.boedecker')."""
    base_forms = [e.canonical_name, *e.aliases]
    email_locals = (
        v1._local_part(form)
        for form in base_forms
        if v1._looks_like_email(form)
    )
    # Empty / missing local parts are discarded.
    return base_forms + [local for local in email_locals if local]
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def blocking_keys(e: Entity) -> set[str]:
    """Collect every block key for one entity.

    Email-looking forms contribute only a 'local:' key (normalized local
    part, spaces removed). Every other form contributes 'first:' and
    'last:' keys taken from its alphabetically SORTED tokens, plus one
    'tri:' key per character trigram. Forms with no tokens contribute
    nothing.
    """
    keys: set[str] = set()
    for surface in _entity_surface_forms(e):
        if v1._looks_like_email(surface):
            local = v1._local_part(surface)
            if local:
                keys.add("local:" + block_normalize(local).replace(" ", ""))
            continue
        ordered = sorted(block_tokens(surface))
        if ordered:
            keys.update(("first:" + ordered[0], "last:" + ordered[-1]))
            keys.update("tri:" + tri for tri in char_trigrams(surface))
    return keys
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def is_bare_name(e: Entity) -> bool:
    """True when *e* has at least one non-email surface form and every
    such form is at most one token ('Johann').

    Entities with only email forms are not bare. Bare entities fall under
    the bare-first-name policy (step 5 of the module docstring).
    """
    has_name_form = False
    for form in (e.canonical_name, *e.aliases):
        if v1._looks_like_email(form):
            continue
        has_name_form = True
        if len(block_tokens(form)) > 1:
            # One multi-token form is enough to disqualify.
            return False
    return has_name_form
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
@dataclass
class CandidatePair:
    """Two entities proposed as possible duplicates, plus the scoring and
    routing state attached to them as the pipeline progresses."""

    a: Entity
    b: Entity
    # Block keys the two entities have in common.
    shared_keys: set[str] = field(default_factory=set)
    # Filled in by the embedding stage; None until scored.
    similarity: float | None = None
    band: str | None = None     # 'high' | 'ambiguous' | 'drop'
    verdict: str | None = None  # 'yes' | 'no' | 'unsure' | None
    reason: str | None = None

    @property
    def key(self) -> frozenset:
        """Order-independent identity of the pair: the set of both ids."""
        return frozenset((self.a.id, self.b.id))
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def generate_candidate_pairs(
    entities: list[Entity],
    already_grouped: set[frozenset] | None = None,
    max_block: int = DEFAULT_MAX_BLOCK,
) -> list[CandidatePair]:
    """Blocking stage: emit one CandidatePair per unordered pair of
    entities that share at least one block key.

    *entities* is expected to be arena-scoped already. Pairs whose id set
    appears in *already_grouped* (claimed by a v1 signal) are skipped, as
    are pairs whose two ids coincide. Blocks with fewer than two or more
    than *max_block* members generate no pairs. The result is sorted by
    the pair's sorted ids.
    """
    claimed = already_grouped or set()

    buckets: dict[str, list[Entity]] = defaultdict(list)
    for entity in entities:
        for block_key in blocking_keys(entity):
            buckets[block_key].append(entity)

    found: dict[frozenset, CandidatePair] = {}
    for block_key, members in buckets.items():
        if not 2 <= len(members) <= max_block:
            continue
        for idx, left in enumerate(members):
            for right in members[idx + 1:]:
                pair_id = frozenset((left.id, right.id))
                if len(pair_id) < 2 or pair_id in claimed:
                    continue
                candidate = found.get(pair_id)
                if candidate is None:
                    candidate = found[pair_id] = CandidatePair(a=left, b=right)
                candidate.shared_keys.add(block_key)

    return sorted(found.values(), key=lambda cand: sorted(cand.key))
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
# ----------------------------------------------------------------------
|
|
313
|
+
# 2. Embedding-similarity stage — pluggable backend
|
|
314
|
+
# ----------------------------------------------------------------------
|
|
315
|
+
|
|
316
|
+
class EmbeddingBackend:
    """Abstract embedding backend: subclasses turn a list of texts into
    one vector per text."""

    def embed(self, texts: list[str]) -> list[list[float]]:  # pragma: no cover
        """Embed *texts*; must be overridden by concrete backends."""
        raise NotImplementedError
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
class HttpEmbeddingBackend(EmbeddingBackend):
    """OpenAI-compatible /v1/embeddings endpoint (the engine's Qwen3
    embed gateway). Endpoint comes from --embed-url — NEVER hardcoded.
    NOT called in tests (tests use a fake backend)."""

    def __init__(self, url: str, model: str | None = None,
                 timeout: float = 30.0, batch_size: int = 32) -> None:
        """Store connection settings.

        Raises:
            ValueError: if *url* is empty or *batch_size* is below 1.
        """
        if not url:
            raise ValueError("HttpEmbeddingBackend requires --embed-url")
        if batch_size < 1:
            # A negative step would make embed() silently return no vectors.
            raise ValueError("HttpEmbeddingBackend batch_size must be >= 1")
        self.url = url
        self.model = model
        self.timeout = timeout
        self.batch_size = batch_size

    def embed(self, texts: list[str]) -> list[list[float]]:
        """POST *texts* to the endpoint in batches and return one vector
        per input, in input order.

        Raises:
            ValueError: if a response does not carry exactly one embedding
                per input of the batch. Returning a shorter list would
                silently pair vectors with the wrong entities downstream.
        """
        out: list[list[float]] = []
        for i in range(0, len(texts), self.batch_size):
            batch = texts[i:i + self.batch_size]
            body: dict = {"input": batch}
            if self.model:
                body["model"] = self.model
            req = urllib.request.Request(
                self.url,
                data=json.dumps(body).encode(),
                headers={"content-type": "application/json"},
                method="POST",
            )
            with urllib.request.urlopen(req, timeout=self.timeout) as resp:
                payload = json.loads(resp.read())
            # Restore input order from the per-item "index" field; items
            # without one keep their response order (stable sort).
            data = sorted(payload["data"], key=lambda d: d.get("index", 0))
            if len(data) != len(batch):
                raise ValueError(
                    f"embedding endpoint returned {len(data)} vectors "
                    f"for {len(batch)} inputs")
            out.extend(d["embedding"] for d in data)
        return out
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
class LocalEmbeddingBackend(EmbeddingBackend):
    """--embed-backend local. STUB: no local CPU embedding dependency is
    available (requirements are psycopg/httpx/fastapi/qdrant-client only
    — no sentence-transformers / fastembed / onnx model). Instead of
    quietly adding a heavyweight dependency to ops tooling, construction
    fails until one is introduced on purpose."""

    def __init__(self, *_args, **_kwargs) -> None:
        """Always raises NotImplementedError with operator guidance."""
        message = (
            "--embed-backend local: no local CPU embedding dependency "
            "exists in this repo (checked: compat/extractor requirements "
            "— psycopg, httpx, fastapi, qdrant-client only). Use "
            "--embed-backend http with --embed-url pointing at the "
            "engine's Qwen3 embed gateway, or add a small CPU model "
            "dependency deliberately (e.g. fastembed) in its own PR."
        )
        raise NotImplementedError(message)
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def cosine(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of two equal-length vectors.

    Returns 0.0 when either vector has zero magnitude (this includes two
    empty vectors), avoiding a division by zero.

    Raises:
        ValueError: if the vectors differ in length. zip() would otherwise
            silently truncate to the shorter one and produce a similarity
            that does not describe either embedding.
    """
    if len(a) != len(b):
        raise ValueError(
            f"cosine: vector length mismatch ({len(a)} != {len(b)})")
    dot = sum(x * y for x, y in zip(a, b))
    na = math.sqrt(sum(x * x for x in a))
    nb = math.sqrt(sum(y * y for y in b))
    if na == 0.0 or nb == 0.0:
        return 0.0
    return dot / (na * nb)
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
def embedding_bundle(e: Entity, fact_statements: list[str]) -> str:
    """Build the single text that gets embedded for an entity: a 'name:'
    line, an 'aliases:' line (only when aliases exist, joined with '; '),
    then one 'fact:' line per statement, newline-separated."""
    parts = [f"name: {e.canonical_name}"]
    if e.aliases:
        parts.append("aliases: " + "; ".join(e.aliases))
    parts.extend(f"fact: {statement}" for statement in fact_statements)
    return "\n".join(parts)
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def route_band(similarity: float,
               high: float = HIGH_THRESHOLD,
               low: float = LOW_THRESHOLD) -> str:
    """Map a similarity score to its band: 'high' when >= *high*,
    'ambiguous' when >= *low* (and below *high*), otherwise 'drop'."""
    # Checked from the strictest floor down; the first one met wins.
    for floor, label in ((high, "high"), (low, "ambiguous")):
        if similarity >= floor:
            return label
    return "drop"
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
def load_fact_statements(conn, arena: str, entity_id: str,
                         top_k: int = DEFAULT_TOP_K_FACTS) -> list[str]:
    """Fetch up to *top_k* fact statements in which the entity is subject
    or object, restricted to *arena*, using FACTS_FOR_ENTITY_SQL. Returns
    the first column of each row."""
    # entity_id is bound twice: once for subject, once for object.
    params = (arena, entity_id, entity_id, top_k)
    with conn.cursor() as cur:
        cur.execute(FACTS_FOR_ENTITY_SQL, params)
        rows = cur.fetchall()
    return [row[0] for row in rows]
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
# ----------------------------------------------------------------------
|
|
417
|
+
# 3. LLM adjudication of the ambiguous band
|
|
418
|
+
# ----------------------------------------------------------------------
|
|
419
|
+
|
|
420
|
+
ADJUDICATION_PROMPT = """\
|
|
421
|
+
You are adjudicating whether two database entity records refer to the \
|
|
422
|
+
SAME real person. Be conservative: a false merge corrupts the knowledge \
|
|
423
|
+
graph and is far worse than leaving a duplicate.
|
|
424
|
+
|
|
425
|
+
Entity A:
|
|
426
|
+
canonical name: {a_name}
|
|
427
|
+
aliases: {a_aliases}
|
|
428
|
+
sample facts:
|
|
429
|
+
{a_facts}
|
|
430
|
+
|
|
431
|
+
Entity B:
|
|
432
|
+
canonical name: {b_name}
|
|
433
|
+
aliases: {b_aliases}
|
|
434
|
+
sample facts:
|
|
435
|
+
{b_facts}
|
|
436
|
+
|
|
437
|
+
Answer with STRICT JSON only, no prose, exactly this shape:
|
|
438
|
+
{{"same_person": "yes" | "no" | "unsure", "reason": "<one sentence>"}}
|
|
439
|
+
|
|
440
|
+
Rules:
|
|
441
|
+
- "yes" only if the names are plausibly the same person AND nothing in \
|
|
442
|
+
the facts contradicts it.
|
|
443
|
+
- Different people who merely share a first name (e.g. "Johann" vs \
|
|
444
|
+
"Johanna") are "no".
|
|
445
|
+
- If the evidence is thin or conflicting, answer "unsure".
|
|
446
|
+
"""
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
@dataclass
class Adjudication:
    """Outcome of adjudicating one candidate pair."""

    # Verdict: 'yes' | 'no' | 'unsure'.
    same_person: str
    # Short justification for the verdict.
    reason: str
|
|
453
|
+
|
|
454
|
+
|
|
455
|
+
class Adjudicator:
    """Interface for deciding whether two entities are the same person."""

    def adjudicate(self, a: Entity, a_facts: list[str],
                   b: Entity, b_facts: list[str]) -> Adjudication:  # pragma: no cover
        """Return a verdict for the pair; concrete adjudicators override."""
        raise NotImplementedError
|
|
459
|
+
|
|
460
|
+
|
|
461
|
+
class NoLLMAdjudicator(Adjudicator):
    """Adjudicator for --no-llm / offline runs: never calls a model and
    answers 'unsure' for every pair, which sends the pair to human review
    ('unsure' never merges)."""

    def adjudicate(self, a, a_facts, b, b_facts) -> Adjudication:
        """Ignore the inputs and return a fixed 'unsure' verdict."""
        explanation = ("LLM adjudication disabled (--no-llm); "
                       "routed to human review")
        return Adjudication("unsure", explanation)
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
class AnthropicAdjudicator(Adjudicator):
    """Anthropic API via env ANTHROPIC_API_KEY; model env-pinned
    ($ENTITY_RESOLUTION_LLM_MODEL, default claude-haiku-4-5);
    temperature 0. Strict-JSON response; any transport, decode or
    payload-shape failure degrades to 'unsure' (never merges)."""

    def __init__(self, api_key: str | None = None,
                 model: str | None = None, timeout: float = 60.0) -> None:
        """Resolve the API key and model.

        Explicit arguments win over the environment.

        Raises:
            ValueError: if no API key is given and ANTHROPIC_API_KEY is
                unset or empty.
        """
        self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
        if not self.api_key:
            raise ValueError(
                "ANTHROPIC_API_KEY not set — use --no-llm to route the "
                "ambiguous band to human review instead.")
        self.model = model or os.environ.get(LLM_MODEL_ENV, LLM_MODEL_DEFAULT)
        self.timeout = timeout

    @staticmethod
    def _fmt_facts(facts: list[str]) -> str:
        """Render the first ADJUDICATION_SAMPLE_FACTS facts as an indented
        bullet list, or a placeholder line when there are none."""
        sample = facts[:ADJUDICATION_SAMPLE_FACTS]
        if not sample:
            return "    (no facts recorded)"
        return "\n".join(f"    - {s}" for s in sample)

    def adjudicate(self, a, a_facts, b, b_facts) -> Adjudication:
        """Ask the model whether *a* and *b* are the same person.

        Never raises for request or response problems: those are returned
        as an 'unsure' Adjudication whose reason carries the error text.
        """
        # Function-scope import: only this method needs the exception type.
        import http.client

        prompt = ADJUDICATION_PROMPT.format(
            a_name=a.canonical_name, a_aliases=", ".join(a.aliases) or "(none)",
            a_facts=self._fmt_facts(a_facts),
            b_name=b.canonical_name, b_aliases=", ".join(b.aliases) or "(none)",
            b_facts=self._fmt_facts(b_facts),
        )
        body = {
            "model": self.model,
            "max_tokens": 200,
            "temperature": 0,
            "messages": [{"role": "user", "content": prompt}],
        }
        req = urllib.request.Request(
            ANTHROPIC_URL,
            data=json.dumps(body).encode(),
            headers={
                "x-api-key": self.api_key,
                "anthropic-version": ANTHROPIC_VERSION,
                "content-type": "application/json",
            },
            method="POST",
        )
        try:
            with urllib.request.urlopen(req, timeout=self.timeout) as resp:
                payload = json.loads(resp.read())
            # Concatenate every text block of the response.
            text = "".join(
                blk.get("text", "") for blk in payload.get("content", [])
                if blk.get("type") == "text"
            )
            return parse_adjudication(text)
        except (
            OSError,                    # URLError/HTTPError, timeouts, resets
            http.client.HTTPException,  # e.g. IncompleteRead while reading
            ValueError,                 # JSONDecodeError, UnicodeDecodeError
            KeyError,
            AttributeError,             # payload / blocks are not dicts
            TypeError,                  # "content" is not iterable, etc.
        ) as e:
            return Adjudication("unsure", f"adjudication call failed: {e}")
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
def parse_adjudication(text: str) -> Adjudication:
    """Parse the model's reply into an Adjudication.

    Expects strict JSON {"same_person": yes|no|unsure, "reason": ...}.
    The span from the first '{' to the last '}' is parsed, so surrounding
    prose or code fences are tolerated. Anything malformed, or a verdict
    outside the three allowed values, yields 'unsure' (never merges).

    The reason is stringified and capped at 500 characters; a missing or
    JSON-null reason becomes '' rather than the literal text 'None'.
    """
    m = re.search(r"\{.*\}", text, re.DOTALL)
    if not m:
        return Adjudication("unsure", f"non-JSON adjudication output: {text[:120]!r}")
    try:
        obj = json.loads(m.group(0))
    except json.JSONDecodeError:
        return Adjudication("unsure", f"unparseable adjudication JSON: {text[:120]!r}")
    verdict = obj.get("same_person")
    if verdict not in ("yes", "no", "unsure"):
        return Adjudication("unsure", f"invalid same_person value: {verdict!r}")
    reason = obj.get("reason")
    return Adjudication(verdict, "" if reason is None else str(reason)[:500])
|
|
543
|
+
|
|
544
|
+
|
|
545
|
+
# ----------------------------------------------------------------------
|
|
546
|
+
# 4. Pair routing: thresholds + adjudication + bare-first-name policy
|
|
547
|
+
# ----------------------------------------------------------------------
|
|
548
|
+
|
|
549
|
+
@dataclass
class RoutedPairs:
    """The three destinations a candidate pair can be routed to."""

    # Accepted for merging.
    merge: list[CandidatePair] = field(default_factory=list)
    # Left for a human to decide.
    human_review: list[CandidatePair] = field(default_factory=list)
    # Rejected: below threshold or adjudicated 'no'.
    dropped: list[CandidatePair] = field(default_factory=list)
|
|
554
|
+
|
|
555
|
+
|
|
556
|
+
def _adjudicate_and_file(
    pair: CandidatePair,
    adjudicator: Adjudicator,
    facts_by_entity: dict[str, list[str]],
    routed: RoutedPairs,
) -> None:
    """Adjudicate one pair, record verdict/reason on it, and file it:
    'yes' → merge, 'no' → dropped, anything else → human review."""
    adj = adjudicator.adjudicate(
        pair.a, facts_by_entity.get(pair.a.id, []),
        pair.b, facts_by_entity.get(pair.b.id, []))
    pair.verdict, pair.reason = adj.same_person, adj.reason
    if adj.same_person == "yes":
        routed.merge.append(pair)
    elif adj.same_person == "no":
        routed.dropped.append(pair)
    else:
        routed.human_review.append(pair)


def route_pairs(
    pairs: list[CandidatePair],
    adjudicator: Adjudicator,
    facts_by_entity: dict[str, list[str]],
    high: float = HIGH_THRESHOLD,
    low: float = LOW_THRESHOLD,
) -> RoutedPairs:
    """Each pair must already carry .similarity. Routing:

    - sim < low                      → dropped
    - bare-name involved             → ALWAYS adjudicated, and only when
      every bare entity of the pair has exactly one non-dropped
      candidate; otherwise the pair goes to human review unadjudicated
    - sim >= high, no bare name      → merge (high-confidence, 'auto')
    - low <= sim < high              → adjudicate: yes → merge,
      no → dropped, unsure → human review

    Sets .band on every pair and .verdict/.reason on every non-dropped
    pair. Pairs are processed in input order.

    Raises:
        AssertionError: if any pair has similarity None. Raised
            explicitly so the check survives ``python -O``; checked for
            all pairs before any pair is routed.
    """
    routed = RoutedPairs()

    # Pass 1: band + bare ids per pair, and the bare-name candidate-count
    # map (policy: exactly one candidate, counted over NON-dropped pairs).
    # Computed once here so pass 2 does not re-derive them.
    bare_candidates: dict[str, int] = defaultdict(int)
    plan: list[tuple[CandidatePair, str, list[str]]] = []
    for p in pairs:
        if p.similarity is None:
            raise AssertionError("route_pairs needs scored pairs")
        band = route_band(p.similarity, high, low)
        bare_ids = ([] if band == "drop"
                    else [e.id for e in (p.a, p.b) if is_bare_name(e)])
        for eid in bare_ids:
            bare_candidates[eid] += 1
        plan.append((p, band, bare_ids))

    # Pass 2: route. .band is assigned here (not in pass 1) so a failure
    # part-way through leaves later pairs untouched.
    for p, band, bare_ids in plan:
        p.band = band
        if band == "drop":
            routed.dropped.append(p)
        elif bare_ids and any(bare_candidates[eid] != 1 for eid in bare_ids):
            # Bare-first-name policy: single-token entities merge only
            # if exactly one candidate in block AND adjudication = yes.
            p.verdict = "unsure"
            p.reason = ("bare-first-name policy: entity has more than "
                        "one candidate in its blocks; left unmerged")
            routed.human_review.append(p)
        elif not bare_ids and band == "high":
            p.verdict = "auto"
            p.reason = (f"cosine similarity {p.similarity:.3f} >= "
                        f"high-confidence threshold {high}")
            routed.merge.append(p)
        else:
            # Bare-name pair with a unique candidate, or ambiguous band.
            _adjudicate_and_file(p, adjudicator, facts_by_entity, routed)

    return routed
|
|
633
|
+
|
|
634
|
+
|
|
635
|
+
def pairs_to_proposals(merge_pairs: list[CandidatePair]) -> list[MergeProposal]:
    """Turn accepted pairs into merge proposals.

    Pairs are unioned into connected groups (union-find with path
    halving). Within each group of two or more entities the canonical row
    is the first under the ordering: most facts, then most relationships,
    then most provenance events, then smallest id; the rest become
    deprecated. Every proposal carries signal 'embedding_llm'.
    """
    root_of: dict[str, str] = {}
    by_id: dict[str, Entity] = {}

    def find(node: str) -> str:
        # Path halving: point each visited node at its grandparent.
        while root_of[node] != node:
            root_of[node] = root_of[root_of[node]]
            node = root_of[node]
        return node

    for pair in merge_pairs:
        for member in (pair.a, pair.b):
            root_of.setdefault(member.id, member.id)
            by_id[member.id] = member
        left_root, right_root = find(pair.a.id), find(pair.b.id)
        if left_root != right_root:
            root_of[left_root] = right_root

    clusters: dict[str, list[Entity]] = defaultdict(list)
    for entity_id in root_of:
        clusters[find(entity_id)].append(by_id[entity_id])

    def richness(ent: Entity):
        return (-ent.fact_count, -ent.rel_count,
                -len(ent.provenance_event_ids), ent.id)

    ranked_groups = (
        sorted(members, key=richness)
        for members in clusters.values()
        if len(members) >= 2
    )
    return [
        MergeProposal(
            canonical=ranked[0],
            deprecated=ranked[1:],
            signal="embedding_llm",
        )
        for ranked in ranked_groups
    ]
|
|
674
|
+
|
|
675
|
+
|
|
676
|
+
# ----------------------------------------------------------------------
|
|
677
|
+
# 5. Tiered report (JSONL + markdown)
|
|
678
|
+
# ----------------------------------------------------------------------
|
|
679
|
+
|
|
680
|
+
def _proposal_record(p: MergeProposal, tier: str,
|
|
681
|
+
evidence: list[dict] | None = None) -> dict:
|
|
682
|
+
return {
|
|
683
|
+
"type": "merge_proposal",
|
|
684
|
+
"tier": tier,
|
|
685
|
+
"signal": p.signal,
|
|
686
|
+
"canonical": {
|
|
687
|
+
"id": p.canonical.id,
|
|
688
|
+
"canonical_name": p.canonical.canonical_name,
|
|
689
|
+
"fact_count": p.canonical.fact_count,
|
|
690
|
+
"rel_count": p.canonical.rel_count,
|
|
691
|
+
},
|
|
692
|
+
"deprecated": [
|
|
693
|
+
{
|
|
694
|
+
"id": d.id,
|
|
695
|
+
"canonical_name": d.canonical_name,
|
|
696
|
+
"aliases": d.aliases,
|
|
697
|
+
"fact_count": d.fact_count,
|
|
698
|
+
"rel_count": d.rel_count,
|
|
699
|
+
} for d in p.deprecated
|
|
700
|
+
],
|
|
701
|
+
"evidence": evidence or [],
|
|
702
|
+
}
|
|
703
|
+
|
|
704
|
+
|
|
705
|
+
def _pair_record(p: CandidatePair, record_type: str) -> dict:
|
|
706
|
+
return {
|
|
707
|
+
"type": record_type,
|
|
708
|
+
"a": {"id": p.a.id, "canonical_name": p.a.canonical_name},
|
|
709
|
+
"b": {"id": p.b.id, "canonical_name": p.b.canonical_name},
|
|
710
|
+
"similarity": p.similarity,
|
|
711
|
+
"band": p.band,
|
|
712
|
+
"verdict": p.verdict,
|
|
713
|
+
"reason": p.reason,
|
|
714
|
+
"shared_block_keys": sorted(p.shared_keys)[:10],
|
|
715
|
+
}
|
|
716
|
+
|
|
717
|
+
|
|
718
|
+
def build_report_records(
    arena: str,
    v1_proposals: list[MergeProposal],
    v2_proposals: list[MergeProposal],
    routed: RoutedPairs,
    before_count: int,
    heuristic_proposals: list[MergeProposal] | None = None,
) -> list[dict]:
    """Assemble the tiered report as a list of JSONL-ready records.

    The first record is a header (arena, generation time, tier list and
    before/after entity counts). Merge proposals follow in tier order:
    co_occurrence > alias_overlap > embedding_llm > heuristic. Proposals
    in the embedding_llm tier also carry the accepted pair records whose
    two ids both belong to the proposal. Human-review pairs and then
    dropped pairs close the list.
    """
    heuristics = heuristic_proposals or []

    # Accepted pairs, indexed by their key, for attaching as evidence.
    evidence_by_pair: dict[frozenset, dict] = {}
    for pair in routed.merge:
        evidence_by_pair[pair.key] = _pair_record(pair, "evidence")

    deprecated_total = 0
    for proposal in (*v1_proposals, *v2_proposals, *heuristics):
        deprecated_total += len(proposal.deprecated)

    header = {
        "type": "header",
        "arena": arena,
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "tiers": list(TIER_ORDER),
        "entity_count_before": before_count,
        "entity_count_after_if_applied": before_count - deprecated_total,
        "other_arenas": "untouched — every SQL statement is arena-scoped "
                        "(programmatically asserted; dry-run sessions are "
                        "READ ONLY at the Postgres level)",
    }
    records: list[dict] = [header]

    tiered: list[tuple[str, list[MergeProposal]]] = [
        ("co_occurrence",
         [p for p in v1_proposals if p.signal == "co_occurrence"]),
        ("alias_overlap",
         [p for p in v1_proposals if p.signal == "alias_overlap"]),
        ("embedding_llm", v2_proposals),
        ("heuristic", heuristics),
    ]
    for tier, tier_proposals in tiered:
        attach_evidence = tier == "embedding_llm"
        for proposal in tier_proposals:
            supporting = []
            if attach_evidence:
                member_ids = {
                    proposal.canonical.id,
                    *(d.id for d in proposal.deprecated),
                }
                # Keep the pairs that lie entirely inside this group.
                supporting = [
                    rec for key, rec in evidence_by_pair.items()
                    if key <= member_ids
                ]
            records.append(_proposal_record(proposal, tier, supporting))

    records.extend(
        _pair_record(pair, "human_review") for pair in routed.human_review
    )
    records.extend(_pair_record(pair, "dropped") for pair in routed.dropped)
    return records
|
|
768
|
+
|
|
769
|
+
|
|
770
|
+
def write_jsonl(records: list[dict], path: str) -> None:
    """Write `records` to `path` as JSON Lines (one JSON object per line).

    The file is truncated first; an empty `records` list yields an empty
    file.
    """
    with open(path, "w") as sink:
        sink.writelines(f"{json.dumps(record)}\n" for record in records)
|
|
774
|
+
|
|
775
|
+
|
|
776
|
+
def write_markdown_summary(records: list[dict], path: str) -> None:
    """Render the report records as a markdown summary at `path`.

    `records` must be the list produced for the JSONL report, with the
    header record first. The summary lists per-tier proposal counts, each
    proposal with its evidence lines, the pairs routed to human review,
    the number of dropped pairs, and the note about other arenas.

    The file is always written as UTF-8: the summary contains non-ASCII
    punctuation ("—", "↔"), which the locale default encoding cannot
    represent on every platform.
    """
    header = records[0]
    proposals = [r for r in records if r["type"] == "merge_proposal"]
    review = [r for r in records if r["type"] == "human_review"]
    dropped = [r for r in records if r["type"] == "dropped"]

    lines = [
        "# Entity-resolution v2 — merge-candidate report",
        "",
        f"- **Arena:** `{header['arena']}` (one arena at a time; required `--arena`)",
        f"- **Generated:** {header['generated_at']}",
        f"- **Entities before:** {header['entity_count_before']}",
        f"- **Entities after (if every proposal applied):** "
        f"{header['entity_count_after_if_applied']}",
        "",
        "## Proposals by tier",
        "",
        "| tier | proposals | rows deprecated |",
        "|---|---:|---:|",
    ]
    for tier in TIER_ORDER:
        tier_props = [p for p in proposals if p["tier"] == tier]
        lines.append(f"| {tier} | {len(tier_props)} | "
                     f"{sum(len(p['deprecated']) for p in tier_props)} |")
    lines += ["", "## Proposals", ""]
    for p in proposals:
        deps = ", ".join(f"`{d['canonical_name']}`" for d in p["deprecated"])
        lines.append(f"- **[{p['tier']}]** `{p['canonical']['canonical_name']}` "
                     f"absorbs {deps}")
        for ev in p.get("evidence", []):
            # Similarity may be absent when a pair was never scored.
            sim = f"{ev['similarity']:.3f}" if ev.get("similarity") is not None else "n/a"
            lines.append(f" - sim={sim} verdict={ev.get('verdict')} — "
                         f"{ev.get('reason')}")
    lines += ["", f"## Human review ({len(review)})", ""]
    if not review:
        lines.append("(none)")
    for r in review:
        sim = f"{r['similarity']:.3f}" if r.get("similarity") is not None else "n/a"
        lines.append(f"- `{r['a']['canonical_name']}` ↔ "
                     f"`{r['b']['canonical_name']}` (sim={sim}) — {r.get('reason')}")
    lines += ["", f"## Dropped pairs: {len(dropped)}", ""]
    lines += [
        "## Other arenas",
        "",
        header["other_arenas"],
        "",
        "The pip-agents arena (LEGACY/FROZEN) is untouchable by "
        "construction: no statement in this tool can address it unless "
        "an operator passes `--arena pip-agents`, which the runbook "
        "forbids.",
    ]
    # Explicit encoding: without it the write fails with UnicodeEncodeError
    # wherever the locale encoding lacks "↔" or "—".
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines) + "\n")
|
|
829
|
+
|
|
830
|
+
|
|
831
|
+
# ----------------------------------------------------------------------
|
|
832
|
+
# CLI
|
|
833
|
+
# ----------------------------------------------------------------------
|
|
834
|
+
|
|
835
|
+
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Build the command-line parser and parse `argv`.

    `argv=None` lets argparse read the process arguments. The module
    docstring is used verbatim as the help description.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # --- target and connection ---
    parser.add_argument(
        "--arena", required=True,
        help="arena id to resolve (REQUIRED; one at a time; "
             "every SQL statement is scoped to it)")
    parser.add_argument(
        "--pg-dsn", default=os.environ.get("PG_DSN"),
        help="postgres DSN; defaults to $PG_DSN")
    parser.add_argument(
        "--entity-type", default="person",
        help="entity type to resolve (default: person)")

    # --- write gate ---
    parser.add_argument(
        "--apply", action="store_true",
        help="actually write merges; default is dry-run "
             "(READ ONLY session). Requires --i-have-a-snapshot.")
    parser.add_argument(
        "--i-have-a-snapshot", action="store_true",
        help="operator acknowledgment that a pg_dump snapshot "
             "of org-model exists. --apply is refused without it.")

    # --- embedding and adjudication backends ---
    parser.add_argument(
        "--embed-url", default=None,
        help="OpenAI-compatible /v1/embeddings endpoint "
             "(the engine's Qwen3 embed gateway). Required for "
             "the http backend; never hardcoded.")
    parser.add_argument(
        "--embed-backend", choices=("http", "local"), default="http",
        help="embedding backend (default http; 'local' is a "
             "stub — no suitable CPU dep in repo)")
    parser.add_argument(
        "--embed-model", default=None,
        help="model name to pass to the embeddings endpoint "
             "(optional; gateway default if omitted)")
    parser.add_argument(
        "--no-llm", action="store_true",
        help="offline mode: route the whole ambiguous band to "
             "human review instead of LLM adjudication")

    # --- tunables (no help text, defaults come from module constants) ---
    tunables = (
        ("--high-threshold", float, HIGH_THRESHOLD),
        ("--low-threshold", float, LOW_THRESHOLD),
        ("--top-k-facts", int, DEFAULT_TOP_K_FACTS),
        ("--max-block", int, DEFAULT_MAX_BLOCK),
    )
    for flag, converter, fallback in tunables:
        parser.add_argument(flag, type=converter, default=fallback)

    # --- tiers, outputs, audit ---
    parser.add_argument(
        "--heuristic-merge", action="store_true",
        help="also include v1's heuristic tier (off by default)")
    parser.add_argument(
        "--out", default=None,
        help="tiered merge-candidate report JSONL path")
    parser.add_argument(
        "--summary", default=None,
        help="markdown summary path (default: <out>.md)")
    parser.add_argument(
        "--merged-by", default=None,
        help="audit tag (default: resolution-v2-YYYY-MM)")

    return parser.parse_args(argv)
|
|
880
|
+
|
|
881
|
+
|
|
882
|
+
def validate_args(args: argparse.Namespace) -> str | None:
    """Check parsed arguments; return an error message or None when valid.

    This runs BEFORE any connection is opened, so --apply is structurally
    refused without the snapshot acknowledgment. Checks are made in a
    fixed order and the first failure wins.
    """
    missing_ack = args.apply and not args.i_have_a_snapshot
    if missing_ack:
        return ("--apply requires --i-have-a-snapshot (operator "
                "acknowledgment that a pg_dump snapshot of org-model "
                "exists — see the memory clean-rebuild runbook). Refusing.")

    if not args.pg_dsn:
        return "--pg-dsn (or $PG_DSN) required"

    uses_http = args.embed_backend == "http"
    if uses_http and not args.embed_url:
        return ("--embed-url required for the http embedding backend "
                "(the endpoint is never hardcoded)")

    low, high = args.low_threshold, args.high_threshold
    thresholds_ok = 0.0 <= low <= high <= 1.0
    if not thresholds_ok:
        return "--low-threshold must be <= --high-threshold, both in [0,1]"

    return None
|
|
898
|
+
|
|
899
|
+
|
|
900
|
+
def _entity_merges_check_allows(conn, value: str) -> bool:
    """Report whether the entity_merges merge_signal CHECK admits `value`.

    Runs ENTITY_MERGES_CHECKDEF_SQL and returns True when `value` occurs
    as a substring of the first column of any returned row. v2's
    'embedding_llm' needs the draft migration in
    resolution-queue-design.md applied first.
    """
    with conn.cursor() as cursor:
        cursor.execute(ENTITY_MERGES_CHECKDEF_SQL)
        definitions = [row[0] for row in cursor.fetchall()]

    for definition in definitions:
        if value in definition:
            return True
    return False
|
|
908
|
+
|
|
909
|
+
|
|
910
|
+
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: build the tiered merge report and optionally apply it.

    Steps: validate arguments, construct the embedding and adjudication
    backends, load the arena's entities, run v1's tiers, generate and score
    candidate pairs, route them, write the JSONL report and markdown
    summary, and (only with --apply) execute the merges.

    Returns the process exit code: 2 for argument, dependency or backend
    configuration errors; 0 after a dry-run or a fully successful apply;
    1 when at least one merge failed during apply.
    """
    args = parse_args(argv)
    err = validate_args(args)
    if err:
        print(f"error: {err}", file=sys.stderr)
        return 2

    if psycopg is None:
        print("error: psycopg is required to run this script "
              "(pip install 'psycopg[binary]')", file=sys.stderr)
        return 2

    assert_arena_scoped()

    merged_by = args.merged_by or \
        f"resolution-v2-{datetime.now(timezone.utc):%Y-%m}"

    # Backends (constructed before connecting so misconfig fails fast).
    try:
        if args.embed_backend == "local":
            backend: EmbeddingBackend = LocalEmbeddingBackend()
        else:
            backend = HttpEmbeddingBackend(args.embed_url, args.embed_model)
        adjudicator: Adjudicator = (
            NoLLMAdjudicator() if args.no_llm else AnthropicAdjudicator()
        )
    except (NotImplementedError, ValueError) as e:
        print(f"error: {e}", file=sys.stderr)
        return 2

    with psycopg.connect(args.pg_dsn, autocommit=False) as conn:
        if not args.apply:
            # Structural dry-run: the session cannot write.
            conn.execute("SET default_transaction_read_only = on")
            # The setting only governs transactions that START after it is
            # in effect. With autocommit off, the SET itself opened a
            # (read-write) transaction, so end it here; every statement
            # below then runs in a fresh READ ONLY transaction.
            conn.commit()
        print(f"[resolution-v2] arena={args.arena} type={args.entity_type} "
              f"apply={args.apply} no_llm={args.no_llm}")

        entities = v1.load_entities(conn, args.arena, args.entity_type)
        before_count = len(entities)
        print(f"[resolution-v2] loaded {before_count} {args.entity_type} "
              f"entities (arena-scoped)")

        # ---- Tier 1+2 (and optional 4): v1's machinery, unchanged ----
        cooc = v1.collect_cooccurrence_pairs(conn, args.arena) \
            if args.entity_type == "person" else set()
        v1_all = v1.build_proposals(entities, cooc, args.heuristic_merge)
        v1_proposals = [p for p in v1_all
                        if p.signal in ("co_occurrence", "alias_overlap")]
        heuristic_proposals = [p for p in v1_all if p.signal == "heuristic"]
        # Every unordered id pair already covered by a v1 proposal, so
        # blocking does not propose it again.
        already = set()
        for p in v1_all:
            ids = [p.canonical.id, *(d.id for d in p.deprecated)]
            for i, first_id in enumerate(ids):
                for second_id in ids[i + 1:]:
                    already.add(frozenset({first_id, second_id}))
        print(f"[resolution-v2] v1 tiers: {len(v1_proposals)} proposals "
              f"(+{len(heuristic_proposals)} heuristic)")

        # ---- Blocking ------------------------------------------------
        pairs = generate_candidate_pairs(entities, already, args.max_block)
        print(f"[resolution-v2] blocking generated {len(pairs)} candidate "
              f"pairs not covered by v1 signals")

        # ---- Embedding similarity -------------------------------------
        ids_needed = sorted({e.id for p in pairs for e in (p.a, p.b)})
        facts_by_entity = {
            eid: load_fact_statements(conn, args.arena, eid, args.top_k_facts)
            for eid in ids_needed
        }
        ents_by_id = {e.id: e for e in entities}
        bundles = [embedding_bundle(ents_by_id[eid], facts_by_entity[eid])
                   for eid in ids_needed]
        # Skip the backend call entirely when there is nothing to embed.
        vectors = dict(zip(ids_needed, backend.embed(bundles))) \
            if bundles else {}
        for p in pairs:
            p.similarity = cosine(vectors[p.a.id], vectors[p.b.id])

        # ---- Routing: thresholds + adjudication + bare-name policy ----
        routed = route_pairs(pairs, adjudicator, facts_by_entity,
                             args.high_threshold, args.low_threshold)
        v2_proposals = pairs_to_proposals(routed.merge)
        print(f"[resolution-v2] embedding+llm tier: "
              f"{len(v2_proposals)} proposals; "
              f"{len(routed.human_review)} pairs → human review; "
              f"{len(routed.dropped)} dropped")

        # ---- Report ----------------------------------------------------
        records = build_report_records(
            args.arena, v1_proposals, v2_proposals, routed,
            before_count, heuristic_proposals)
        out = args.out or f"/tmp/entity_resolution_v2_{args.arena}.jsonl"
        write_jsonl(records, out)
        summary = args.summary or (out.rsplit(".", 1)[0] + ".md")
        write_markdown_summary(records, summary)
        print(f"[resolution-v2] report → {out}\n"
              f"[resolution-v2] summary → {summary}")

        if not args.apply:
            print("[resolution-v2] dry-run only (session was READ ONLY); "
                  "pass --apply --i-have-a-snapshot to execute")
            return 0

        # ---- Apply (gated) ---------------------------------------------
        to_apply: list[MergeProposal] = [*v1_proposals, *heuristic_proposals]
        skipped_v2 = 0
        if v2_proposals:
            # Only apply the v2 tier when the audit table can record its
            # signal value; otherwise skip it and say so.
            if _entity_merges_check_allows(conn, "embedding_llm"):
                to_apply.extend(v2_proposals)
            else:
                skipped_v2 = len(v2_proposals)
                print("[resolution-v2] WARNING: entity_merges merge_signal "
                      "CHECK does not admit 'embedding_llm' — the draft "
                      "migration in resolution-queue-design.md must be "
                      "applied first. Skipping the embedding_llm tier "
                      f"({skipped_v2} proposals) to keep the audit honest.",
                      file=sys.stderr)

        succeeded, failed, errors = v1.apply_proposals(
            conn, args.arena, to_apply, merged_by)
        conn.commit()
        with conn.cursor() as cur:
            cur.execute(ENTITY_COUNT_SQL, (args.arena, args.entity_type))
            after = cur.fetchone()[0]
        print(f"[resolution-v2] applied: {succeeded} succeeded, "
              f"{failed} failed; entities {before_count} → {after}")
        # Cap the error listing at the first 20 entries.
        for e in errors[:20]:
            print(f" ERR: {e}")
        return 1 if failed else 0
|
|
1038
|
+
|
|
1039
|
+
|
|
1040
|
+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|