@pentatonic-ai/ai-agent-sdk 0.10.4 → 0.10.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1041 @@
1
+ #!/usr/bin/env python3
2
+ """Entity-resolution v2: blocking + embedding similarity + LLM
3
+ adjudication. Roadmap BET 1b — DRY-RUN TOOLING.
4
+
5
+ What v1 (backfill_entity_reconciliation.py) covers: variants that share
6
+ an exact normalized surface form (alias overlap) or were paired by a
7
+ name↔email co-occurrence in an event. What it does NOT cover — and
8
+ this script adds:
9
+
10
+ - "Johann_Boedecker" (underscore separator)
11
+ - "Bödecker, Johann" (diacritic + comma inversion)
12
+ - "Johann" (bare first name)
13
+ - "Johanna Phil" (near-miss that must NOT merge)
14
+
15
+ Pipeline (within ONE arena, person entities):
16
+
17
+ 1. v1 signals first — co_occurrence + alias_overlap proposals are
18
+ IMPORTED from v1's machinery, unchanged. They outrank everything.
19
+ 2. BLOCKING — candidate generation only, NEVER identity. Punctuation/
20
+ underscore strip, diacritic fold (NFD + strip combining), token
21
+ sort, char-trigram buckets. Block keys: first-token, last-token,
22
+ email local-part, trigram bucket — within (arena, entity_type).
23
+ 3. EMBEDDING similarity within blocks (pluggable backend; the HTTP
24
+ backend hits an OpenAI-compatible /v1/embeddings endpoint given
25
+ via --embed-url — never hardcoded, never called from tests).
26
+ Cosine ≥ 0.92 → high-confidence; 0.75–0.92 → ambiguous band;
27
+ < 0.75 → drop.
28
+ 4. LLM ADJUDICATION of the ambiguous band (Anthropic API, env
29
+ ANTHROPIC_API_KEY, model env-pinned, temperature 0). Strict JSON
30
+ {"same_person": "yes"|"no"|"unsure", "reason": ...}. "unsure"
31
+ NEVER merges — it lands in the human-review section. --no-llm
32
+ routes the whole band to human review.
33
+ 5. BARE-FIRST-NAME POLICY — a single-token entity merges only if it
34
+ has exactly ONE candidate in its blocks AND adjudication == yes.
35
+ 6. Tiered report (JSONL + markdown):
36
+ co_occurrence > alias_overlap > embedding_llm > heuristic.
37
+
38
+ SAFETY (the engine is shared multi-tenant infra; the pip-agents arena
39
+ is LEGACY/FROZEN and must be untouchable by construction):
40
+
41
+ - --arena is REQUIRED; every SQL statement this script issues is
42
+ arena-scoped (programmatically asserted — see ARENA_SCOPED_SQL +
43
+ assert_arena_scoped()).
44
+ - DRY-RUN BY DEFAULT. Without --apply the session is forced
45
+ READ ONLY at the Postgres level (SET default_transaction_read_only
46
+ = on) — structurally incapable of writing.
47
+ - --apply additionally REQUIRES --i-have-a-snapshot (operator
48
+ acknowledgment that a pg_dump snapshot exists). Refused otherwise,
49
+ before any connection is opened.
50
+
51
+ Usage:
52
+
53
+ python3 entity_resolution_v2.py \\
54
+ --arena <arena-id> \\
55
+ --pg-dsn postgresql://... \\
56
+ --embed-url http://<embed-gateway>/v1/embeddings \\
57
+ [--no-llm] # route ambiguous band to human review
58
+ [--out /tmp/resolution.jsonl] # tiered merge-candidate report
59
+ [--summary /tmp/resolution.md]
60
+ [--apply --i-have-a-snapshot] # write (gated; see above)
61
+
62
+ Exit codes: 0 success; 1 partial apply failure; 2 bad args / refused.
63
+ """
64
+
65
+ from __future__ import annotations
66
+
67
+ import argparse
68
+ import importlib.util
69
+ import json
70
+ import math
71
+ import os
72
+ import re
73
+ import sys
74
+ import unicodedata
75
+ import urllib.error
76
+ import urllib.request
77
+ from collections import defaultdict
78
+ from dataclasses import dataclass, field
79
+ from datetime import datetime, timezone
80
+ from pathlib import Path
81
+
82
+ # ----------------------------------------------------------------------
83
+ # Import v1's load/apply/report machinery (same dir; scripts/ is not a
84
+ # package, so load by path). v1 stays the single owner of: entity
85
+ # loading, co-occurrence collection, union-find proposal building,
86
+ # richest-row-wins canonical selection, per-proposal transactional
87
+ # apply, entity_merges audit + rollback_payload.
88
+ # ----------------------------------------------------------------------
89
+
90
_V1_PATH = Path(__file__).resolve().parent / "backfill_entity_reconciliation.py"
if "backfill_entity_reconciliation" in sys.modules:
    # Already imported under this name: reuse the loaded module object.
    v1 = sys.modules["backfill_entity_reconciliation"]
else:
    _spec = importlib.util.spec_from_file_location(
        "backfill_entity_reconciliation", _V1_PATH)
    # Explicit raise instead of `assert`: assertions are stripped under
    # `python -O`, which would turn a missing spec/loader into an opaque
    # AttributeError a few lines further down.
    if _spec is None or _spec.loader is None:
        raise ImportError(
            f"cannot build an import spec for {_V1_PATH}",
            name="backfill_entity_reconciliation", path=str(_V1_PATH))
    v1 = importlib.util.module_from_spec(_spec)
    # Register BEFORE exec: dataclass processing (py3.13+) looks the
    # defining module up in sys.modules.
    sys.modules["backfill_entity_reconciliation"] = v1
    try:
        _spec.loader.exec_module(v1)
    except BaseException:
        # Never leave a half-initialised module registered: a later
        # lookup would otherwise pick up the broken object silently.
        sys.modules.pop("backfill_entity_reconciliation", None)
        raise

# Re-export the v1 names this script uses directly.
Entity = v1.Entity
MergeProposal = v1.MergeProposal
psycopg = v1.psycopg  # None when the driver isn't installed (tests)
106
+
107
+
108
+ # ----------------------------------------------------------------------
109
+ # Thresholds & constants
110
+ # ----------------------------------------------------------------------
111
+
112
# Cosine-similarity routing thresholds (defaults of route_band / route_pairs
# and of the --high-threshold / --low-threshold CLI options).
HIGH_THRESHOLD = 0.92 # >= : high-confidence candidate
LOW_THRESHOLD = 0.75 # < : drop. [LOW, HIGH) : ambiguous band → LLM
# Default LIMIT used by load_fact_statements() when fetching facts per entity.
DEFAULT_TOP_K_FACTS = 8
DEFAULT_MAX_BLOCK = 100 # blocks bigger than this are skipped for
# pair generation (candidate gen, not recall-
# critical: v1 signals still cover them)
# Maximum number of facts per entity included in the adjudication prompt.
ADJUDICATION_SAMPLE_FACTS = 5

# Name of the environment variable that pins the adjudication model, and the
# model used when that variable is unset.
LLM_MODEL_ENV = "ENTITY_RESOLUTION_LLM_MODEL"
LLM_MODEL_DEFAULT = "claude-haiku-4-5"
# Endpoint and `anthropic-version` header value used by AnthropicAdjudicator.
ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
ANTHROPIC_VERSION = "2023-06-01"

# Report tiers, strongest signal first.
TIER_ORDER = ("co_occurrence", "alias_overlap", "embedding_llm", "heuristic")
126
+
127
+ # ----------------------------------------------------------------------
128
+ # Every SQL statement THIS script issues. All must be arena-scoped —
129
+ # assert_arena_scoped() enforces it and the unit tests assert it too.
130
+ # (v1's statements are arena-scoped at load + repoint; its by-id
131
+ # entity UPDATE/DELETE statements operate only on ids returned by the
132
+ # arena-scoped loads.)
133
+ # ----------------------------------------------------------------------
134
+
135
# Fact statements in which an entity appears as subject or object.
# Parameters, in order: arena, entity id (subject side), entity id (object
# side), row limit.
FACTS_FOR_ENTITY_SQL = """
SELECT statement FROM facts
WHERE arena = %s AND (subject_entity_id = %s OR object_entity_id = %s)
ORDER BY confidence DESC, asserted_at DESC
LIMIT %s
"""

# Number of entities of one type in one arena.
# Parameters, in order: arena, entity_type.
ENTITY_COUNT_SQL = """
SELECT COUNT(*) AS n FROM entities
WHERE arena = %s AND entity_type = %s
"""

# Reads pg_catalog constraint metadata only — no tenant rows — so it
# is exempt from the arena-scoping rule below.
ENTITY_MERGES_CHECKDEF_SQL = """
SELECT pg_get_constraintdef(oid) AS def
FROM pg_constraint
WHERE conrelid = 'entity_merges'::regclass AND contype = 'c'
"""

# Registry consumed by assert_arena_scoped(): every statement listed here
# must contain the literal predicate 'arena = %s'.
ARENA_SCOPED_SQL: dict[str, str] = {
    "facts_for_entity": FACTS_FOR_ENTITY_SQL,
    "entity_count": ENTITY_COUNT_SQL,
}
159
+
160
+
161
def assert_arena_scoped() -> None:
    """Refuse to run if a registered statement lacks an arena predicate.

    Scans ARENA_SCOPED_SQL in insertion order and raises on the first
    statement whose text does not contain the literal ``arena = %s``.

    Raises:
        AssertionError: naming the first statement that is not arena-scoped.
    """
    required = "arena = %s"
    name = next(
        (label for label, stmt in ARENA_SCOPED_SQL.items()
         if required not in stmt),
        None,
    )
    if name is None:
        return
    # Raised explicitly (not via `assert`) so the guard survives `python -O`.
    raise AssertionError(
        f"SQL '{name}' is not arena-scoped — refusing to run. "
        f"Every statement must carry 'arena = %s'."
    )
170
+
171
+
172
+ # ----------------------------------------------------------------------
173
+ # 1. Candidate-generation normalization — NEVER used for identity,
174
+ # ONLY for blocking. Identity stays with entity_id.py /
175
+ # v1._normalize_surface.
176
+ # ----------------------------------------------------------------------
177
+
178
# Matches one character that is neither a word character, whitespace nor
# '@' — plus '_' explicitly, because \w would otherwise keep it.
_PUNCT_RE = re.compile(r"[^\w\s@]|_", re.UNICODE)
# Matches a run of one or more whitespace characters.
_WS_RE = re.compile(r"\s+")
180
+
181
+
182
def fold_diacritics(s: str) -> str:
    """Strip combining marks from *s*: 'Bödecker' → 'Bodecker'.

    The string is NFD-decomposed and every character whose canonical
    combining class is non-zero is discarded. Characters without a
    decomposition are returned unchanged.
    """
    decomposed = unicodedata.normalize("NFD", s)
    base_chars = [c for c in decomposed if unicodedata.combining(c) == 0]
    return "".join(base_chars)
188
+
189
+
190
def block_normalize(s: str) -> str:
    """Return the blocking-only normal form of *s*.

    Steps, in order: NFKC normalisation (``None``/empty is treated as
    ""), diacritic folding, lowercasing, punctuation and underscores
    replaced by a space, whitespace runs collapsed, ends trimmed.

        'Johann_Boedecker' → 'johann boedecker'
        'Bödecker, Johann' → 'bodecker johann'
    """
    text = unicodedata.normalize("NFKC", s or "")
    text = fold_diacritics(text).lower()
    text = _PUNCT_RE.sub(" ", text)
    collapsed = _WS_RE.sub(" ", text)
    return collapsed.strip()
200
+
201
+
202
def block_tokens(s: str) -> list[str]:
    """Split the blocking normal form of *s* into its space-separated
    tokens; an empty normal form yields an empty list."""
    normalized = block_normalize(s)
    if not normalized:
        return []
    return normalized.split(" ")
205
+
206
+
207
def token_sort(s: str) -> str:
    """Return the blocking tokens of *s* sorted and re-joined with single
    spaces, so the result does not depend on the original token order
    ('Bödecker, Johann' and 'Johann Bödecker' give the same string)."""
    tokens = block_tokens(s)
    tokens.sort()
    return " ".join(tokens)
211
+
212
+
213
def char_trigrams(s: str) -> set[str]:
    """Character trigrams of the token-sorted form of *s* with the
    spaces removed.

    A form shorter than three characters is returned as a single
    bucket; an empty form yields an empty set. Near-identical spellings
    such as 'boedecker' / 'bodecker' share most of their trigrams.
    """
    compact = token_sort(s).replace(" ", "")
    if not compact:
        return set()
    if len(compact) < 3:
        return {compact}
    last_start = len(compact) - 3
    return {compact[start:start + 3] for start in range(last_start + 1)}
220
+
221
+
222
def _entity_surface_forms(e: Entity) -> list[str]:
    """Return the canonical name and aliases of *e*, followed by the
    local part of every email-looking form (when v1 extracts a non-empty
    one) — e.g. 'johann.boedecker@…' also contributes its local part as
    a name-ish surface form."""
    base_forms = [e.canonical_name, *e.aliases]
    local_parts = [
        v1._local_part(form)
        for form in base_forms
        if v1._looks_like_email(form)
    ]
    return base_forms + [part for part in local_parts if part]
233
+
234
+
235
def blocking_keys(e: Entity) -> set[str]:
    """Compute the block keys of one entity.

    Email-looking forms contribute only a ``local:`` key (normalised
    local part, spaces removed). Every other form with at least one
    token contributes ``first:`` / ``last:`` keys — the first and last
    token of the alphabetically SORTED token list — and one ``tri:`` key
    per character trigram.
    """
    collected: set[str] = set()
    for surface in _entity_surface_forms(e):
        if v1._looks_like_email(surface):
            local = v1._local_part(surface)
            if local:
                collected.add(
                    "local:" + block_normalize(local).replace(" ", ""))
        else:
            ordered = sorted(block_tokens(surface))
            if ordered:
                collected |= {"first:" + ordered[0], "last:" + ordered[-1]}
                collected.update(
                    "tri:" + gram for gram in char_trigrams(surface))
    return collected
253
+
254
+
255
def is_bare_name(e: Entity) -> bool:
    """Tell whether *e* is a bare-name entity such as 'Johann'.

    True when the entity has at least one non-email surface form
    (canonical name or alias) and none of those forms has more than one
    blocking token. Such entities fall under the bare-first-name policy
    (item 5 of the module docstring's pipeline).
    """
    has_name_form = False
    for form in (e.canonical_name, *e.aliases):
        if v1._looks_like_email(form):
            continue
        has_name_form = True
        if len(block_tokens(form)) > 1:
            return False
    return has_name_form
264
+
265
+
266
@dataclass
class CandidatePair:
    """Two entities proposed as possible duplicates, plus the evidence
    accumulated for the pair as it moves through the pipeline."""

    a: Entity
    b: Entity
    # Block keys the two entities have in common.
    shared_keys: set[str] = field(default_factory=set)
    # Cosine similarity of the two embedding bundles; None until scored.
    similarity: float | None = None
    band: str | None = None  # 'high' | 'ambiguous' | 'drop'
    # 'yes' | 'no' | 'unsure' | None; route_pairs also writes 'auto' for
    # high-confidence merges.
    verdict: str | None = None
    reason: str | None = None

    @property
    def key(self) -> frozenset:
        """Order-independent identity of the pair: the set of both ids."""
        member_ids = (self.a.id, self.b.id)
        return frozenset(member_ids)
279
+
280
+
281
def generate_candidate_pairs(
    entities: list[Entity],
    already_grouped: set[frozenset] | None = None,
    max_block: int = DEFAULT_MAX_BLOCK,
) -> list[CandidatePair]:
    """Blocking step: propose every pair of entities that share a block key.

    Args:
        entities: entities to pair up (the caller passes an already
            arena-scoped list).
        already_grouped: id-pairs (frozensets of two ids) that must not
            be proposed again, e.g. pairs already grouped by a v1 signal.
        max_block: blocks with more members than this produce no pairs.

    Returns:
        One CandidatePair per distinct id-pair, carrying every block key
        the two entities share, ordered by the sorted pair of ids.
    """
    excluded = already_grouped or set()

    # block key -> entities carrying that key, in input order
    buckets: dict[str, list[Entity]] = defaultdict(list)
    for entity in entities:
        for block_key in blocking_keys(entity):
            buckets[block_key].append(entity)

    found: dict[frozenset, CandidatePair] = {}
    for block_key, members in buckets.items():
        if not 2 <= len(members) <= max_block:
            continue
        for position, left in enumerate(members):
            for right in members[position + 1:]:
                pair_id = frozenset({left.id, right.id})
                # A one-element set means the same id occurred twice.
                if len(pair_id) < 2 or pair_id in excluded:
                    continue
                candidate = found.get(pair_id)
                if candidate is None:
                    candidate = found[pair_id] = CandidatePair(a=left, b=right)
                candidate.shared_keys.add(block_key)

    ordered = list(found.values())
    ordered.sort(key=lambda pair: sorted(pair.key))
    return ordered
310
+
311
+
312
+ # ----------------------------------------------------------------------
313
+ # 2. Embedding-similarity stage — pluggable backend
314
+ # ----------------------------------------------------------------------
315
+
316
class EmbeddingBackend:
    """Abstract embedding backend: ``embed(texts)`` returns one vector
    per input text. Concrete backends override ``embed``."""

    def embed(self, texts: list[str]) -> list[list[float]]:  # pragma: no cover
        """Embed *texts*; the base class always raises NotImplementedError."""
        raise NotImplementedError()
321
+
322
+
323
class HttpEmbeddingBackend(EmbeddingBackend):
    """OpenAI-compatible /v1/embeddings endpoint (the engine's Qwen3
    embed gateway). Endpoint comes from --embed-url — NEVER hardcoded.
    NOT called in tests (tests use a fake backend)."""

    def __init__(self, url: str, model: str | None = None,
                 timeout: float = 30.0, batch_size: int = 32) -> None:
        """Configure the backend.

        Args:
            url: embeddings endpoint; must be non-empty.
            model: optional model name sent in the request body.
            timeout: per-request timeout in seconds, passed to urlopen.
            batch_size: number of texts sent per request; must be >= 1.

        Raises:
            ValueError: if *url* is empty or *batch_size* is not positive.
        """
        if not url:
            raise ValueError("HttpEmbeddingBackend requires --embed-url")
        # A negative step would make embed() return no vectors at all
        # without any error; reject it up front.
        if batch_size < 1:
            raise ValueError("HttpEmbeddingBackend batch_size must be >= 1")
        self.url = url
        self.model = model
        self.timeout = timeout
        self.batch_size = batch_size

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Embed *texts* in batches and return one vector per text, in
        input order.

        Raises:
            ValueError: if the endpoint returns a different number of
                embeddings than texts were sent in a batch.
            urllib.error.URLError, KeyError, json.JSONDecodeError:
                propagated unchanged from the request / response parsing.
        """
        out: list[list[float]] = []
        for i in range(0, len(texts), self.batch_size):
            batch = texts[i:i + self.batch_size]
            body: dict = {"input": batch}
            if self.model:
                body["model"] = self.model
            req = urllib.request.Request(
                self.url,
                data=json.dumps(body).encode(),
                headers={"content-type": "application/json"},
                method="POST",
            )
            with urllib.request.urlopen(req, timeout=self.timeout) as resp:
                payload = json.loads(resp.read())
            # Restore input order using the per-item "index" field.
            data = sorted(payload["data"], key=lambda d: d.get("index", 0))
            # A short (or long) response would shift every later vector
            # onto the wrong text and corrupt all similarities computed
            # from them — fail loudly instead.
            if len(data) != len(batch):
                raise ValueError(
                    f"embedding endpoint returned {len(data)} vectors "
                    f"for a batch of {len(batch)} texts")
            out.extend(d["embedding"] for d in data)
        return out
355
+
356
+
357
class LocalEmbeddingBackend(EmbeddingBackend):
    """Placeholder for ``--embed-backend local``.

    Constructing it always raises NotImplementedError: per the message
    below, the repo ships no local CPU embedding dependency, and one is
    not pulled in implicitly by this ops tool.
    """

    def __init__(self, *_args, **_kwargs) -> None:
        explanation = (
            "--embed-backend local: no local CPU embedding dependency "
            "exists in this repo (checked: compat/extractor requirements "
            "— psycopg, httpx, fastapi, qdrant-client only). Use "
            "--embed-backend http with --embed-url pointing at the "
            "engine's Qwen3 embed gateway, or add a small CPU model "
            "dependency deliberately (e.g. fastembed) in its own PR."
        )
        raise NotImplementedError(explanation)
373
+
374
+
375
def cosine(a: list[float], b: list[float]) -> float:
    """Return the cosine similarity of vectors *a* and *b*.

    Returns 0.0 when either vector has zero norm (including two empty
    vectors).

    Raises:
        ValueError: if the vectors have different dimensions. ``zip``
            would otherwise silently truncate to the shorter one and
            yield a number that looks valid but means nothing.
    """
    if len(a) != len(b):
        raise ValueError(
            f"cosine: vector dimensions differ ({len(a)} vs {len(b)})")
    dot = sum(x * y for x, y in zip(a, b))
    na = math.sqrt(sum(x * x for x in a))
    nb = math.sqrt(sum(y * y for y in b))
    if na == 0.0 or nb == 0.0:
        return 0.0
    return dot / (na * nb)
382
+
383
+
384
def embedding_bundle(e: Entity, fact_statements: list[str]) -> str:
    """Build the single text that gets embedded for entity *e*.

    Layout, one item per line: ``name: …``, then ``aliases: …`` (joined
    with '; ', omitted when there are no aliases), then one ``fact: …``
    line per statement in the order given.
    """
    parts = [f"name: {e.canonical_name}"]
    if e.aliases:
        parts.append("aliases: " + "; ".join(e.aliases))
    parts.extend(f"fact: {statement}" for statement in fact_statements)
    return "\n".join(parts)
393
+
394
+
395
def route_band(similarity: float,
               high: float = HIGH_THRESHOLD,
               low: float = LOW_THRESHOLD) -> str:
    """Map a similarity score to its routing band.

    ``similarity >= high`` → 'high'; otherwise ``similarity >= low`` →
    'ambiguous'; anything else (including NaN, which fails both
    comparisons) → 'drop'.
    """
    for floor, label in ((high, "high"), (low, "ambiguous")):
        if similarity >= floor:
            return label
    return "drop"
405
+
406
+
407
def load_fact_statements(conn, arena: str, entity_id: str,
                         top_k: int = DEFAULT_TOP_K_FACTS) -> list[str]:
    """Fetch up to *top_k* fact statements that mention *entity_id*
    (as subject or object) inside *arena*, in the order the query
    returns them.

    NOTE(review): rows are read positionally (``row[0]``); this assumes
    the connection's cursors yield tuple-like rows — confirm against the
    row factory the caller configures.
    """
    # entity_id is bound twice: once for the subject side, once for the
    # object side of the OR predicate.
    params = (arena, entity_id, entity_id, top_k)
    with conn.cursor() as cursor:
        cursor.execute(FACTS_FOR_ENTITY_SQL, params)
        rows = cursor.fetchall()
    return [row[0] for row in rows]
414
+
415
+
416
+ # ----------------------------------------------------------------------
417
+ # 3. LLM adjudication of the ambiguous band
418
+ # ----------------------------------------------------------------------
419
+
420
# str.format template for the adjudication request. Placeholders:
# a_name, a_aliases, a_facts, b_name, b_aliases, b_facts. Literal braces
# of the JSON example are doubled ({{ }}) so that format() leaves them in
# place. A trailing backslash joins a source line with the next one.
ADJUDICATION_PROMPT = """\
You are adjudicating whether two database entity records refer to the \
SAME real person. Be conservative: a false merge corrupts the knowledge \
graph and is far worse than leaving a duplicate.

Entity A:
canonical name: {a_name}
aliases: {a_aliases}
sample facts:
{a_facts}

Entity B:
canonical name: {b_name}
aliases: {b_aliases}
sample facts:
{b_facts}

Answer with STRICT JSON only, no prose, exactly this shape:
{{"same_person": "yes" | "no" | "unsure", "reason": "<one sentence>"}}

Rules:
- "yes" only if the names are plausibly the same person AND nothing in \
the facts contradicts it.
- Different people who merely share a first name (e.g. "Johann" vs \
"Johanna") are "no".
- If the evidence is thin or conflicting, answer "unsure".
"""
447
+
448
+
449
@dataclass
class Adjudication:
    """Outcome of adjudicating one candidate pair."""

    # Verdict: 'yes' | 'no' | 'unsure'.
    same_person: str
    # Short free-text justification for the verdict.
    reason: str
453
+
454
+
455
class Adjudicator:
    """Abstract adjudicator: decides whether two entities are the same
    person. Concrete adjudicators override ``adjudicate``."""

    def adjudicate(self, a: Entity, a_facts: list[str],
                   b: Entity, b_facts: list[str]) -> Adjudication:  # pragma: no cover
        """Adjudicate the pair; the base class always raises
        NotImplementedError."""
        raise NotImplementedError()
459
+
460
+
461
class NoLLMAdjudicator(Adjudicator):
    """Offline adjudicator used for --no-llm: never calls a model and
    answers 'unsure' for every pair, which sends the pair to human
    review instead of merging it."""

    def adjudicate(self, a, a_facts, b, b_facts) -> Adjudication:
        """Return a constant 'unsure' verdict; all arguments are ignored."""
        explanation = ("LLM adjudication disabled (--no-llm); "
                       "routed to human review")
        return Adjudication("unsure", explanation)
468
+
469
+
470
class AnthropicAdjudicator(Adjudicator):
    """Anthropic API via env ANTHROPIC_API_KEY; model env-pinned
    ($ENTITY_RESOLUTION_LLM_MODEL, default claude-haiku-4-5);
    temperature 0. Strict-JSON response; any transport, decoding or
    parse failure degrades to 'unsure' (never merges)."""

    def __init__(self, api_key: str | None = None,
                 model: str | None = None, timeout: float = 60.0) -> None:
        """Resolve credentials and model.

        Args:
            api_key: API key; falls back to $ANTHROPIC_API_KEY.
            model: model name; falls back to the env-pinned model, then
                to LLM_MODEL_DEFAULT.
            timeout: per-request timeout in seconds.

        Raises:
            ValueError: if no API key is available.
        """
        self.api_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
        if not self.api_key:
            raise ValueError(
                "ANTHROPIC_API_KEY not set — use --no-llm to route the "
                "ambiguous band to human review instead.")
        self.model = model or os.environ.get(LLM_MODEL_ENV, LLM_MODEL_DEFAULT)
        self.timeout = timeout

    @staticmethod
    def _fmt_facts(facts: list[str]) -> str:
        """Render at most ADJUDICATION_SAMPLE_FACTS facts as a bullet
        list for the prompt, or a placeholder line when there are none."""
        sample = facts[:ADJUDICATION_SAMPLE_FACTS]
        if not sample:
            return " (no facts recorded)"
        return "\n".join(f" - {s}" for s in sample)

    def adjudicate(self, a, a_facts, b, b_facts) -> Adjudication:
        """Ask the model whether *a* and *b* are the same person.

        Never raises for a failed call: network errors, HTTP errors,
        truncated or non-JSON bodies and unexpectedly shaped payloads
        all come back as an 'unsure' Adjudication whose reason carries
        the error text.
        """
        # Local import: only this method needs the exception base class.
        import http.client

        prompt = ADJUDICATION_PROMPT.format(
            a_name=a.canonical_name, a_aliases=", ".join(a.aliases) or "(none)",
            a_facts=self._fmt_facts(a_facts),
            b_name=b.canonical_name, b_aliases=", ".join(b.aliases) or "(none)",
            b_facts=self._fmt_facts(b_facts),
        )
        body = {
            "model": self.model,
            "max_tokens": 200,
            "temperature": 0,
            "messages": [{"role": "user", "content": prompt}],
        }
        req = urllib.request.Request(
            ANTHROPIC_URL,
            data=json.dumps(body).encode(),
            headers={
                "x-api-key": self.api_key,
                "anthropic-version": ANTHROPIC_VERSION,
                "content-type": "application/json",
            },
            method="POST",
        )
        try:
            with urllib.request.urlopen(req, timeout=self.timeout) as resp:
                payload = json.loads(resp.read())
            # Concatenate the text of every "text" content block.
            text = "".join(
                blk.get("text", "") for blk in payload.get("content", [])
                if blk.get("type") == "text"
            )
            return parse_adjudication(text)
        except (
            # URLError/HTTPError, timeouts and connection resets that
            # surface while the body is being read are all OSErrors.
            OSError,
            # IncompleteRead, BadStatusLine, ... are not OSErrors.
            http.client.HTTPException,
            # JSONDecodeError and UnicodeDecodeError.
            ValueError,
            # Payload that is not the expected dict / list-of-dicts.
            KeyError, AttributeError, TypeError,
        ) as e:
            return Adjudication("unsure", f"adjudication call failed: {e}")
527
+
528
+
529
def parse_adjudication(text: str) -> Adjudication:
    """Parse the model's reply into an Adjudication.

    The span from the first '{' to the last '}' is decoded as JSON and
    must carry ``same_person`` in {'yes', 'no', 'unsure'}. Every
    malformed reply maps to 'unsure', which never merges. The reason is
    stringified and capped at 500 characters.
    """
    found = re.search(r"\{.*\}", text, re.DOTALL)
    if found is None:
        return Adjudication(
            "unsure", f"non-JSON adjudication output: {text[:120]!r}")

    try:
        decoded = json.loads(found.group(0))
    except json.JSONDecodeError:
        return Adjudication(
            "unsure", f"unparseable adjudication JSON: {text[:120]!r}")

    verdict = decoded.get("same_person")
    if verdict in ("yes", "no", "unsure"):
        reason = str(decoded.get("reason", ""))
        return Adjudication(verdict, reason[:500])
    return Adjudication("unsure", f"invalid same_person value: {verdict!r}")
543
+
544
+
545
+ # ----------------------------------------------------------------------
546
+ # 4. Pair routing: thresholds + adjudication + bare-first-name policy
547
+ # ----------------------------------------------------------------------
548
+
549
@dataclass
class RoutedPairs:
    """Result of route_pairs: every scored pair ends up in exactly one
    of the three lists."""

    # Pairs accepted for merging.
    merge: list[CandidatePair] = field(default_factory=list)
    # Pairs left for a human to decide.
    human_review: list[CandidatePair] = field(default_factory=list)
    # Pairs rejected (below the low threshold, or adjudicated 'no').
    dropped: list[CandidatePair] = field(default_factory=list)
554
+
555
+
556
def route_pairs(
    pairs: list[CandidatePair],
    adjudicator: Adjudicator,
    facts_by_entity: dict[str, list[str]],
    high: float = HIGH_THRESHOLD,
    low: float = LOW_THRESHOLD,
) -> RoutedPairs:
    """Route scored pairs into merge / human-review / dropped.

    Each pair must already carry ``.similarity``. Routing:

    - sim < low                → dropped
    - bare-name involved       → merges only if every bare-name entity
      of the pair has exactly one non-dropped candidate AND the
      adjudicator answers 'yes'; more than one candidate → human review
      without calling the adjudicator
    - sim >= high, no bare name → merge (verdict 'auto')
    - low <= sim < high         → adjudicate: yes → merge,
      no → dropped, unsure → human review

    Sets ``band``, ``verdict`` and ``reason`` on the pairs in place.

    Args:
        pairs: scored candidate pairs.
        adjudicator: decides ambiguous and bare-name pairs.
        facts_by_entity: entity id → fact statements passed to the
            adjudicator (missing ids are treated as no facts).
        high: high-confidence threshold.
        low: drop threshold.

    Raises:
        AssertionError: if any pair has no similarity.
    """
    routed = RoutedPairs()

    def _adjudicate_and_route(pair: CandidatePair) -> None:
        # Shared by the bare-name and ambiguous-band paths: only an
        # explicit 'yes' merges, 'no' drops, anything else goes to review.
        adj = adjudicator.adjudicate(
            pair.a, facts_by_entity.get(pair.a.id, []),
            pair.b, facts_by_entity.get(pair.b.id, []))
        pair.verdict, pair.reason = adj.same_person, adj.reason
        if adj.same_person == "yes":
            routed.merge.append(pair)
        elif adj.same_person == "no":
            routed.dropped.append(pair)
        else:
            routed.human_review.append(pair)

    # Bare-name candidate-count map (policy: exactly one candidate in
    # block, counted over NON-dropped pairs).
    bare_candidates: dict[str, int] = defaultdict(int)
    for p in pairs:
        # Raised explicitly rather than via `assert` so the check is not
        # stripped under `python -O`; the exception type is unchanged.
        if p.similarity is None:
            raise AssertionError("route_pairs needs scored pairs")
        if route_band(p.similarity, high, low) == "drop":
            continue
        for e in (p.a, p.b):
            if is_bare_name(e):
                bare_candidates[e.id] += 1

    for p in pairs:
        p.band = route_band(p.similarity, high, low)
        if p.band == "drop":
            routed.dropped.append(p)
            continue

        bare_ids = [e.id for e in (p.a, p.b) if is_bare_name(e)]
        if bare_ids:
            # Bare-first-name policy: single-token entities merge only
            # if exactly one candidate in block AND adjudication = yes.
            if any(bare_candidates[eid] != 1 for eid in bare_ids):
                p.verdict = "unsure"
                p.reason = ("bare-first-name policy: entity has more than "
                            "one candidate in its blocks; left unmerged")
                routed.human_review.append(p)
            else:
                _adjudicate_and_route(p)
            continue

        if p.band == "high":
            p.verdict = "auto"
            p.reason = (f"cosine similarity {p.similarity:.3f} >= "
                        f"high-confidence threshold {high}")
            routed.merge.append(p)
            continue

        # ambiguous band → LLM adjudication
        _adjudicate_and_route(p)

    return routed
633
+
634
+
635
def pairs_to_proposals(merge_pairs: list[CandidatePair]) -> list[MergeProposal]:
    """Turn accepted pairs into merge proposals.

    Pairs are unioned into connected groups; each group of two or more
    entities becomes one MergeProposal with signal 'embedding_llm'. The
    canonical row is the first entity when ordered by fact_count desc,
    rel_count desc, number of provenance events desc, then id.

    NOTE(review): grouping is transitive — A~B and B~C put A and C in
    the same proposal even though that pair was never accepted itself.
    """
    links: dict[str, str] = {}
    by_id: dict[str, Entity] = {}

    def _root(node: str) -> str:
        # Locate the representative, then point the walked path at it.
        top = node
        while links[top] != top:
            top = links[top]
        while links[node] != top:
            links[node], node = top, links[node]
        return top

    for pair in merge_pairs:
        for member in (pair.a, pair.b):
            links.setdefault(member.id, member.id)
            by_id[member.id] = member
        root_a = _root(pair.a.id)
        root_b = _root(pair.b.id)
        if root_a != root_b:
            links[root_a] = root_b

    clusters: dict[str, list[Entity]] = defaultdict(list)
    for entity_id in links:
        clusters[_root(entity_id)].append(by_id[entity_id])

    def _richness(entity: Entity):
        return (-entity.fact_count, -entity.rel_count,
                -len(entity.provenance_event_ids), entity.id)

    result: list[MergeProposal] = []
    for cluster in clusters.values():
        if len(cluster) < 2:
            continue
        winner, *losers = sorted(cluster, key=_richness)
        result.append(MergeProposal(
            canonical=winner,
            deprecated=losers,
            signal="embedding_llm",
        ))
    return result
674
+
675
+
676
+ # ----------------------------------------------------------------------
677
+ # 5. Tiered report (JSONL + markdown)
678
+ # ----------------------------------------------------------------------
679
+
680
+ def _proposal_record(p: MergeProposal, tier: str,
681
+ evidence: list[dict] | None = None) -> dict:
682
+ return {
683
+ "type": "merge_proposal",
684
+ "tier": tier,
685
+ "signal": p.signal,
686
+ "canonical": {
687
+ "id": p.canonical.id,
688
+ "canonical_name": p.canonical.canonical_name,
689
+ "fact_count": p.canonical.fact_count,
690
+ "rel_count": p.canonical.rel_count,
691
+ },
692
+ "deprecated": [
693
+ {
694
+ "id": d.id,
695
+ "canonical_name": d.canonical_name,
696
+ "aliases": d.aliases,
697
+ "fact_count": d.fact_count,
698
+ "rel_count": d.rel_count,
699
+ } for d in p.deprecated
700
+ ],
701
+ "evidence": evidence or [],
702
+ }
703
+
704
+
705
+ def _pair_record(p: CandidatePair, record_type: str) -> dict:
706
+ return {
707
+ "type": record_type,
708
+ "a": {"id": p.a.id, "canonical_name": p.a.canonical_name},
709
+ "b": {"id": p.b.id, "canonical_name": p.b.canonical_name},
710
+ "similarity": p.similarity,
711
+ "band": p.band,
712
+ "verdict": p.verdict,
713
+ "reason": p.reason,
714
+ "shared_block_keys": sorted(p.shared_keys)[:10],
715
+ }
716
+
717
+
718
def build_report_records(
    arena: str,
    v1_proposals: list[MergeProposal],
    v2_proposals: list[MergeProposal],
    routed: RoutedPairs,
    before_count: int,
    heuristic_proposals: list[MergeProposal] | None = None,
) -> list[dict]:
    """Build the JSONL-able report: header first, then proposals in tier
    order (co_occurrence > alias_overlap > embedding_llm > heuristic),
    then human-review pairs, then dropped pairs.

    Args:
        arena: arena the report covers.
        v1_proposals: proposals from v1; listed under the tier matching
            their ``signal`` ('co_occurrence' or 'alias_overlap').
        v2_proposals: embedding/LLM proposals; each carries as evidence
            the merged pairs that lie entirely inside its group.
        routed: routing outcome supplying evidence, review and dropped
            pairs.
        before_count: number of entities before any merge.
        heuristic_proposals: optional lowest-tier proposals.

    Returns:
        The list of report records.
    """
    heuristic = heuristic_proposals or []
    pair_evidence: dict[frozenset, dict] = {
        p.key: _pair_record(p, "evidence") for p in routed.merge
    }

    # Count DISTINCT deprecated rows: an entity deprecated by more than
    # one proposal (e.g. once in a v1 group and again in a v2 group)
    # still disappears only once, so summing per-proposal lengths would
    # understate the projected entity count.
    deprecated_ids = {
        d.id
        for p in [*v1_proposals, *v2_proposals, *heuristic]
        for d in p.deprecated
    }

    records: list[dict] = [{
        "type": "header",
        "arena": arena,
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "tiers": list(TIER_ORDER),
        "entity_count_before": before_count,
        "entity_count_after_if_applied": before_count - len(deprecated_ids),
        "other_arenas": "untouched — every SQL statement is arena-scoped "
                        "(programmatically asserted; dry-run sessions are "
                        "READ ONLY at the Postgres level)",
    }]

    by_tier: list[tuple[str, list[MergeProposal]]] = [
        ("co_occurrence", [p for p in v1_proposals if p.signal == "co_occurrence"]),
        ("alias_overlap", [p for p in v1_proposals if p.signal == "alias_overlap"]),
        ("embedding_llm", v2_proposals),
        ("heuristic", heuristic),
    ]
    for tier, proposals in by_tier:
        for p in proposals:
            ev = []
            if tier == "embedding_llm":
                # Evidence = merged pairs whose two ids both belong to
                # this proposal's group.
                ids = {p.canonical.id, *(d.id for d in p.deprecated)}
                ev = [rec for key, rec in pair_evidence.items() if key <= ids]
            records.append(_proposal_record(p, tier, ev))

    records.extend(_pair_record(p, "human_review") for p in routed.human_review)
    records.extend(_pair_record(p, "dropped") for p in routed.dropped)
    return records
768
+
769
+
770
def write_jsonl(records: list[dict], path: str) -> None:
    """Write *records* to *path* as JSON Lines: one ``json.dumps``
    rendering per line, each terminated by a newline. An existing file
    is overwritten."""
    with open(path, "w") as handle:
        handle.writelines(json.dumps(record) + "\n" for record in records)
774
+
775
+
776
def write_markdown_summary(records: list[dict], path: str) -> None:
    """Render the report records as a markdown summary at *path*.

    Args:
        records: output of build_report_records; ``records[0]`` must be
            the header record.
        path: destination file; overwritten if it exists.

    The file is written as UTF-8: the text contains non-ASCII
    characters ('—', '↔', and whatever appears in entity names), which
    the platform's locale encoding cannot always represent.
    """
    header = records[0]
    proposals = [r for r in records if r["type"] == "merge_proposal"]
    review = [r for r in records if r["type"] == "human_review"]
    dropped = [r for r in records if r["type"] == "dropped"]

    def _fmt_sim(rec: dict) -> str:
        # Three decimals, or 'n/a' when the pair was never scored.
        value = rec.get("similarity")
        return f"{value:.3f}" if value is not None else "n/a"

    lines = [
        "# Entity-resolution v2 — merge-candidate report",
        "",
        f"- **Arena:** `{header['arena']}` (one arena at a time; required `--arena`)",
        f"- **Generated:** {header['generated_at']}",
        f"- **Entities before:** {header['entity_count_before']}",
        f"- **Entities after (if every proposal applied):** "
        f"{header['entity_count_after_if_applied']}",
        "",
        "## Proposals by tier",
        "",
        "| tier | proposals | rows deprecated |",
        "|---|---:|---:|",
    ]
    for tier in TIER_ORDER:
        tier_props = [p for p in proposals if p["tier"] == tier]
        lines.append(f"| {tier} | {len(tier_props)} | "
                     f"{sum(len(p['deprecated']) for p in tier_props)} |")
    lines += ["", "## Proposals", ""]
    for p in proposals:
        deps = ", ".join(f"`{d['canonical_name']}`" for d in p["deprecated"])
        lines.append(f"- **[{p['tier']}]** `{p['canonical']['canonical_name']}` "
                     f"absorbs {deps}")
        for ev in p.get("evidence", []):
            lines.append(f" - sim={_fmt_sim(ev)} verdict={ev.get('verdict')} — "
                         f"{ev.get('reason')}")
    lines += ["", f"## Human review ({len(review)})", ""]
    if not review:
        lines.append("(none)")
    for r in review:
        lines.append(f"- `{r['a']['canonical_name']}` ↔ "
                     f"`{r['b']['canonical_name']}` (sim={_fmt_sim(r)}) — {r.get('reason')}")
    lines += ["", f"## Dropped pairs: {len(dropped)}", ""]
    lines += [
        "## Other arenas",
        "",
        header["other_arenas"],
        "",
        "The pip-agents arena (LEGACY/FROZEN) is untouchable by "
        "construction: no statement in this tool can address it unless "
        "an operator passes `--arena pip-agents`, which the runbook "
        "forbids.",
    ]
    # Explicit encoding: with the locale default (e.g. cp1252 or ASCII)
    # the write raises UnicodeEncodeError on '↔' or on accented names.
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines) + "\n")
829
+
830
+
831
+ # ----------------------------------------------------------------------
832
+ # CLI
833
+ # ----------------------------------------------------------------------
834
+
835
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Build the command-line parser and parse `argv`.

    The module docstring is used verbatim as the --help description.
    Flags are declared in a table and registered in order, so the
    --help listing follows the table order.

    Args:
        argv: argument list to parse; None lets argparse read sys.argv.

    Returns:
        The populated argparse.Namespace.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # (flag, add_argument keyword arguments) — order is the help order.
    flag_table = (
        ("--arena", {
            "required": True,
            "help": "arena id to resolve (REQUIRED; one at a time; "
                    "every SQL statement is scoped to it)",
        }),
        ("--pg-dsn", {
            # Environment is read at parse time, not at import time.
            "default": os.environ.get("PG_DSN"),
            "help": "postgres DSN; defaults to $PG_DSN",
        }),
        ("--entity-type", {
            "default": "person",
            "help": "entity type to resolve (default: person)",
        }),
        ("--apply", {
            "action": "store_true",
            "help": "actually write merges; default is dry-run "
                    "(READ ONLY session). Requires --i-have-a-snapshot.",
        }),
        ("--i-have-a-snapshot", {
            "action": "store_true",
            "help": "operator acknowledgment that a pg_dump snapshot "
                    "of org-model exists. --apply is refused without it.",
        }),
        ("--embed-url", {
            "default": None,
            "help": "OpenAI-compatible /v1/embeddings endpoint "
                    "(the engine's Qwen3 embed gateway). Required for "
                    "the http backend; never hardcoded.",
        }),
        ("--embed-backend", {
            "choices": ("http", "local"),
            "default": "http",
            "help": "embedding backend (default http; 'local' is a "
                    "stub — no suitable CPU dep in repo)",
        }),
        ("--embed-model", {
            "default": None,
            "help": "model name to pass to the embeddings endpoint "
                    "(optional; gateway default if omitted)",
        }),
        ("--no-llm", {
            "action": "store_true",
            "help": "offline mode: route the whole ambiguous band to "
                    "human review instead of LLM adjudication",
        }),
        ("--high-threshold", {"type": float, "default": HIGH_THRESHOLD}),
        ("--low-threshold", {"type": float, "default": LOW_THRESHOLD}),
        ("--top-k-facts", {"type": int, "default": DEFAULT_TOP_K_FACTS}),
        ("--max-block", {"type": int, "default": DEFAULT_MAX_BLOCK}),
        ("--heuristic-merge", {
            "action": "store_true",
            "help": "also include v1's heuristic tier (off by default)",
        }),
        ("--out", {
            "default": None,
            "help": "tiered merge-candidate report JSONL path",
        }),
        ("--summary", {
            "default": None,
            "help": "markdown summary path (default: <out>.md)",
        }),
        ("--merged-by", {
            "default": None,
            "help": "audit tag (default: resolution-v2-YYYY-MM)",
        }),
    )
    for flag, options in flag_table:
        parser.add_argument(flag, **options)

    return parser.parse_args(argv)
880
+
881
+
882
+ def validate_args(args: argparse.Namespace) -> str | None:
883
+ """Returns an error string, or None when ok. Checked BEFORE any
884
+ connection is opened — --apply is structurally refused without the
885
+ snapshot acknowledgment."""
886
+ if args.apply and not args.i_have_a_snapshot:
887
+ return ("--apply requires --i-have-a-snapshot (operator "
888
+ "acknowledgment that a pg_dump snapshot of org-model "
889
+ "exists — see the memory clean-rebuild runbook). Refusing.")
890
+ if not args.pg_dsn:
891
+ return "--pg-dsn (or $PG_DSN) required"
892
+ if args.embed_backend == "http" and not args.embed_url:
893
+ return ("--embed-url required for the http embedding backend "
894
+ "(the endpoint is never hardcoded)")
895
+ if not (0.0 <= args.low_threshold <= args.high_threshold <= 1.0):
896
+ return "--low-threshold must be <= --high-threshold, both in [0,1]"
897
+ return None
898
+
899
+
900
def _entity_merges_check_allows(conn, value: str) -> bool:
    """Report whether any constraint definition returned by
    ENTITY_MERGES_CHECKDEF_SQL mentions `value`.

    Used to decide whether the entity_merges merge_signal CHECK admits
    v2's 'embedding_llm' signal, which needs the draft migration in
    resolution-queue-design.md applied first. The probe is a plain
    substring test against the first column of each returned row.
    """
    with conn.cursor() as cursor:
        cursor.execute(ENTITY_MERGES_CHECKDEF_SQL)
        rows = cursor.fetchall()
    definitions = tuple(row[0] for row in rows)
    for definition in definitions:
        if value in definition:
            return True
    return False
908
+
909
+
910
def _default_summary_path(out: str) -> str:
    """Derive the markdown summary path that sits next to report `out`.

    Only the extension of the final path component is swapped for ".md";
    dots in directory names are left alone. If that would produce `out`
    itself (the report already ends in ".md"), a distinct name is used so
    the summary never overwrites the report.
    """
    stem, _ext = os.path.splitext(out)
    candidate = stem + ".md"
    if candidate == out:
        candidate = out + ".summary.md"
    return candidate


def main(argv: list[str] | None = None) -> int:
    """CLI entry point: propose entity merges for one arena, optionally apply.

    Flow: validate flags, build the embedding backend and adjudicator,
    connect, import v1's proposals, generate blocked candidate pairs,
    score them by embedding similarity, route them (merge / human review /
    drop), and write the JSONL report plus markdown summary. Merges are
    written only when --apply was given.

    Args:
        argv: argument list; None lets argparse read sys.argv.

    Returns:
        2 on invalid arguments, missing psycopg, or backend misconfig;
        0 after a dry-run; after --apply, 1 if any proposal failed,
        otherwise 0.
    """
    args = parse_args(argv)
    err = validate_args(args)
    if err:
        print(f"error: {err}", file=sys.stderr)
        return 2

    if psycopg is None:
        print("error: psycopg is required to run this script "
              "(pip install 'psycopg[binary]')", file=sys.stderr)
        return 2

    assert_arena_scoped()

    merged_by = args.merged_by or \
        f"resolution-v2-{datetime.now(timezone.utc):%Y-%m}"

    # Backends (constructed before connecting so misconfig fails fast).
    try:
        if args.embed_backend == "local":
            backend: EmbeddingBackend = LocalEmbeddingBackend()
        else:
            backend = HttpEmbeddingBackend(args.embed_url, args.embed_model)
        adjudicator: Adjudicator = (
            NoLLMAdjudicator() if args.no_llm else AnthropicAdjudicator()
        )
    except (NotImplementedError, ValueError) as e:
        print(f"error: {e}", file=sys.stderr)
        return 2

    with psycopg.connect(args.pg_dsn, autocommit=False) as conn:
        if not args.apply:
            # Structural dry-run: the session cannot write.
            conn.execute("SET default_transaction_read_only = on")
            # With autocommit off, the SET itself opened a transaction that
            # is still read-write; the setting only governs transactions
            # started afterwards. Commit now so it persists and every
            # following statement runs in a read-only transaction.
            conn.commit()
        print(f"[resolution-v2] arena={args.arena} type={args.entity_type} "
              f"apply={args.apply} no_llm={args.no_llm}")

        entities = v1.load_entities(conn, args.arena, args.entity_type)
        before_count = len(entities)
        print(f"[resolution-v2] loaded {before_count} {args.entity_type} "
              f"entities (arena-scoped)")

        # ---- Tier 1+2 (and optional 4): v1's machinery, unchanged ----
        cooc = v1.collect_cooccurrence_pairs(conn, args.arena) \
            if args.entity_type == "person" else set()
        v1_all = v1.build_proposals(entities, cooc, args.heuristic_merge)
        v1_proposals = [p for p in v1_all
                        if p.signal in ("co_occurrence", "alias_overlap")]
        heuristic_proposals = [p for p in v1_all if p.signal == "heuristic"]
        # Every unordered id pair already grouped by a v1 proposal; passed
        # to blocking together with the entities.
        already = set()
        for p in v1_all:
            ids = [p.canonical.id, *(d.id for d in p.deprecated)]
            for i, left in enumerate(ids):
                for right in ids[i + 1:]:
                    already.add(frozenset({left, right}))
        print(f"[resolution-v2] v1 tiers: {len(v1_proposals)} proposals "
              f"(+{len(heuristic_proposals)} heuristic)")

        # ---- Blocking ------------------------------------------------
        pairs = generate_candidate_pairs(entities, already, args.max_block)
        print(f"[resolution-v2] blocking generated {len(pairs)} candidate "
              f"pairs not covered by v1 signals")

        # ---- Embedding similarity -------------------------------------
        # Embed each entity once, however many pairs it appears in.
        ids_needed = sorted({e.id for p in pairs for e in (p.a, p.b)})
        facts_by_entity = {
            eid: load_fact_statements(conn, args.arena, eid, args.top_k_facts)
            for eid in ids_needed
        }
        ents_by_id = {e.id: e for e in entities}
        bundles = [embedding_bundle(ents_by_id[eid], facts_by_entity[eid])
                   for eid in ids_needed]
        # Skip the backend call entirely when there is nothing to embed.
        vectors = dict(zip(ids_needed, backend.embed(bundles))) \
            if bundles else {}
        for p in pairs:
            p.similarity = cosine(vectors[p.a.id], vectors[p.b.id])

        # ---- Routing: thresholds + adjudication + bare-name policy ----
        routed = route_pairs(pairs, adjudicator, facts_by_entity,
                             args.high_threshold, args.low_threshold)
        v2_proposals = pairs_to_proposals(routed.merge)
        print(f"[resolution-v2] embedding+llm tier: "
              f"{len(v2_proposals)} proposals; "
              f"{len(routed.human_review)} pairs → human review; "
              f"{len(routed.dropped)} dropped")

        # ---- Report ----------------------------------------------------
        records = build_report_records(
            args.arena, v1_proposals, v2_proposals, routed,
            before_count, heuristic_proposals)
        out = args.out or f"/tmp/entity_resolution_v2_{args.arena}.jsonl"
        write_jsonl(records, out)
        # Swap only the file extension; splitting on any "." in the whole
        # path would mangle paths such as "./reports/out".
        summary = args.summary or _default_summary_path(out)
        write_markdown_summary(records, summary)
        print(f"[resolution-v2] report → {out}\n"
              f"[resolution-v2] summary → {summary}")

        if not args.apply:
            print("[resolution-v2] dry-run only (session was READ ONLY); "
                  "pass --apply --i-have-a-snapshot to execute")
            return 0

        # ---- Apply (gated) ---------------------------------------------
        to_apply: list[MergeProposal] = [*v1_proposals, *heuristic_proposals]
        skipped_v2 = 0
        if v2_proposals:
            if _entity_merges_check_allows(conn, "embedding_llm"):
                to_apply.extend(v2_proposals)
            else:
                skipped_v2 = len(v2_proposals)
                print("[resolution-v2] WARNING: entity_merges merge_signal "
                      "CHECK does not admit 'embedding_llm' — the draft "
                      "migration in resolution-queue-design.md must be "
                      "applied first. Skipping the embedding_llm tier "
                      f"({skipped_v2} proposals) to keep the audit honest.",
                      file=sys.stderr)

        succeeded, failed, errors = v1.apply_proposals(
            conn, args.arena, to_apply, merged_by)
        conn.commit()
        with conn.cursor() as cur:
            cur.execute(ENTITY_COUNT_SQL, (args.arena, args.entity_type))
            after = cur.fetchone()[0]
        print(f"[resolution-v2] applied: {succeeded} succeeded, "
              f"{failed} failed; entities {before_count} → {after}")
        # Cap the error listing at 20 entries.
        for e in errors[:20]:
            print(f" ERR: {e}")
        return 1 if failed else 0
1038
+
1039
+
1040
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())