@pentatonic-ai/ai-agent-sdk 0.10.19 → 0.10.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,240 @@
1
+ #!/usr/bin/env python3
2
+ """Build a student retrain corpus from CLEAN teacher gold.
3
+
4
+ The student (NuExtract-2.0-4B FT) was originally trained on teacher traces
5
+ produced under the *old* distiller prompts (bbdaba / f1e0ff), which had no
6
+ email-discipline or modality rules — so the student learned to (a) promote
7
+ bystander emails into a person's aliases (the Johann/hotel over-merge), (b)
8
+ collapse future/invited roles to established `state` facts, (c) conflate
9
+ "X & Y" into one entity, and (d) mint email-named / generic-infra entities.
10
+
11
+ This builder draws ONLY from traces produced under the clean prompt (#126
12
+ modality/attribution + #129 email-discipline & entity-separation), whose
13
+ `system_prompt_hash` is the deployed clean hash. Building from old-prompt
14
+ traces would just re-teach the defects, so that is explicitly NOT the default
15
+ (you must pass --allow-dirty-hash to override, and you shouldn't).
16
+
17
+ A defect filter runs as a SECOND line of defence: even clean-prompt output is
18
+ screened for the known defect signatures and dropped if any survive. Every
19
+ drop is counted by reason so the corpus's cleanliness is auditable (not a
20
+ black box — see the printed report).
21
+
22
+ INPUT — an NDJSON stream of trace rows, one object per line:
23
+ {"event_id": "...", "user_prompt": "...", "raw_response": "...",
24
+ "system_prompt_hash": "..."}
25
+ Produce it from the engine box's org_model DB with row_to_json (the escaping
26
+ that bit us before — \\copy double-escapes, $-quoting gets eaten by the shell —
27
+ is avoided by -At + row_to_json):
28
+
29
+ sudo docker exec -i pme2-org-model psql -U pme -d org_model -At -c \\
30
+ "SELECT row_to_json(t) FROM (
31
+ SELECT event_id, user_prompt, raw_response, system_prompt_hash
32
+ FROM distillation_traces
33
+ WHERE system_prompt_hash = '6ccfe70f1286a131'
34
+ ) t" > traces.ndjson
35
+
36
+ OUTPUT — {"input": <per-event block>, "output": <extraction JSON string>}
37
+ JSONL(.gz), the exact shape train_lora.py's load() consumes (it keeps rows
38
+ where both `input` and `output` are truthy, then trains user=input ->
39
+ assistant=output via the NuExtract chat template; no system prompt in the
40
+ pair). The corpus is PER-EVENT while a trace is a 3-event chunk, so each
41
+ trace's user_prompt is split on the `[event K]` markers and matched to
42
+ raw_response[index == K].
43
+
44
+ Usage:
45
+ python build_retrain_corpus.py --traces traces.ndjson --out retrain_clean.jsonl.gz
46
+ zcat traces.ndjson.gz | python build_retrain_corpus.py --traces - --out c.jsonl.gz
47
+ """
48
+ from __future__ import annotations
49
+
50
+ import argparse
51
+ import gzip
52
+ import hashlib
53
+ import json
54
+ import re
55
+ import sys
56
+ from collections import Counter
57
+
58
# The clean prompt deployed as SDK 0.10.19 (#126 + #129). Verify against the
# running extractor-async (worker.SYSTEM_PROMPT_HASH) before a real corpus cut —
# a prompt edit advances this and old-hash traces must not silently leak in.
CLEAN_PROMPT_HASH = "6ccfe70f1286a131"

# Generic infra / environment tokens that must never be standalone entities
# (mirrors the #129 DISTINCT ENTITIES rule — kept in sync by hand).
# Compared against the lower-cased, stripped entity name.
INFRA_TOKENS = {
    "prod", "production", "staging", "stage", "uat", "qa", "dev", "test",
    "warehouse", "datalake", "data lake", "cluster", "backend", "frontend",
    "the system", "the platform", "the api", "the database", "the server",
}

# Zero-width split point in front of every line that starts an `[event K]` block.
EVENT_BLOCK_RE = re.compile(r"(?=^\[event \d+\])", re.MULTILINE)
# Captures K from the `[event K]` marker that opens a block.
EVENT_IDX_RE = re.compile(r"^\[event (\d+)\]")
# Whole-string "looks like an email address" check (no whitespace, one '@', a dot
# in the domain part).
EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
74
+
75
+
76
+ def _email_plausibly_belongs(person_name: str, email: str) -> bool:
77
+ """Keep an email on a person only if it plausibly is theirs: a name token
78
+ appears in the local-part, or the initials match. Same heuristic as the
79
+ write-side guard (#128) so corpus filtering and runtime agree."""
80
+ local = email.split("@", 1)[0].lower()
81
+ local_alnum = re.sub(r"[^a-z0-9]", "", local)
82
+ tokens = [t for t in re.split(r"\s+", person_name.lower()) if t]
83
+ if not tokens:
84
+ return False
85
+ for t in tokens:
86
+ t_alnum = re.sub(r"[^a-z0-9]", "", t)
87
+ if len(t_alnum) >= 3 and t_alnum in local_alnum:
88
+ return True
89
+ initials = "".join(t[0] for t in tokens if t)
90
+ if len(initials) >= 2 and initials in local_alnum:
91
+ return True
92
+ return False
93
+
94
+
95
+ def _entity_defect(ent: dict) -> str | None:
96
+ """Return a drop-reason if this entity carries a known defect, else None."""
97
+ name = (ent.get("name") or "").strip()
98
+ etype = (ent.get("type") or "").lower()
99
+ if not name:
100
+ return "empty_entity_name"
101
+ if EMAIL_RE.match(name):
102
+ return "email_as_entity"
103
+ if name.lower() in INFRA_TOKENS:
104
+ return "generic_infra_entity"
105
+ # Conflation: "Acme & Globex" / "Alice and Bob" smuggled into one node.
106
+ if re.search(r"\s&\s", name) or re.search(r"\b and \b", name.lower()):
107
+ return "conflated_entity"
108
+ if etype == "person":
109
+ for a in ent.get("aliases") or []:
110
+ if isinstance(a, str) and "@" in a and " " not in a \
111
+ and not _email_plausibly_belongs(name, a):
112
+ return "bystander_email_alias"
113
+ return None
114
+
115
+
116
+ def _output_is_clean(obj: dict) -> str | None:
117
+ """Screen one per-event extraction object; return a drop-reason or None."""
118
+ if not isinstance(obj, dict):
119
+ return "output_not_object"
120
+ for ent in obj.get("entities") or []:
121
+ r = _entity_defect(ent)
122
+ if r:
123
+ return r
124
+ return None
125
+
126
+
127
def split_events(user_prompt: str) -> dict[int, str]:
    """Split a chunk prompt into {event_index: block_text}.

    The prompt is cut in front of every line starting with an `[event K]`
    marker; text before the first marker is discarded. Trailing whitespace is
    stripped from each block. If a marker index repeats, the last block wins.
    An empty or non-string prompt yields an empty mapping.
    """
    if not user_prompt or not isinstance(user_prompt, str):
        return {}

    blocks: dict[int, str] = {}
    for chunk in EVENT_BLOCK_RE.split(user_prompt):
        chunk = chunk.rstrip()
        marker = EVENT_IDX_RE.match(chunk)
        if marker:
            blocks[int(marker.group(1))] = chunk
    return blocks
136
+
137
+
138
def _coerce_index(idx) -> int | None:
    """Normalise a model-emitted event `index` to an int, or None if unusable.

    Model output may carry the index as a numeric string ("2") or an integral
    float; anything else (None, bool, list, free text) is treated as missing so
    the caller can fall back to positional matching.
    """
    if isinstance(idx, bool):
        return None
    if isinstance(idx, int):
        return idx
    if isinstance(idx, float) and idx.is_integer():
        return int(idx)
    if isinstance(idx, str):
        try:
            return int(idx.strip())
        except ValueError:
            return None
    return None


def main() -> int:
    """Build the retrain corpus from an NDJSON trace stream.

    Reads trace rows, keeps only rows with the wanted `system_prompt_hash`
    (unless --allow-dirty-hash), pairs each per-event extraction object with its
    `[event K]` prompt block, drops duplicates and defective outputs, and writes
    gzip-compressed JSONL of {"input", "output"} pairs. Every drop is counted by
    reason; the stats report is printed to stdout (and to --report if given).

    Returns the process exit code (always 0; an empty corpus only warns).
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--traces", required=True,
                    help="NDJSON trace rows, or '-' for stdin")
    ap.add_argument("--out", required=True, help="output .jsonl.gz")
    ap.add_argument("--hash", default=CLEAN_PROMPT_HASH,
                    help=f"keep only this system_prompt_hash (default {CLEAN_PROMPT_HASH})")
    ap.add_argument("--allow-dirty-hash", action="store_true",
                    help="do NOT filter by hash — DANGER: re-teaches old-prompt defects")
    ap.add_argument("--report", help="optional path for a JSON stats report")
    args = ap.parse_args()

    stats = Counter()
    seen: set[str] = set()
    examples: list[dict] = []

    fh = sys.stdin if args.traces == "-" else open(args.traces, encoding="utf-8")
    try:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            stats["trace_rows"] += 1
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                stats["drop_trace_unparseable"] += 1
                continue
            if not isinstance(row, dict):
                # Valid JSON but not a trace row (e.g. a bare list or string).
                stats["drop_trace_not_object"] += 1
                continue

            if not args.allow_dirty_hash and row.get("system_prompt_hash") != args.hash:
                stats["drop_wrong_hash"] += 1
                continue

            raw = row.get("raw_response")
            if raw and isinstance(raw, (dict, list)):
                # Already-decoded JSON (e.g. a json/jsonb column through
                # row_to_json) — use it as-is instead of re-parsing.
                parsed = raw
            else:
                try:
                    parsed = json.loads(raw or "")
                except (json.JSONDecodeError, TypeError):
                    stats["drop_response_unparseable"] += 1
                    continue
            # raw_response is either a single per-event object (current trace
            # format — one row per event) or, for legacy chunked traces, a JSON
            # array / {"events": [...]} of per-event objects.
            if isinstance(parsed, dict):
                events = parsed.get("events")
                objs = events if isinstance(events, list) else [parsed]
            elif isinstance(parsed, list):
                objs = parsed
            else:
                stats["drop_response_shape"] += 1
                continue

            prompt = row.get("user_prompt")
            blocks = split_events(prompt) if isinstance(prompt, str) else {}
            for obj in objs:
                if not isinstance(obj, dict):
                    stats["drop_obj_not_object"] += 1
                    continue
                idx = _coerce_index(obj.get("index"))
                block = blocks.get(idx) if idx is not None else None
                # Single-event trace: one block, one object — match by position
                # even if the stored index doesn't line up with the marker.
                if block is None and len(objs) == 1 and len(blocks) == 1:
                    block = next(iter(blocks.values()))
                if not block:
                    stats["drop_no_matching_block"] += 1
                    continue

                # De-duplicate on the input block text; the first clean output
                # seen for a block is the one kept.
                key = hashlib.sha1(block.encode("utf-8")).hexdigest()
                if key in seen:
                    stats["drop_dup"] += 1
                    continue

                reason = _output_is_clean(obj)
                if reason:
                    stats[f"drop_{reason}"] += 1
                    continue

                seen.add(key)
                examples.append({"input": block,
                                 "output": json.dumps(obj, ensure_ascii=False)})
                stats["kept"] += 1
    finally:
        # Close the file even if a row raises; never close the caller's stdin.
        if fh is not sys.stdin:
            fh.close()

    with gzip.open(args.out, "wt", encoding="utf-8") as out:
        for ex in examples:
            out.write(json.dumps(ex, ensure_ascii=False) + "\n")

    report = {"out": args.out, "hash": (None if args.allow_dirty_hash else args.hash),
              "stats": dict(sorted(stats.items()))}
    print(json.dumps(report, indent=2))
    if args.report:
        with open(args.report, "w", encoding="utf-8") as rf:
            json.dump(report, rf, indent=2)

    if stats["kept"] == 0:
        print("\nWARNING: 0 examples kept. If you targeted the clean hash, the "
              "clean-prompt teacher has not produced enough gold yet — let it "
              "accumulate (or run a teacher-only re-distill of a curated event "
              "slice through the clean prompt), then re-run.", file=sys.stderr)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,440 @@
1
+ #!/usr/bin/env python3
2
+ """Fusion de-fragmentation — cluster same-surname PERSON fragments and propose
3
+ merges (RFC-decay-and-fusion A2/A3). DRY-RUN by default.
4
+
5
+ The deterministic upsert resolver (worker.py) converges same-form / shared-alias
6
+ entities, but it CANNOT safely merge surface-form/nickname variants of one real
7
+ person (e.g. "Will Vickers" / "William Vickers" / "William F. Vickers" / bare
8
+ "Vickers") — they have different normalized names and no shared alias, so they
9
+ fragment (209 "Vickers" nodes observed). Merging them is Fusion's job, and it is
10
+ DESTRUCTIVE (repoints facts/relationships, tombstones losers), so the over-merge
11
+ failure mode (folding two DIFFERENT people, or a person into an org) must be
12
+ designed out. This tool is conservative + dry-run-first; --apply is double-gated.
13
+
14
+ CLUSTERING (anti-over-merge by construction):
15
+ - PERSON entities only; never crosses entity_type (so "Vickers Oils" the org is
16
+ never pulled in).
17
+ - Same surname token (the --surname scope).
18
+ - First-name compatibility for the NON-surname tokens: equal, or one an initial
19
+ of the other (W ↔ William), or one a prefix of the other (Will ⊂ William).
20
+ Two DISTINCT full first names (Will vs Jane) are INCOMPATIBLE → never merged.
21
+ - Union-find over compatible NON-bare names → each cluster = one real person.
22
+ - Bare "<surname>" nodes (no first name) are folded in ONLY when there is
23
+ exactly ONE non-bare cluster for the surname (unambiguous); otherwise they
24
+ are left for human review (never used to bridge two distinct people).
25
+
26
+ CANONICAL (A3 scored master, replaces richest-row-wins):
27
+ + has email (attributes.email or an email alias) strongest identity signal
28
+ + full name (>=2 name tokens) a real rendering, not a stub
29
+ + corroboration (provenance event count) grounded in more events
30
+ + fact count the node that holds the picture
31
+ - bare single-token name penalize stub
32
+ - ID-like (digit ratio > 0.5) penalize 7B numeric-id junk
33
+
34
+ OUTPUT: per cluster — master, losers, why, and the repoint impact (facts +
35
+ relationships that would move onto the master). No DB writes in dry-run
36
+ (the session is forced read-only). --apply would execute via the reviewed
37
+ fusion_drive merge executor + entity_merges audit (NOT enabled here).
38
+
39
+ Usage:
40
+ python fusion_defrag.py --arena 'pentatonic-team%' --surname vickers
41
+ python fusion_defrag.py --arena 'pentatonic-team%' --surname vickers --json out.json
42
+ """
43
+ from __future__ import annotations
44
+
45
+ import argparse
46
+ import json
47
+ import re
48
+ import sys
49
+ import uuid
50
+ from collections import defaultdict
51
+
52
+
53
def _connect(dsn: str):
    """Open a psycopg connection to `dsn` whose cursors return rows as dicts.

    psycopg is imported lazily so the module can be imported (e.g. to use the
    pure name helpers) on a machine without the driver installed.
    """
    import psycopg
    import psycopg.rows

    return psycopg.connect(dsn, row_factory=psycopg.rows.dict_row)
57
+
58
+
59
+ _TOKEN_RE = re.compile(r"[^a-z0-9]+")
60
+ _EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
61
+
62
+
63
+ def name_tokens(name: str) -> list[str]:
64
+ return [t for t in _TOKEN_RE.split((name or "").lower()) if t]
65
+
66
+
67
def digit_ratio(name: str) -> float:
    """Return the fraction of non-whitespace characters in `name` that are digits.

    None / empty / whitespace-only input yields 0.0.
    """
    compact = re.sub(r"\s+", "", name or "")
    if not compact:
        return 0.0
    return sum(ch.isdigit() for ch in compact) / len(compact)
70
+
71
+
72
# Honorifics/titles are NOT first names — strip them so "Herr Johann Boedecker"
# matches "Johann Boedecker", and a title-only "Herr Boedecker" reduces to a bare
# surname (held for review, not merged on the title).
_TITLES = {"herr", "frau", "fr", "dr", "prof", "mr", "mrs", "ms", "miss",
           "sir", "dame", "mx", "mme", "mlle", "hr"}


def first_name_tokens(name: str, surname: str) -> list[str]:
    """Name tokens minus the surname (first occurrence) and minus honorifics.

    `surname` is compared case-insensitively. Only the first occurrence of the
    surname is removed, so a repeated token survives in the result. An empty
    result means the name is a bare surname (possibly with a title).
    """
    surname = (surname or "").lower()
    remaining: list[str] = []
    surname_dropped = False
    for tok in name_tokens(name):
        if not surname_dropped and tok == surname:
            surname_dropped = True
        elif tok not in _TITLES:
            remaining.append(tok)
    return remaining
91
+
92
+
93
def first_names_compatible(a: list[str], b: list[str]) -> bool:
    """Return True iff the leading given-name tokens of `a` and `b` don't conflict.

    Only the first token of each list is compared. Equal / initial-of /
    prefix-of are compatible; two distinct full names are not. Empty (bare
    surname) is handled separately by the caller — NOT here — so an empty list
    on either side is always incompatible.

    NOTE: the relation is not transitive ("w" is compatible with both
    "william" and "walter", which conflict with each other).
    """
    if not a or not b:
        return False  # bare names never auto-bridge via this predicate
    x, y = a[0], b[0]
    if x == y:
        return True
    # Order so `short` is the candidate initial/prefix of `long`; on a length
    # tie the names differ, so neither can be a prefix of the other.
    short, long_ = (x, y) if len(x) <= len(y) else (y, x)
    if not long_.startswith(short):
        return False
    # initial ↔ full (w / william), or nickname/prefix (will / william) —
    # a prefix must be >=3 chars to avoid junk matches.
    return len(short) == 1 or len(short) >= 3
111
+
112
+
113
class UnionFind:
    """Minimal disjoint-set over a fixed collection of hashable ids."""

    def __init__(self, ids):
        # Every id starts as the root of its own singleton set.
        self.p = {i: i for i in ids}

    def find(self, i):
        """Return the root of the set containing `i` (with path halving)."""
        parent = self.p
        while parent[i] != i:
            parent[i] = parent[parent[i]]  # point at the grandparent
            i = parent[i]
        return i

    def union(self, a, b):
        """Merge the sets containing `a` and `b`; `b`'s root becomes the root."""
        root_a = self.find(a)
        root_b = self.find(b)
        if root_a != root_b:
            self.p[root_a] = root_b
127
+
128
+
129
def has_email(ent: dict) -> bool:
    """Return True if the entity has an email: a truthy `attributes.email`, or
    any string alias that is shaped like an email address."""
    attrs = ent.get("attributes") or {}
    if isinstance(attrs, dict) and attrs.get("email"):
        return True
    for alias in ent.get("aliases") or []:
        if isinstance(alias, str) and _EMAIL_RE.match(alias):
            return True
    return False
134
+
135
+
136
def master_score(ent: dict, surname: str) -> float:
    """Score how suitable `ent` is as the canonical (master) node of a cluster.

    Additive signals:
      +3.0  has an email (attribute or email-shaped alias)
      +2.0  canonical name has >= 2 name tokens
      +0.2  per provenance event, capped at 20 events
      +0.05 per fact (`fact_n`), capped at 40 facts
      -3.0  no first-name tokens once surname and titles are removed
      -5.0  more than half of the name's characters are digits
    """
    canonical = ent["canonical_name"]
    score = 0.0
    if has_email(ent):
        score += 3.0
    if len(name_tokens(canonical)) >= 2:
        score += 2.0
    score += min(len(ent.get("provenance_event_ids") or []), 20) * 0.2
    # `or 0` tolerates a row where fact_n is present but NULL.
    score += min(ent.get("fact_n") or 0, 40) * 0.05
    if not first_name_tokens(canonical, surname):  # bare surname
        score -= 3.0
    if digit_ratio(canonical) > 0.5:
        score -= 5.0
    return score
150
+
151
+
152
+ def _union(*lists):
153
+ seen = {}
154
+ for lst in lists:
155
+ for x in lst or []:
156
+ seen.setdefault(x, None)
157
+ return list(seen.keys())
158
+
159
+
160
def apply_cluster(cur, conn, arena: str, master: dict, losers: list[dict]) -> dict:
    """Fold `losers` into `master` within ONE transaction.

    Inline of the reviewed fusion_drive _execute_entity_plan +
    build_entity_merge_plan: repoint facts (subject/object) and relationships
    (endpoints, summing weight on the post-repoint (from,to,type) collision),
    accrete aliases+provenance onto the master, write one entity_merges audit
    row per loser (rollback_payload = full loser row), then delete the losers.

    All reads AND writes happen inside the `conn.transaction()` block. When the
    caller has no transaction open on entry, the block is a top-level
    transaction that commits on exit (or rolls back on exception), so each
    cluster is atomic on its own. If the caller does have one open, psycopg
    turns the block into a savepoint and the caller owns the final commit.

    Re-validates that the master and every loser still exist first.

    Args:
        cur: cursor on `conn` returning dict rows.
        conn: psycopg connection.
        arena: exact arena of the cluster.
        master: full entity row that survives.
        losers: full entity rows to fold into the master and delete.

    Returns:
        {"applied": False, "reason": ...} when the cluster is stale, otherwise
        {"applied": True, "facts_repointed", "rels_repointed",
        "rel_collisions", "tombstoned"}.
    """
    master_id = master["id"]
    loser_ids = [loser["id"] for loser in losers]
    lset = set(loser_ids)
    # Load edges/facts touching the losers AND THE MASTER. Including the master is
    # load-bearing for relationships: a loser edge repointed onto the master can
    # collide with an edge the master ALREADY has (or another loser's) on the
    # UNIQUE(arena,from,to,type) key — if we don't see the master's existing edges
    # in collision detection, the repoint UPDATE hits a duplicate-key violation
    # (caught the hard way on the first Vickers apply; the txn rolled back clean).
    # Facts have no such unique key, so master facts are loaded but never repointed
    # (repoint decisions key on the loser set only).
    targets = loser_ids + [master_id]

    aliases = _union(master.get("aliases") or [],
                     [loser["canonical_name"] for loser in losers],
                     *[loser.get("aliases") or [] for loser in losers])
    aliases = [a for a in aliases if a != master["canonical_name"]]
    provenance = _union(master.get("provenance_event_ids") or [],
                        *[loser.get("provenance_event_ids") or [] for loser in losers])

    def rk(r):
        """(from, to, type) key the edge will have AFTER losers → master."""
        frm = master_id if r["from_entity_id"] in lset else r["from_entity_id"]
        to = master_id if r["to_entity_id"] in lset else r["to_entity_id"]
        return (frm, to, r["relationship_type"])

    with conn.transaction():
        cur.execute("SELECT id FROM entities WHERE id = ANY(%s)", (targets,))
        live = {r["id"] for r in cur.fetchall()}
        if not lset <= live:
            return {"applied": False, "reason": "stale: some losers already gone"}
        if master_id not in live:
            # Repointing onto a vanished master would orphan every fact/edge.
            return {"applied": False, "reason": "stale: master already gone"}

        cur.execute(
            "SELECT id, subject_entity_id, object_entity_id FROM facts "
            "WHERE arena = %s AND (subject_entity_id = ANY(%s) OR object_entity_id = ANY(%s))",
            (arena, targets, targets),
        )
        facts = cur.fetchall()
        cur.execute(
            "SELECT id, from_entity_id, to_entity_id, relationship_type, weight, "
            "provenance_event_ids FROM relationships "
            "WHERE arena = %s AND (from_entity_id = ANY(%s) OR to_entity_id = ANY(%s))",
            (arena, targets, targets),
        )
        rels = cur.fetchall()

        fact_subj = [f["id"] for f in facts if f["subject_entity_id"] in lset]
        fact_obj = [f["id"] for f in facts if f["object_entity_id"] in lset]

        # Plan the relationship moves. The first edge seen for a post-repoint
        # key survives ("keep"); every later edge with the same key is dropped
        # and its weight + provenance are ACCUMULATED onto the keep, so three or
        # more colliding edges sum correctly instead of the last one winning.
        by_key = {}      # post-repoint key -> surviving edge row
        keep_totals = {}  # keep edge id -> {"weight", "provenance"} running totals
        rel_drops, rel_repoints = [], []
        for r in rels:
            key = rk(r)
            keep = by_key.get(key)
            if keep is None:
                by_key[key] = r
                if r["from_entity_id"] in lset or r["to_entity_id"] in lset:
                    rel_repoints.append(r["id"])
                continue
            totals = keep_totals.setdefault(keep["id"], {
                "weight": keep.get("weight") or 1.0,
                "provenance": list(keep.get("provenance_event_ids") or []),
            })
            totals["weight"] = totals["weight"] + (r.get("weight") or 1.0)
            totals["provenance"] = _union(totals["provenance"],
                                          r.get("provenance_event_ids") or [])
            rel_drops.append(r["id"])

        cur.execute("UPDATE entities SET aliases=%s, provenance_event_ids=%s, last_seen=NOW() "
                    "WHERE id=%s", (aliases, provenance, master_id))
        for fid in fact_subj:
            cur.execute("UPDATE facts SET subject_entity_id=%s WHERE id=%s", (master_id, fid))
        for fid in fact_obj:
            cur.execute("UPDATE facts SET object_entity_id=%s WHERE id=%s", (master_id, fid))
        # DELETE colliding edges BEFORE repointing — else repointing a "keep" edge
        # onto the master collides with the not-yet-deleted "drop" on the UNIQUE
        # (arena,from,to,type) key. Carry each drop's weight+provenance onto its keep.
        for keep_id, totals in keep_totals.items():
            cur.execute("UPDATE relationships SET weight=%s, provenance_event_ids=%s WHERE id=%s",
                        (round(totals["weight"], 4), totals["provenance"], keep_id))
        for rid in rel_drops:
            cur.execute("DELETE FROM relationships WHERE id=%s", (rid,))
        for rid in rel_repoints:
            cur.execute(
                "UPDATE relationships SET "
                "from_entity_id = CASE WHEN from_entity_id = ANY(%s) THEN %s ELSE from_entity_id END, "
                "to_entity_id = CASE WHEN to_entity_id = ANY(%s) THEN %s ELSE to_entity_id END "
                "WHERE id=%s",
                (loser_ids, master_id, loser_ids, master_id, rid))
        # NOTE(review): facts_repointed / relationships_repointed on each audit
        # row are the CLUSTER totals, not per-loser counts — confirm that is what
        # the entity_merges consumers expect.
        for loser in losers:
            cur.execute(
                "INSERT INTO entity_merges (id, arena, canonical_id, deprecated_id, "
                "deprecated_canonical_name, deprecated_aliases, merge_signal, "
                "facts_repointed, relationships_repointed, merged_by, rollback_payload) "
                "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s::jsonb)",
                ("em_" + uuid.uuid4().hex[:20], arena, master_id, loser["id"],
                 loser["canonical_name"], loser.get("aliases") or [], "heuristic",
                 len(fact_subj) + len(fact_obj), len(rel_repoints), "fusion-defrag",
                 json.dumps(loser, default=str)))
        cur.execute("DELETE FROM entities WHERE id = ANY(%s)", (loser_ids,))
    return {"applied": True, "facts_repointed": len(fact_subj) + len(fact_obj),
            "rels_repointed": len(rel_repoints), "rel_collisions": len(rel_drops),
            "tombstoned": len(loser_ids)}
262
+
263
+
264
def _incompatible_pairs(members: list[dict], given: dict) -> list[list[str]]:
    """Return the canonical-name pairs within one cluster whose first names
    conflict. `given` maps entity id -> first-name tokens. Non-empty means the
    cluster was only connected transitively (e.g. "W" bridging William/Walter)."""
    conflicts = []
    for i, a in enumerate(members):
        for b in members[i + 1:]:
            if not first_names_compatible(given[a["id"]], given[b["id"]]):
                conflicts.append([a["canonical_name"], b["canonical_name"]])
    return conflicts


def main() -> int:
    """Propose (default) or apply (--apply --i-have-a-snapshot) same-surname
    PERSON merges for one arena filter, then print/write the report.

    Clusters are built with union-find over pairwise-compatible first names, but
    a cluster is only proposed when EVERY pair in it is compatible: first-name
    compatibility is not transitive, so an initial or short prefix can chain two
    different people into one component. Such clusters are held for review.

    Returns the process exit code: 0 on success, 2 on refused/invalid arguments.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--arena", required=True, help="arena LIKE filter (REQUIRED)")
    ap.add_argument("--surname", required=True, help="surname token to scope (e.g. vickers)")
    ap.add_argument("--pg-dsn", default="", help="Postgres DSN (or PG_DSN env)")
    ap.add_argument("--json", help="write the proposal as JSON")
    ap.add_argument("--apply", action="store_true",
                    help="EXECUTE the merges (default: dry-run). Requires --i-have-a-snapshot.")
    ap.add_argument("--i-have-a-snapshot", action="store_true",
                    help="operator asserts a DB/row snapshot exists for rollback (required with --apply)")
    args = ap.parse_args()
    if args.apply and not args.i_have_a_snapshot:
        print("REFUSED: --apply requires --i-have-a-snapshot (take a snapshot first; "
              "merges repoint facts/rels + tombstone nodes — entity_merges holds rollback "
              "payloads but a row/DB snapshot is the real safety net).", file=sys.stderr)
        return 2

    import os
    dsn = args.pg_dsn or os.environ.get("PG_DSN", "")
    if not dsn:
        print("FATAL: no --pg-dsn / PG_DSN", file=sys.stderr)
        return 2
    # The surname must be exactly one name token: it is matched against
    # name_tokens() output below, so anything else could never match and the run
    # would silently report 0 fragments. It is also interpolated into a regex.
    surname_tokens = name_tokens(args.surname)
    if len(surname_tokens) != 1:
        print(f"FATAL: --surname must be a single [a-z0-9] name token, got {args.surname!r}",
              file=sys.stderr)
        return 2
    surname = surname_tokens[0]

    proposals = []
    held = []
    bare_note = ""
    with _connect(dsn) as conn:
        with conn.cursor() as cur:
            if not args.apply:
                cur.execute("SET default_transaction_read_only = on")  # dry-run safety
            cur.execute("SET max_parallel_workers_per_gather = 0")
            # Person fragments whose CANONICAL NAME carries the surname as a real
            # token. Critically NOT alias-scoped: an @vickers-oil.com email domain
            # in aliases would otherwise drag in unrelated employees (Paul Vann,
            # Matt Tooze) who merely WORK at a Vickers company — the over-merge the
            # first dry-run caught. Email-named stubs (canonical_name has '@') are
            # also excluded: tokenizing an email is not a safe surname signal (they
            # converge via the alias-resolution path instead).
            cur.execute(
                """
                SELECT e.id, e.arena, e.canonical_name, e.aliases, e.provenance_event_ids,
                       e.attributes, e.last_seen,
                       (SELECT count(*) FROM facts f
                         WHERE f.provenance_event_ids && e.provenance_event_ids
                           AND (f.subject_entity_id = e.id OR f.object_entity_id = e.id)) AS fact_n
                FROM entities e
                WHERE e.arena LIKE %s AND e.entity_type = 'person'
                  AND position('@' in e.canonical_name) = 0
                  AND lower(e.canonical_name) ~ %s
                """,
                (args.arena, rf"(^|[^a-z]){surname}([^a-z]|$)"),
            )
            ents = cur.fetchall()
            # Confirm the surname is a real NAME token (regex is a coarse guard).
            ents = [e for e in ents if surname in name_tokens(e["canonical_name"])]
            print(f"[defrag] arena={args.arena} surname={surname!r}: "
                  f"{len(ents)} person fragments")
            if not ents:
                return 0

            # Tokenise each name once; reused by clustering and validation.
            given = {e["id"]: first_name_tokens(e["canonical_name"], surname) for e in ents}
            non_bare = [e for e in ents if given[e["id"]]]
            bare = [e for e in ents if not given[e["id"]]]

            # Union-find over compatible non-bare names.
            uf = UnionFind([e["id"] for e in non_bare])
            for i, a in enumerate(non_bare):
                for b in non_bare[i + 1:]:
                    # Never union across exact arenas — entity id = hash(arena|
                    # type|name), so cross-arena same-name nodes are genuinely
                    # different scoped entities; merging them would be wrong.
                    if a["arena"] != b["arena"]:
                        continue
                    if first_names_compatible(given[a["id"]], given[b["id"]]):
                        uf.union(a["id"], b["id"])
            clusters: dict[str, list[dict]] = defaultdict(list)
            for e in non_bare:
                clusters[uf.find(e["id"])].append(e)

            # A component is only one real person if ALL its members agree with
            # each other. Anything connected merely through a bridge is ambiguous
            # and is held for human review instead of being merged.
            ambiguous = set()
            for cid, members in clusters.items():
                conflicts = _incompatible_pairs(members, given)
                if conflicts:
                    ambiguous.add(cid)
                    held.append({"arena": members[0]["arena"],
                                 "names": [m["canonical_name"] for m in members],
                                 "conflicts": conflicts})

            # Bare surnames: fold in ONLY if exactly one non-bare cluster exists
            # (and that cluster is itself unambiguous).
            if bare:
                if len(clusters) == 1 and not ambiguous:
                    only = next(iter(clusters))
                    cl_arena = clusters[only][0]["arena"]
                    same = [b for b in bare if b["arena"] == cl_arena]
                    clusters[only].extend(same)
                    bare_note = f"{len(same)} bare-'{surname}' node(s) folded into the single cluster"
                elif len(clusters) == 1:
                    bare_note = (f"{len(bare)} bare-'{surname}' node(s) LEFT FOR REVIEW "
                                 f"(the only name-cluster is itself ambiguous)")
                else:
                    bare_note = (f"{len(bare)} bare-'{surname}' node(s) LEFT FOR REVIEW "
                                 f"({len(clusters)} distinct name-clusters — ambiguous which person)")

            for cid, members in clusters.items():
                if len(members) < 2 or cid in ambiguous:
                    continue
                master = max(members, key=lambda e: (master_score(e, surname),
                                                     len(e.get("provenance_event_ids") or []),
                                                     len(e["canonical_name"])))
                losers = [e for e in members if e["id"] != master["id"]]
                loser_ids = [loser["id"] for loser in losers]
                # Repoint impact (read-only counts).
                cur.execute(
                    "SELECT count(*) AS n FROM facts WHERE arena LIKE %s AND "
                    "(subject_entity_id = ANY(%s) OR object_entity_id = ANY(%s))",
                    (args.arena, loser_ids, loser_ids),
                )
                facts_repointed = cur.fetchone()["n"]
                cur.execute(
                    "SELECT count(*) AS n FROM relationships WHERE arena LIKE %s AND "
                    "(from_entity_id = ANY(%s) OR to_entity_id = ANY(%s))",
                    (args.arena, loser_ids, loser_ids),
                )
                rels_repointed = cur.fetchone()["n"]
                proposals.append({
                    "arena": master["arena"],
                    "master_row": master,  # full row for apply
                    "loser_rows": losers,
                    "master": {"id": master["id"], "name": master["canonical_name"],
                               "facts": master.get("fact_n", 0),
                               "prov": len(master.get("provenance_event_ids") or []),
                               "email": has_email(master),
                               "score": round(master_score(master, surname), 2)},
                    "losers": [{"id": loser["id"], "name": loser["canonical_name"],
                                "facts": loser.get("fact_n", 0),
                                "prov": len(loser.get("provenance_event_ids") or []),
                                "email": has_email(loser)} for loser in losers],
                    "facts_repointed": facts_repointed,
                    "rels_repointed": rels_repointed,
                })

            if args.apply and proposals:
                conn.rollback()  # end the read-only probe txn cleanly before writes
                print(f"\n[defrag] APPLYING {len(proposals)} cluster(s) — arena-scoped, transactional…")
                for p in proposals:
                    p["apply_result"] = apply_cluster(cur, conn, p["arena"],
                                                      p["master_row"], p["loser_rows"])
                    # Make each cluster durable before the next one starts, so a
                    # later failure cannot undo merges already reported as applied.
                    conn.commit()
                    print(f"  master={p['master']['name']!r} ({p['arena']}): {p['apply_result']}")

    # ---- report ----
    # Deliberately outside the connection block: the DB work is committed and the
    # connection closed before any reporting/JSON error can occur.
    mode = "APPLIED" if args.apply else "PROPOSED (dry-run, no writes)"
    print(f"\n=== {mode} MERGES — surname '{surname}' ===")
    if bare_note:
        print(f"  note: {bare_note}")
    for h in held:
        print(f"  held for review ({h['arena']}): {h['names']} — "
              f"conflicting first names {h['conflicts']}")
    if not proposals:
        print("  (no multi-node clusters — nothing to merge)")
    tot_dep = tot_f = tot_r = 0
    for i, p in enumerate(proposals, 1):
        m = p["master"]
        print(f"\n[{i}] MASTER  ← {m['name']!r}  ({m['id'][:10]}…) "
              f"score={m['score']} facts={m['facts']} prov={m['prov']} email={m['email']}")
        for loser in p["losers"]:
            print(f"      merge: {loser['name']!r}  ({loser['id'][:10]}…) "
                  f"facts={loser['facts']} prov={loser['prov']} email={loser['email']}")
        print(f"      → would repoint {p['facts_repointed']} facts, "
              f"{p['rels_repointed']} relationships onto the master; "
              f"{len(p['losers'])} node(s) tombstoned")
        tot_dep += len(p["losers"])
        tot_f += p["facts_repointed"]
        tot_r += p["rels_repointed"]
    tail = ("APPLIED — rollback via entity_merges (merged_by='fusion-defrag') + snapshot."
            if args.apply else "DRY-RUN — nothing written.")
    print(f"\nTOTAL: {len(proposals)} cluster(s), {tot_dep} nodes tombstoned, "
          f"{tot_f} facts + {tot_r} relationships repointed. {tail}")
    if args.json:
        with open(args.json, "w", encoding="utf-8") as f:
            json.dump({"surname": surname, "arena": args.arena, "applied": args.apply,
                       "bare_note": bare_note,
                       "held_for_review": held,
                       "proposals": [{k: v for k, v in p.items()
                                      if k not in ("master_row", "loser_rows")}
                                     for p in proposals]}, f, indent=2)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())