@pentatonic-ai/ai-agent-sdk 0.10.1 → 0.10.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,53 @@
1
+ -- pentatonic-memory-engine v2: entity reconciliation audit table.
2
+ --
3
+ -- Backs the entity-reconciliation RFC's backfill (step 3) and the
4
+ -- online merge path. Every time a duplicate entity row is merged
5
+ -- into a canonical row, one record is written here recording:
6
+ --
7
+ -- - which canonical row absorbed the merge
8
+ -- - which row was deprecated (deleted from `entities`)
9
+ -- - what signal triggered the merge (co_occurrence | alias_overlap |
10
+ -- heuristic | online_resolver)
11
+ -- - how many facts / relationships were repointed
12
+ -- - a `rollback_payload` JSONB snapshot of the deprecated row's
13
+ -- state, sufficient to recreate it if the merge proves wrong
14
+ --
15
+ -- The losing row in `entities` IS deleted on merge (otherwise the
16
+ -- alias-GIN + canonical-name lookups still find both, defeating the
17
+ -- whole point). This table is the receipt + rollback substrate.
18
+ --
19
+ -- See RFC: packages/memory-engine-v2/RFC-entity-reconciliation.md §3
20
+
21
-- One receipt per deprecated entity row that was folded into a canonical
-- row. Constraint names below are the ones PostgreSQL would generate for
-- the equivalent inline column constraints.
CREATE TABLE IF NOT EXISTS entity_merges (
    id                        TEXT        NOT NULL,
    arena                     TEXT        NOT NULL,
    -- Row that absorbed the merge. Deleting that entity also deletes its
    -- receipts (ON DELETE CASCADE on the foreign key below).
    canonical_id              TEXT        NOT NULL,
    -- Deliberately not a foreign key: the deprecated row no longer exists
    -- in `entities` once the merge has been applied.
    deprecated_id             TEXT        NOT NULL,
    -- Copies of the deprecated row's identifying fields, kept for forensics.
    deprecated_canonical_name TEXT        NOT NULL,
    deprecated_aliases        TEXT[]      NOT NULL DEFAULT '{}',
    -- Which grouping signal triggered the merge (see CHECK below).
    merge_signal              TEXT        NOT NULL,
    -- How many fact / relationship references were moved to the canonical.
    facts_repointed           INTEGER     NOT NULL DEFAULT 0,
    relationships_repointed   INTEGER     NOT NULL DEFAULT 0,
    merged_at                 TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    -- Audit tag, e.g. 'backfill-YYYY-MM' or 'online-resolver'.
    merged_by                 TEXT        NOT NULL,
    -- Snapshot of the deprecated row's own state (provenance_event_ids
    -- etc.), intended to allow recreating it if the merge is judged wrong.
    rollback_payload          JSONB       NOT NULL DEFAULT '{}'::jsonb,

    CONSTRAINT entity_merges_pkey PRIMARY KEY (id),
    CONSTRAINT entity_merges_canonical_id_fkey
        FOREIGN KEY (canonical_id) REFERENCES entities (id) ON DELETE CASCADE,
    CONSTRAINT entity_merges_merge_signal_check CHECK (
        merge_signal IN ('co_occurrence', 'alias_overlap', 'heuristic', 'online_resolver')
    )
);
41
+
42
-- Canonical-side lookup: list every receipt whose merge landed on a
-- given entity.
CREATE INDEX IF NOT EXISTS idx_entity_merges_canonical
    ON entity_merges USING btree (canonical_id);
45
+
46
-- Deprecated-side lookup: lets a caller holding a stale entity id find
-- the receipt that says which canonical row it was merged into.
CREATE INDEX IF NOT EXISTS idx_entity_merges_deprecated
    ON entity_merges USING btree (deprecated_id);
50
+
51
-- Newest-first audit listing within one arena (e.g. dry-run reports).
CREATE INDEX IF NOT EXISTS idx_entity_merges_arena_merged_at
    ON entity_merges USING btree (arena, merged_at DESC);
@@ -0,0 +1,60 @@
1
+ -- pentatonic-memory-engine v2: distillation trace audit table.
2
+ --
3
+ -- Append-only log of raw LLM teacher I/O per event distillation. We
4
+ -- already keep the *parsed* output as facts/entities/relationships,
5
+ -- but the org-model's normal-form storage is post-merge and
6
+ -- post-filter — perfect for retrieval, wrong shape for training a
7
+ -- student model on the teacher's distribution.
8
+ --
9
+ -- This table is the teacher-distribution record:
10
+ --
11
+ -- user_prompt = the per-event block we fed into the LLM (i.e.
12
+ -- build_event_block(i, ev) — what the model SAW)
13
+ -- raw_response = the per-event slice of the model's pipe-delimited
14
+ -- output (i.e. everything between `=== event K ===`
15
+ -- and the next header — what the model PRODUCED)
16
+ --
17
+ -- Together they form a (input, output) pair suitable for fine-tuning
18
+ -- a seq2seq student (BART/FLAN-T5) to mimic the teacher's extraction
19
+ -- behaviour. `system_prompt_hash` lets us segment training data by
20
+ -- teacher version: when the BATCH_SYSTEM_PROMPT changes, don't train
21
+ -- a student on outputs produced under the old prompt.
22
+ --
23
+ -- Forget semantics: ON DELETE CASCADE from events. A FORGET_MEMORY
24
+ -- that deletes the source event also removes its trace — training
25
+ -- data inherits the same right-to-erasure contract as the rest of
26
+ -- the org-model.
27
+
28
-- Append-only (input, output) record of the teacher LLM per distilled
-- event. Constraint names below are the ones PostgreSQL would generate
-- for the equivalent inline column constraints.
CREATE TABLE IF NOT EXISTS distillation_traces (
    id                 BIGSERIAL   NOT NULL,
    -- Source event; deleting the event removes its trace as well
    -- (ON DELETE CASCADE on the foreign key below).
    event_id           TEXT        NOT NULL,

    -- Teacher I/O for this single event. Per the writer's contract both
    -- are bounded by MAX_CONTENT_CHARS + LLM_MAX_TOKENS_PER_EVENT at write
    -- time, which keeps rows at roughly 2-3KB.
    user_prompt        TEXT        NOT NULL,
    raw_response       TEXT        NOT NULL,

    -- Teacher identity, used to filter training data when the prompt or
    -- model changes. The hash is sha256(BATCH_SYSTEM_PROMPT) truncated to
    -- its first 16 characters.
    llm_model          TEXT        NOT NULL,
    system_prompt_hash TEXT        NOT NULL,

    -- Latency of the whole LLM call (one call distills several events),
    -- so the value is chunk-level rather than per-event. Nullable.
    llm_chunk_ms       REAL,

    created_at         TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    CONSTRAINT distillation_traces_pkey PRIMARY KEY (id),
    CONSTRAINT distillation_traces_event_id_fkey
        FOREIGN KEY (event_id) REFERENCES events (id) ON DELETE CASCADE
);
51
+
52
+ CREATE INDEX IF NOT EXISTS idx_distillation_traces_event_id
53
+ ON distillation_traces(event_id);
54
+
55
+ CREATE INDEX IF NOT EXISTS idx_distillation_traces_created_at
56
+ ON distillation_traces(created_at DESC);
57
+
58
+ -- Segment by teacher version when exporting training data.
59
+ CREATE INDEX IF NOT EXISTS idx_distillation_traces_prompt_hash
60
+ ON distillation_traces(system_prompt_hash);
@@ -0,0 +1,581 @@
1
+ #!/usr/bin/env python3
2
+ """Backfill: merge fragmented person entities (and optionally other
3
+ types) into a single canonical row per person.
4
+
5
+ Implements RFC §3 (backfill merge). Dry-run by default; --apply
6
+ required to actually write. One arena at a time. Audit records every
7
+ merge to the `entity_merges` table with enough state in
8
+ `rollback_payload` to recreate the deprecated row if a merge proves
9
+ wrong.
10
+
11
+ Grouping signal priority (per RFC):
12
+
13
+ 1. CO-OCCURRENCE — events whose `attributes` carry both a name and
14
+ an email for the same person (gmail from_name+from_email,
15
+ calendar attendee displayName+email, slack profile rows). This is
16
+ ground-truth pairing.
17
+
18
+ 2. ALIAS_OVERLAP — entity rows where A.canonical_name appears in
19
+ B.aliases (or vice versa). Catches post-fix merges still missing
20
+ from the legacy backfill.
21
+
22
+ 3. HEURISTIC — (OFF by default; --heuristic-merge to enable)
23
+ local-part vs name-tokens overlap for cases where no event ever
24
+ paired the surface forms. Risky; only with explicit operator
25
+ flag + manual audit of the dry-run.
26
+
27
+ Usage:
28
+
29
+ python3 backfill_entity_reconciliation.py \\
30
+ --arena <arena-id> \\
31
+ --pg-dsn postgresql://... \\
32
+ [--apply] # write; default is dry-run
33
+ [--entity-type person] # which type to reconcile; default person
34
+ [--heuristic-merge] # enable signal 3 (off by default)
35
+ [--out /tmp/merges.jsonl] # where to write the merge report
36
+
37
+ Exit codes:
38
+ 0 success (dry-run report written, or --apply completed)
39
+ 1 partial failure (some merges failed; the first 20 errors are printed to stdout)
40
+ 2 bad arguments
41
+ """
42
+
43
+ from __future__ import annotations
44
+
45
+ import argparse
46
+ import hashlib
47
+ import json
48
+ import os
49
+ import re
50
+ import sys
51
+ import unicodedata
52
+ from collections import defaultdict
53
+ from dataclasses import dataclass, field
54
+ from datetime import datetime, timezone
55
+
56
+ import psycopg
57
+ import psycopg.rows
58
+
59
+
60
+ # ----------------------------------------------------------------------
61
+ # Constants & helpers
62
+ # ----------------------------------------------------------------------
63
+
64
+ # Person-shaped attribute keys to look up paired (name, email) on
65
+ # events. Producer-agnostic: any source that follows the conventional
66
+ # `<role>_email` / `<role>_name` shape contributes co-occurrence pairs.
67
# (email attribute key, name attribute key) pairs probed on each event.
# The organizer email is listed twice because its display name may live
# under either `organizer_name` or `organizer_display_name`.
PERSON_ROLE_PAIRS = [
    ("from_email",      "from_name"),
    ("to_email",        "to_name"),
    ("cc_email",        "cc_name"),
    ("reply_to_email",  "reply_to_name"),
    ("sender_email",    "sender_name"),
    ("organizer_email", "organizer_name"),
    ("organizer_email", "organizer_display_name"),
]

# Finds an email address embedded in free text; group 1 is the address.
EMAIL_RE = re.compile(r"\b([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})\b")

# Captures the local part (text before '@') of an address; the local part
# must begin with an ASCII letter.
HEURISTIC_LOCAL_PART_RE = re.compile(r"^([a-zA-Z][a-zA-Z0-9._\-]*)@")
78
+
79
+
80
+ def _normalize_surface(s: str) -> str:
81
+ """MIRROR of extractor-{sync,async}'s _normalize_surface."""
82
+ return re.sub(r"\s+", " ", unicodedata.normalize("NFKC", s)).strip().lower()
83
+
84
+
85
+ def _looks_like_email(s: str) -> bool:
86
+ return isinstance(s, str) and "@" in s and " " not in s and "." in s.split("@", 1)[-1]
87
+
88
+
89
def _local_part(email: str) -> str:
    """Return the part of `email` before '@' as captured by
    HEURISTIC_LOCAL_PART_RE, or "" when the pattern does not match."""
    found = HEURISTIC_LOCAL_PART_RE.match(email)
    if found is None:
        return ""
    return found.group(1)
92
+
93
+
94
+ def _name_tokens(name: str) -> set[str]:
95
+ """Split a display name into lowercased tokens for heuristic match."""
96
+ return {t for t in re.split(r"[\s\.\-_]+", name.lower()) if t}
97
+
98
+
99
+ def _local_part_tokens(local: str) -> set[str]:
100
+ return {t for t in re.split(r"[\.\-_]+", local.lower()) if t}
101
+
102
+
103
+ # ----------------------------------------------------------------------
104
+ # Data classes
105
+ # ----------------------------------------------------------------------
106
+
107
@dataclass
class Entity:
    """In-memory copy of one `entities` row plus derived matching data."""

    # Columns read straight from the `entities` table.
    id: str
    canonical_name: str
    aliases: list[str]
    provenance_event_ids: list[str]
    # Reference counts; default to 0 until the loader fills them in.
    fact_count: int = 0
    rel_count: int = 0
    # Normalized surface forms (canonical name + aliases) used for fast
    # set-overlap matching. Each instance gets its own empty set by default.
    norm_forms: set[str] = field(default_factory=set)
117
+
118
+
119
@dataclass
class MergeProposal:
    """One group of entities judged to be the same thing."""

    # The row that survives the merge.
    canonical: Entity
    # The rows that would be folded into `canonical` and then deleted.
    deprecated: list[Entity]
    # One of 'co_occurrence' | 'alias_overlap' | 'heuristic'.
    signal: str
124
+
125
+
126
+ # ----------------------------------------------------------------------
127
+ # Load + signal extraction
128
+ # ----------------------------------------------------------------------
129
+
130
def load_entities(conn: psycopg.Connection, arena: str, entity_type: str) -> list[Entity]:
    """Load every entity of `entity_type` in `arena` with reference counts.

    Args:
        conn: open psycopg connection.
        arena: arena id; also scopes the `facts` / `relationships` counts.
        entity_type: value matched against `entities.entity_type`.

    Returns:
        One `Entity` per row, ordered by id. `fact_count` is the number of
        times the entity appears as a fact subject plus the number of times
        it appears as a fact object; `rel_count` is the same for relationship
        endpoints. `norm_forms` holds the normalized canonical name and
        aliases. The counts feed canonical selection (richest row wins).
    """
    # Reference counts are aggregated once per table and LEFT JOINed onto
    # the entity rows, rather than re-evaluated as a correlated scalar
    # subquery for every entity. ORDER BY makes the row order (and with it
    # any first-seen tie-breaking downstream) repeatable between runs.
    query = """
        WITH ent AS (
            SELECT id, canonical_name, aliases, provenance_event_ids
            FROM entities
            WHERE arena = %s AND entity_type = %s
        ),
        fact_refs AS (
            SELECT subject_entity_id AS eid FROM facts
            WHERE arena = %s
            UNION ALL
            SELECT object_entity_id AS eid FROM facts
            WHERE arena = %s AND object_entity_id IS NOT NULL
        ),
        fact_totals AS (
            SELECT eid, COUNT(*) AS n FROM fact_refs GROUP BY eid
        ),
        rel_refs AS (
            SELECT from_entity_id AS eid FROM relationships
            WHERE arena = %s
            UNION ALL
            SELECT to_entity_id AS eid FROM relationships
            WHERE arena = %s
        ),
        rel_totals AS (
            SELECT eid, COUNT(*) AS n FROM rel_refs GROUP BY eid
        )
        SELECT
            ent.id, ent.canonical_name, ent.aliases, ent.provenance_event_ids,
            COALESCE(fact_totals.n, 0) AS fact_count,
            COALESCE(rel_totals.n, 0) AS rel_count
        FROM ent
        LEFT JOIN fact_totals ON fact_totals.eid = ent.id
        LEFT JOIN rel_totals ON rel_totals.eid = ent.id
        ORDER BY ent.id
    """
    out: list[Entity] = []
    with conn.cursor(row_factory=psycopg.rows.dict_row) as cur:
        # Placeholder order: ent (arena, type), facts x2, relationships x2.
        cur.execute(query, (arena, entity_type, arena, arena, arena, arena))
        for row in cur.fetchall():
            aliases = list(row["aliases"] or [])
            forms = {_normalize_surface(row["canonical_name"])}
            forms.update(_normalize_surface(a) for a in aliases)
            out.append(Entity(
                id=row["id"],
                canonical_name=row["canonical_name"],
                aliases=aliases,
                provenance_event_ids=list(row["provenance_event_ids"] or []),
                fact_count=int(row["fact_count"]),
                rel_count=int(row["rel_count"]),
                norm_forms=forms,
            ))
    return out
178
+
179
+
180
def collect_cooccurrence_pairs(
    conn: psycopg.Connection, arena: str
) -> set[tuple[str, str]]:
    """Collect (name, email) pairs that appear together on one event.

    Scans `events.attributes` for the arena. A pair is recorded when both
    values are strings, the email passes `_looks_like_email`, and the name
    is not blank. Two sources are checked per event:

    * the flat `<role>_email` / `<role>_name` keys in PERSON_ROLE_PAIRS;
    * lists of attendee dicts under `attendees` and `attendee_objects`
      (`email` plus `displayName` or `name`).

    Returns:
        Set of (normalized_name, normalized_email) tuples.
    """
    pairs: set[tuple[str, str]] = set()

    def _record(name: object, email: object) -> None:
        # Single place for the "is this a usable pair" rule.
        if (isinstance(email, str) and isinstance(name, str)
                and _looks_like_email(email) and name.strip()):
            pairs.add((_normalize_surface(name), _normalize_surface(email)))

    with conn.cursor(row_factory=psycopg.rows.dict_row) as cur:
        cur.execute(
            "SELECT attributes FROM events WHERE arena = %s",
            (arena,),
        )
        for row in cur:
            attrs = row["attributes"] or {}
            if not isinstance(attrs, dict):
                continue
            for email_key, name_key in PERSON_ROLE_PAIRS:
                _record(attrs.get(name_key), attrs.get(email_key))
            # Structured attendee objects (calendar producers). Both keys
            # are scanned independently: previously any non-empty
            # `attendees` value (presumably e.g. a plain list of address
            # strings -- verify against producers) masked the structured
            # list under `attendee_objects`.
            for attendee_key in ("attendees", "attendee_objects"):
                attendees = attrs.get(attendee_key)
                if not isinstance(attendees, list):
                    continue
                for a in attendees:
                    if isinstance(a, dict):
                        _record(a.get("displayName") or a.get("name"),
                                a.get("email"))
    return pairs
214
+
215
+
216
+ # ----------------------------------------------------------------------
217
+ # Merge proposal building
218
+ # ----------------------------------------------------------------------
219
+
220
def build_proposals(
    entities: list[Entity],
    cooccurrence_pairs: set[tuple[str, str]],
    use_heuristic: bool,
) -> list[MergeProposal]:
    """Group entities with union-find and emit one proposal per group.

    Entities are connected by, in priority order:
      1. co-occurrence pairs (a name form and an email form seen together)
      2. alias overlap (any shared normalized surface form)
      3. heuristic email-local-part vs name-token overlap (only when
         `use_heuristic` is true)

    Args:
        entities: candidate rows; `norm_forms` must already be populated.
        cooccurrence_pairs: (normalized_name, normalized_email) tuples.
        use_heuristic: enable signal 3.

    Returns:
        One MergeProposal per group with >= 2 entities. The canonical is the
        richest row (most facts, then relationships, then provenance events,
        then smallest id). `signal` is the highest-priority signal recorded
        on any union inside the group.
    """
    # parent: entity-id -> root-id (union-find with path halving).
    parent: dict[str, str] = {e.id: e.id for e in entities}
    # Signal recorded for each union that actually joined two groups, keyed
    # by the pair of roots at that moment; resolved per group at the end.
    edge_signal: dict[frozenset, str] = {}

    def find(x: str) -> str:
        while parent[x] != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x

    def union(a: str, b: str, signal: str) -> None:
        ra, rb = find(a), find(b)
        if ra == rb:
            # Already connected; the earlier (higher-priority) signal stands.
            return
        parent[ra] = rb
        edge_signal[frozenset({ra, rb})] = signal

    # ---- Signal 1: co-occurrence pairs ---------------------------------
    # Map each normalized form to one entity id, then union the entities
    # behind the name form and the email form of every pair.
    form_to_entity: dict[str, str] = {}
    for e in entities:
        for f in e.norm_forms:
            # First entity to claim a form wins; entities sharing a form are
            # unioned by signal 2 below anyway.
            form_to_entity.setdefault(f, e.id)
    for n_name, n_email in cooccurrence_pairs:
        eid_name = form_to_entity.get(n_name)
        eid_email = form_to_entity.get(n_email)
        if eid_name and eid_email and eid_name != eid_email:
            union(eid_name, eid_email, "co_occurrence")

    # ---- Signal 2: alias overlap ---------------------------------------
    # Any two entities sharing a normalized form are the same entity.
    form_to_entities: dict[str, set[str]] = defaultdict(set)
    for e in entities:
        for f in e.norm_forms:
            form_to_entities[f].add(e.id)
    for ents in form_to_entities.values():
        if len(ents) <= 1:
            continue
        ents_list = sorted(ents)
        for other in ents_list[1:]:
            union(ents_list[0], other, "alias_overlap")

    # ---- Signal 3 (optional): heuristic local-part vs name tokens ------
    if use_heuristic:
        # Last resort; can produce false positives (e.g. 'sam' matches
        # 'Sam Patel' AND 'Sam Jones'). Operator MUST eyeball the dry-run.
        # NOTE(review): every email-shaped canonical is considered here,
        # including ones already grouped by signal 1 or 2; only candidates
        # already in the same group are skipped.
        email_entities: list[Entity] = [
            e for e in entities
            if _looks_like_email(e.canonical_name)
        ]
        name_entities_by_token: dict[str, list[Entity]] = defaultdict(list)
        for e in entities:
            if not _looks_like_email(e.canonical_name):
                for t in _name_tokens(e.canonical_name):
                    name_entities_by_token[t].append(e)
        for ee in email_entities:
            local = _local_part(ee.canonical_name)
            tokens = _local_part_tokens(local)
            candidates: dict[str, int] = defaultdict(int)
            for t in tokens:
                for ne in name_entities_by_token.get(t, []):
                    if find(ee.id) == find(ne.id):
                        continue
                    candidates[ne.id] += 1
            # A candidate qualifies with >= 2 overlapping tokens, OR with a
            # single overlapping token when the local part is exactly one
            # token of >= 3 chars and it is the only candidate.
            qualified = [
                (-hits, cand_id)
                for cand_id, hits in candidates.items()
                if hits >= 2 or (hits == 1 and len(tokens) == 1
                                 and len(next(iter(tokens))) >= 3
                                 and len(candidates) == 1)
            ]
            if qualified:
                # Most hits wins; equal scores fall back to the smallest id.
                # Tie-breaking used to follow set/dict iteration order, which
                # varies between processes, so a dry-run and a later --apply
                # could pick different targets.
                union(ee.id, min(qualified)[1], "heuristic")

    # ---- Materialise groups -> proposals -------------------------------
    groups: dict[str, list[Entity]] = defaultdict(list)
    for e in entities:
        groups[find(e.id)].append(e)

    signal_priority = ("co_occurrence", "alias_overlap", "heuristic")
    proposals: list[MergeProposal] = []
    for group in groups.values():
        if len(group) < 2:
            continue
        # Canonical = richest (most facts, then most rels, then most
        # provenance events, then lex-smallest id for determinism).
        group_sorted = sorted(
            group,
            key=lambda e: (-e.fact_count, -e.rel_count,
                           -len(e.provenance_event_ids), e.id),
        )
        canonical = group_sorted[0]
        deprecated = group_sorted[1:]

        # Strongest signal among the unions that touched this group.
        ids = {e.id for e in group}
        signals_in_group = {
            sig for edge, sig in edge_signal.items()
            if edge & ids
        }
        chosen = next((s for s in signal_priority if s in signals_in_group),
                      "alias_overlap")
        proposals.append(MergeProposal(canonical=canonical,
                                       deprecated=deprecated,
                                       signal=chosen))
    return proposals
351
+
352
+
353
+ # ----------------------------------------------------------------------
354
+ # Apply (--apply)
355
+ # ----------------------------------------------------------------------
356
+
357
def apply_proposals(
    conn: psycopg.Connection,
    arena: str,
    proposals: list[MergeProposal],
    merged_by: str,
) -> tuple[int, int, list[str]]:
    """Execute merge proposals, one `conn.transaction()` block per proposal.

    For every deprecated row of a proposal this repoints facts and
    relationships to the canonical row, folds aliases and provenance into
    the canonical, writes an `entity_merges` receipt, carries earlier
    receipts forward, and deletes the deprecated row. A failing proposal is
    rolled back on its own and recorded; the loop then continues.

    Note: the blocks are only independent top-level transactions when the
    connection has no transaction open on entry; otherwise psycopg turns
    them into savepoints of that outer transaction.

    Args:
        conn: open psycopg connection.
        arena: arena id used to scope the fact / relationship updates.
        proposals: merges to perform.
        merged_by: audit tag stored on each receipt.

    Returns:
        (succeeded_count, failed_count, errors) where `errors` holds one
        human-readable line per failed proposal.
    """
    succeeded = 0
    failed = 0
    errors: list[str] = []
    for p in proposals:
        try:
            with conn.transaction():
                with conn.cursor() as cur:
                    # Lock the canonical row + every deprecated row so
                    # concurrent writers cannot touch them mid-merge.
                    # ORDER BY requests a consistent lock order.
                    ids = [p.canonical.id, *(d.id for d in p.deprecated)]
                    cur.execute(
                        "SELECT id FROM entities WHERE id = ANY(%s) "
                        "ORDER BY id FOR UPDATE",
                        (ids,),
                    )
                    # The proposal was built from an earlier snapshot. If any
                    # row has vanished since (deleted or merged elsewhere),
                    # abort rather than repoint data at a stale id.
                    locked_rows = len(cur.fetchall())
                    if locked_rows != len(set(ids)):
                        raise RuntimeError(
                            f"expected {len(set(ids))} entity rows, locked "
                            f"{locked_rows}; a row was deleted or merged "
                            "since the proposal was built"
                        )
                    for dep in p.deprecated:
                        # Repoint facts (subject side, then object side).
                        cur.execute(
                            """
                            UPDATE facts SET subject_entity_id = %s
                            WHERE arena = %s AND subject_entity_id = %s
                            """,
                            (p.canonical.id, arena, dep.id),
                        )
                        facts_repointed = cur.rowcount
                        cur.execute(
                            """
                            UPDATE facts SET object_entity_id = %s
                            WHERE arena = %s AND object_entity_id = %s
                            """,
                            (p.canonical.id, arena, dep.id),
                        )
                        facts_repointed += cur.rowcount
                        # Repoint relationships (from side, then to side).
                        cur.execute(
                            """
                            UPDATE relationships SET from_entity_id = %s
                            WHERE arena = %s AND from_entity_id = %s
                            """,
                            (p.canonical.id, arena, dep.id),
                        )
                        rels_repointed = cur.rowcount
                        cur.execute(
                            """
                            UPDATE relationships SET to_entity_id = %s
                            WHERE arena = %s AND to_entity_id = %s
                            """,
                            (p.canonical.id, arena, dep.id),
                        )
                        rels_repointed += cur.rowcount

                        # Merge aliases + provenance into canonical; the
                        # deprecated canonical name becomes an alias.
                        cur.execute(
                            """
                            UPDATE entities SET
                                aliases = ARRAY(SELECT DISTINCT UNNEST(
                                    aliases || %s::text[] || ARRAY[%s]
                                )),
                                provenance_event_ids = ARRAY(SELECT DISTINCT UNNEST(
                                    provenance_event_ids || %s::text[]
                                )),
                                last_seen = NOW()
                            WHERE id = %s
                            """,
                            (dep.aliases, dep.canonical_name,
                             dep.provenance_event_ids, p.canonical.id),
                        )

                        # Audit + rollback payload: the deprecated row's own
                        # state as loaded for this proposal.
                        rollback_payload = {
                            "id": dep.id,
                            "canonical_name": dep.canonical_name,
                            "aliases": dep.aliases,
                            "provenance_event_ids": dep.provenance_event_ids,
                        }
                        # Deterministic id, so a re-run of the same merge
                        # hits ON CONFLICT instead of duplicating the receipt.
                        merge_id = "m_" + hashlib.sha256(
                            f"{arena}|{dep.id}|{p.canonical.id}".encode()
                        ).hexdigest()[:24]
                        cur.execute(
                            """
                            INSERT INTO entity_merges (
                                id, arena, canonical_id, deprecated_id,
                                deprecated_canonical_name, deprecated_aliases,
                                merge_signal, facts_repointed,
                                relationships_repointed, merged_by, rollback_payload
                            ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s::jsonb)
                            ON CONFLICT (id) DO NOTHING
                            """,
                            (
                                merge_id, arena, p.canonical.id, dep.id,
                                dep.canonical_name, dep.aliases,
                                p.signal, facts_repointed, rels_repointed,
                                merged_by, json.dumps(rollback_payload),
                            ),
                        )

                        # Carry earlier receipts forward. If the deprecated
                        # row was itself the canonical of previous merges,
                        # the ON DELETE CASCADE on entity_merges.canonical_id
                        # would erase those receipts with the DELETE below.
                        cur.execute(
                            """
                            UPDATE entity_merges SET canonical_id = %s
                            WHERE arena = %s AND canonical_id = %s
                            """,
                            (p.canonical.id, arena, dep.id),
                        )

                        # Delete the deprecated row.
                        cur.execute(
                            "DELETE FROM entities WHERE id = %s",
                            (dep.id,),
                        )
            # Counted only once the transaction block exited cleanly.
            succeeded += 1
        except Exception as e:
            # Best-effort batch: record the failure and keep going.
            failed += 1
            errors.append(f"{p.canonical.id} <- {[d.id for d in p.deprecated]}: {e}")
    return succeeded, failed, errors
473
+
474
+
475
+ # ----------------------------------------------------------------------
476
+ # Report writer
477
+ # ----------------------------------------------------------------------
478
+
479
def write_report(proposals: list[MergeProposal], path: str) -> None:
    """Dump the proposals to `path` as JSONL, one proposal per line, for
    the operator to inspect before running with --apply. The file is
    truncated/created even when there are no proposals."""

    def _as_record(proposal: MergeProposal) -> dict:
        winner = proposal.canonical
        losers = []
        for row in proposal.deprecated:
            losers.append({
                "id": row.id,
                "canonical_name": row.canonical_name,
                "aliases": row.aliases,
                "fact_count": row.fact_count,
                "rel_count": row.rel_count,
            })
        return {
            "canonical": {
                "id": winner.id,
                "canonical_name": winner.canonical_name,
                "fact_count": winner.fact_count,
                "rel_count": winner.rel_count,
            },
            "deprecated": losers,
            "signal": proposal.signal,
        }

    with open(path, "w") as f:
        for proposal in proposals:
            line = json.dumps(_as_record(proposal))
            f.write(line + "\n")
502
+
503
+
504
+ # ----------------------------------------------------------------------
505
+ # CLI
506
+ # ----------------------------------------------------------------------
507
+
508
def parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse `sys.argv`.

    The module docstring is reused verbatim as the --help description.
    `--pg-dsn` falls back to the PG_DSN environment variable, read when
    the parser is built.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # What to reconcile and where.
    parser.add_argument(
        "--arena",
        required=True,
        help="arena id to reconcile (one at a time)",
    )
    parser.add_argument(
        "--pg-dsn",
        default=os.environ.get("PG_DSN"),
        help="postgres DSN; defaults to $PG_DSN",
    )
    parser.add_argument(
        "--entity-type",
        default="person",
        help="entity type to reconcile (default: person)",
    )

    # Behaviour switches; both are off unless given.
    parser.add_argument(
        "--apply",
        action="store_true",
        help="actually run the merges; default is dry-run",
    )
    parser.add_argument(
        "--heuristic-merge",
        action="store_true",
        help="enable signal 3 (off by default; risky)",
    )

    # Reporting / audit.
    parser.add_argument(
        "--out",
        default=None,
        help="merge-report JSONL path (default: stdout-only "
             "summary, no jsonl)",
    )
    parser.add_argument(
        "--merged-by",
        default=None,
        help="audit tag (default: backfill-YYYY-MM)",
    )
    return parser.parse_args()
529
+
530
+
531
def main() -> int:
    """CLI entry point: load, propose, report, and optionally apply merges.

    Returns:
        Process exit code: 0 on success (dry-run finished, or --apply
        completed without failures), 1 when at least one merge failed,
        2 when no DSN was supplied.
    """
    args = parse_args()
    if not args.pg_dsn:
        print("error: --pg-dsn (or $PG_DSN) required", file=sys.stderr)
        return 2

    merged_by = args.merged_by or f"backfill-{datetime.now(timezone.utc):%Y-%m}"

    with psycopg.connect(args.pg_dsn, autocommit=False) as conn:
        print(f"[backfill] arena={args.arena} type={args.entity_type} "
              f"apply={args.apply} heuristic={args.heuristic_merge}")
        entities = load_entities(conn, args.arena, args.entity_type)
        print(f"[backfill] loaded {len(entities)} {args.entity_type} entities")

        # Co-occurrence pairs are only collected for person entities.
        cooc = collect_cooccurrence_pairs(conn, args.arena) \
            if args.entity_type == "person" else set()
        print(f"[backfill] collected {len(cooc)} co-occurrence pairs from events")

        proposals = build_proposals(entities, cooc, args.heuristic_merge)
        print(f"[backfill] built {len(proposals)} merge proposals "
              f"({sum(len(p.deprecated) for p in proposals)} rows would deprecate)")

        # Summarise by signal.
        by_signal: dict[str, int] = defaultdict(int)
        for p in proposals:
            by_signal[p.signal] += 1
        for sig, n in sorted(by_signal.items()):
            print(f"  - {sig}: {n} groups")

        if args.out:
            write_report(proposals, args.out)
            print(f"[backfill] wrote merge report → {args.out}")

        if not args.apply:
            print("[backfill] dry-run only; pass --apply to execute")
            return 0

        # The SELECTs above opened an implicit transaction on this
        # non-autocommit connection. While it is open, psycopg treats every
        # `conn.transaction()` block as a SAVEPOINT inside it, so nothing
        # would be durable (and every row lock would be held) until the
        # final commit. Close the read transaction first so each proposal
        # commits on its own.
        conn.commit()

        succeeded, failed, errors = apply_proposals(
            conn, args.arena, proposals, merged_by
        )
        conn.commit()
        print(f"[backfill] applied: {succeeded} succeeded, {failed} failed")
        for err in errors[:20]:
            print(f"  ERR: {err}")
        if len(errors) > 20:
            print(f"  ... and {len(errors) - 20} more (see --out for full report)")
        return 1 if failed else 0
578
+
579
+
580
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())