@pentatonic-ai/ai-agent-sdk 0.10.1 → 0.10.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
- package/packages/memory-engine-v2/compat/server.py +38 -6
- package/packages/memory-engine-v2/extractor-async/Dockerfile +5 -3
- package/packages/memory-engine-v2/extractor-async/entity_id.py +57 -0
- package/packages/memory-engine-v2/extractor-async/sensitive_filter.py +51 -0
- package/packages/memory-engine-v2/extractor-async/test_async_ent_parser.py +258 -0
- package/packages/memory-engine-v2/extractor-async/test_sensitive_filter.py +61 -0
- package/packages/memory-engine-v2/extractor-async/worker.py +276 -42
- package/packages/memory-engine-v2/extractor-sync/Dockerfile +1 -1
- package/packages/memory-engine-v2/extractor-sync/entity_id.py +57 -0
- package/packages/memory-engine-v2/extractor-sync/server.py +231 -55
- package/packages/memory-engine-v2/extractor-sync/test_entity_id.py +88 -0
- package/packages/memory-engine-v2/extractor-sync/test_paired_extraction.py +208 -0
- package/packages/memory-engine-v2/org-model/migrations/002_entity_merges_audit.sql +53 -0
- package/packages/memory-engine-v2/org-model/migrations/003_distillation_traces.sql +60 -0
- package/packages/memory-engine-v2/scripts/backfill_entity_reconciliation.py +581 -0
- package/packages/memory-engine-v2/tests/test_entity_id_parity.py +57 -0
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
-- pentatonic-memory-engine v2: audit trail for entity reconciliation.
--
-- Used by the entity-reconciliation RFC's backfill (step 3) and by the
-- online merge path. Each time a duplicate entity row is folded into a
-- canonical row, one row is written here capturing:
--
--   * the canonical row that absorbed the duplicate
--   * the deprecated row (which is deleted from `entities`)
--   * the signal that triggered the merge
--     (co_occurrence | alias_overlap | heuristic | online_resolver)
--   * how many facts / relationships were repointed
--   * `rollback_payload`: a JSONB snapshot of the deprecated row's state,
--     meant to be enough to recreate it if the merge proves wrong
--
-- The losing row IS deleted from `entities` on merge; otherwise the
-- alias-GIN and canonical-name lookups would still find both rows,
-- defeating the whole point. This table is the receipt and the rollback
-- substrate.
--
-- See RFC: packages/memory-engine-v2/RFC-entity-reconciliation.md §3

CREATE TABLE IF NOT EXISTS entity_merges (
    id                        TEXT        PRIMARY KEY,
    arena                     TEXT        NOT NULL,
    -- NOTE(review): ON DELETE CASCADE removes these audit rows whenever
    -- the canonical entity is itself deleted later (including when it is
    -- deprecated by a subsequent merge) -- confirm that this is intended.
    canonical_id              TEXT        NOT NULL REFERENCES entities(id) ON DELETE CASCADE,
    deprecated_id             TEXT        NOT NULL,              -- no FK; the row is deleted
    deprecated_canonical_name TEXT        NOT NULL,              -- kept for forensics
    deprecated_aliases        TEXT[]      NOT NULL DEFAULT '{}', -- kept for forensics
    merge_signal              TEXT        NOT NULL CHECK (
        merge_signal IN ('co_occurrence', 'alias_overlap', 'heuristic', 'online_resolver')
    ),
    facts_repointed           INTEGER     NOT NULL DEFAULT 0,
    relationships_repointed   INTEGER     NOT NULL DEFAULT 0,
    merged_at                 TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    merged_by                 TEXT        NOT NULL,              -- 'backfill-YYYY-MM'|'online-resolver'

    -- Rollback snapshot: the state that was specific to the deprecated
    -- row (provenance_event_ids etc.), so the row can be recreated if
    -- the merge is judged wrong.
    rollback_payload          JSONB       NOT NULL DEFAULT '{}'::jsonb
);

-- Canonical-side lookup: "what merged into this entity".
CREATE INDEX IF NOT EXISTS idx_entity_merges_canonical
    ON entity_merges(canonical_id);

-- Deprecated-side lookup: "was this id ever a separate entity that got
-- merged", so callers holding a stale id can resolve it forward.
CREATE INDEX IF NOT EXISTS idx_entity_merges_deprecated
    ON entity_merges(deprecated_id);

-- Per-arena audit listing, newest first (e.g. dry-run reports).
CREATE INDEX IF NOT EXISTS idx_entity_merges_arena_merged_at
    ON entity_merges(arena, merged_at DESC);
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
-- pentatonic-memory-engine v2: distillation trace audit table.
--
-- Append-only log of the raw LLM teacher I/O for each event
-- distillation. The *parsed* output is already stored as
-- facts/entities/relationships, but that normal-form storage is
-- post-merge and post-filter: right for retrieval, the wrong shape for
-- training a student model on the teacher's distribution.
--
-- This table is the teacher-distribution record:
--
--   user_prompt  = the per-event block fed to the LLM, i.e.
--                  build_event_block(i, ev) -- what the model SAW
--   raw_response = the per-event slice of the model's pipe-delimited
--                  output, i.e. everything between `=== event K ===`
--                  and the next header -- what the model PRODUCED
--
-- Together they form an (input, output) pair suitable for fine-tuning a
-- seq2seq student (BART/FLAN-T5) to mimic the teacher's extraction
-- behaviour. `system_prompt_hash` segments training data by teacher
-- version: when BATCH_SYSTEM_PROMPT changes, a student should not be
-- trained on outputs produced under the old prompt.
--
-- Forget semantics: ON DELETE CASCADE from events. A FORGET_MEMORY that
-- deletes the source event also removes its trace, so training data
-- inherits the same right-to-erasure contract as the rest of the
-- org-model.

CREATE TABLE IF NOT EXISTS distillation_traces (
    id                 BIGSERIAL   PRIMARY KEY,
    event_id           TEXT        NOT NULL REFERENCES events(id) ON DELETE CASCADE,

    -- Teacher I/O for THIS event. Both are bounded by MAX_CONTENT_CHARS
    -- + LLM_MAX_TOKENS_PER_EVENT at write time, so storage stays
    -- predictable (~2-3KB/row).
    user_prompt        TEXT        NOT NULL,
    raw_response       TEXT        NOT NULL,

    -- Teacher identity, used to filter when the prompt or model changes
    -- (no training on outputs from a retired prompt). The hash is
    -- sha256(BATCH_SYSTEM_PROMPT) truncated to 16 characters: long
    -- enough to serve as an identifier, short enough to index cheaply.
    llm_model          TEXT        NOT NULL,
    system_prompt_hash TEXT        NOT NULL,

    -- LLM call latency at chunk level (one LLM call distills N events in
    -- a single request). Useful for setting student-latency targets.
    llm_chunk_ms       REAL,

    created_at         TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_distillation_traces_event_id
    ON distillation_traces(event_id);

CREATE INDEX IF NOT EXISTS idx_distillation_traces_created_at
    ON distillation_traces(created_at DESC);

-- Segment by teacher version when exporting training data.
CREATE INDEX IF NOT EXISTS idx_distillation_traces_prompt_hash
    ON distillation_traces(system_prompt_hash);
|
|
@@ -0,0 +1,581 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Backfill: merge fragmented person entities (and optionally other
|
|
3
|
+
types) into a single canonical row per person.
|
|
4
|
+
|
|
5
|
+
Implements RFC §3 (backfill merge). Dry-run by default; --apply
|
|
6
|
+
required to actually write. One arena at a time. Audit records every
|
|
7
|
+
merge to the `entity_merges` table with enough state in
|
|
8
|
+
`rollback_payload` to recreate the deprecated row if a merge proves
|
|
9
|
+
wrong.
|
|
10
|
+
|
|
11
|
+
Grouping signal priority (per RFC):
|
|
12
|
+
|
|
13
|
+
1. CO-OCCURRENCE — events whose `attributes` carry both a name and
|
|
14
|
+
an email for the same person (gmail from_name+from_email,
|
|
15
|
+
calendar attendee displayName+email, slack profile rows). This is
|
|
16
|
+
ground-truth pairing.
|
|
17
|
+
|
|
18
|
+
2. ALIAS_OVERLAP — entity rows where A.canonical_name appears in
|
|
19
|
+
B.aliases (or vice versa). Catches post-fix merges still missing
|
|
20
|
+
from the legacy backfill.
|
|
21
|
+
|
|
22
|
+
3. HEURISTIC — (OFF by default; --heuristic-merge to enable)
|
|
23
|
+
local-part vs name-tokens overlap for cases where no event ever
|
|
24
|
+
paired the surface forms. Risky; only with explicit operator
|
|
25
|
+
flag + manual audit of the dry-run.
|
|
26
|
+
|
|
27
|
+
Usage:
|
|
28
|
+
|
|
29
|
+
python3 backfill_entity_reconciliation.py \\
|
|
30
|
+
--arena <arena-id> \\
|
|
31
|
+
--pg-dsn postgresql://... \\
|
|
32
|
+
[--apply] # write; default is dry-run
|
|
33
|
+
[--entity-type person] # which type to reconcile; default person
|
|
34
|
+
[--heuristic-merge] # enable signal 3 (off by default)
|
|
35
|
+
[--out /tmp/merges.jsonl] # where to write the merge report
|
|
36
|
+
|
|
37
|
+
Exit codes:
|
|
38
|
+
0 success (dry-run report written, or --apply completed)
|
|
39
|
+
1 partial failure (some merges failed; report shows which)
|
|
40
|
+
2 bad arguments
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
from __future__ import annotations
|
|
44
|
+
|
|
45
|
+
import argparse
|
|
46
|
+
import hashlib
|
|
47
|
+
import json
|
|
48
|
+
import os
|
|
49
|
+
import re
|
|
50
|
+
import sys
|
|
51
|
+
import unicodedata
|
|
52
|
+
from collections import defaultdict
|
|
53
|
+
from dataclasses import dataclass, field
|
|
54
|
+
from datetime import datetime, timezone
|
|
55
|
+
|
|
56
|
+
import psycopg
|
|
57
|
+
import psycopg.rows
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# ----------------------------------------------------------------------
|
|
61
|
+
# Constants & helpers
|
|
62
|
+
# ----------------------------------------------------------------------
|
|
63
|
+
|
|
64
|
+
# (email_key, name_key) attribute pairs that describe one person on an
# event. Producer-agnostic: any source following the conventional
# `<role>_email` / `<role>_name` shape contributes co-occurrence pairs.
# `organizer_email` is listed twice because the organizer's name may be
# stored under either `organizer_name` or `organizer_display_name`.
PERSON_ROLE_PAIRS = [
    ("from_email", "from_name"),
    ("to_email", "to_name"),
    ("cc_email", "cc_name"),
    ("reply_to_email", "reply_to_name"),
    ("sender_email", "sender_name"),
    ("organizer_email", "organizer_name"),
    ("organizer_email", "organizer_display_name"),
]

# An email address anywhere inside a larger string; group 1 is the address.
EMAIL_RE = re.compile(r"\b([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})\b")

# Local part of an address for the heuristic signal. It must start with a
# letter and may contain only letters, digits, '.', '_' and '-'.
HEURISTIC_LOCAL_PART_RE = re.compile(r"^([a-zA-Z][a-zA-Z0-9._\-]*)@")
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _normalize_surface(s: str) -> str:
|
|
81
|
+
"""MIRROR of extractor-{sync,async}'s _normalize_surface."""
|
|
82
|
+
return re.sub(r"\s+", " ", unicodedata.normalize("NFKC", s)).strip().lower()
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _looks_like_email(s: str) -> bool:
|
|
86
|
+
return isinstance(s, str) and "@" in s and " " not in s and "." in s.split("@", 1)[-1]
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _local_part(email: str) -> str:
    """Return the part of *email* before '@' as matched by
    HEURISTIC_LOCAL_PART_RE, or "" when the pattern does not match."""
    found = HEURISTIC_LOCAL_PART_RE.match(email)
    if found is None:
        return ""
    return found.group(1)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _name_tokens(name: str) -> set[str]:
|
|
95
|
+
"""Split a display name into lowercased tokens for heuristic match."""
|
|
96
|
+
return {t for t in re.split(r"[\s\.\-_]+", name.lower()) if t}
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _local_part_tokens(local: str) -> set[str]:
|
|
100
|
+
return {t for t in re.split(r"[\.\-_]+", local.lower()) if t}
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# ----------------------------------------------------------------------
|
|
104
|
+
# Data classes
|
|
105
|
+
# ----------------------------------------------------------------------
|
|
106
|
+
|
|
107
|
+
@dataclass
class Entity:
    """An `entities` row together with the usage counts and normalized
    surface forms that the reconciliation pass works with."""

    id: str
    canonical_name: str
    aliases: list[str]
    provenance_event_ids: list[str]
    # Reference counts; they default to 0 until the loader supplies them.
    fact_count: int = 0
    rel_count: int = 0
    # Every surface form (normalized), kept as a set for fast overlap
    # matching.
    norm_forms: set[str] = field(default_factory=set)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
@dataclass
class MergeProposal:
    """One proposed merge: the surviving row, the rows to fold into it,
    and the signal that justified the grouping."""

    # The row that is kept.
    canonical: Entity
    # The rows that are merged into `canonical` and then removed.
    deprecated: list[Entity]
    # One of 'co_occurrence' | 'alias_overlap' | 'heuristic'.
    signal: str
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# ----------------------------------------------------------------------
|
|
127
|
+
# Load + signal extraction
|
|
128
|
+
# ----------------------------------------------------------------------
|
|
129
|
+
|
|
130
|
+
def load_entities(conn: psycopg.Connection, arena: str, entity_type: str) -> list[Entity]:
    """Load every entity of *entity_type* in *arena*.

    Each returned Entity carries `fact_count` (facts that reference it as
    subject or object) and `rel_count` (relationships that reference it
    on either end); these drive canonical selection (richest wins).
    `norm_forms` holds the normalized canonical name plus all normalized
    aliases.
    """
    query = """
        WITH ent AS (
            SELECT id, canonical_name, aliases, provenance_event_ids
            FROM entities
            WHERE arena = %s AND entity_type = %s
        ),
        f AS (
            SELECT subject_entity_id AS eid, COUNT(*) AS n FROM facts
            WHERE arena = %s GROUP BY 1
            UNION ALL
            SELECT object_entity_id AS eid, COUNT(*) AS n FROM facts
            WHERE arena = %s AND object_entity_id IS NOT NULL GROUP BY 1
        ),
        r AS (
            SELECT from_entity_id AS eid, COUNT(*) AS n FROM relationships
            WHERE arena = %s GROUP BY 1
            UNION ALL
            SELECT to_entity_id AS eid, COUNT(*) AS n FROM relationships
            WHERE arena = %s GROUP BY 1
        )
        SELECT
            ent.id, ent.canonical_name, ent.aliases, ent.provenance_event_ids,
            COALESCE((SELECT SUM(n) FROM f WHERE eid = ent.id), 0) AS fact_count,
            COALESCE((SELECT SUM(n) FROM r WHERE eid = ent.id), 0) AS rel_count
        FROM ent
        """
    # One placeholder per `%s`, in order: ent (arena, type), then the two
    # halves of `f` and the two halves of `r` (arena each).
    params = (arena, entity_type, arena, arena, arena, arena)

    with conn.cursor(row_factory=psycopg.rows.dict_row) as cur:
        cur.execute(query, params)
        rows = cur.fetchall()

    loaded: list[Entity] = []
    for row in rows:
        alias_list = list(row["aliases"] or [])
        surface_forms = {
            _normalize_surface(form)
            for form in (row["canonical_name"], *alias_list)
        }
        loaded.append(
            Entity(
                id=row["id"],
                canonical_name=row["canonical_name"],
                aliases=alias_list,
                provenance_event_ids=list(row["provenance_event_ids"] or []),
                fact_count=int(row["fact_count"]),
                rel_count=int(row["rel_count"]),
                norm_forms=surface_forms,
            )
        )
    return loaded
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def collect_cooccurrence_pairs(
    conn: psycopg.Connection, arena: str
) -> set[tuple[str, str]]:
    """Collect (normalized_name, normalized_email) pairs from event attributes.

    A pair is recorded whenever one event's `attributes` carries both a
    name and an email for the same person, either through one of the
    PERSON_ROLE_PAIRS key pairs or through an entry of the structured
    `attendees` / `attendee_objects` list (calendar producers).
    """
    found: set[tuple[str, str]] = set()

    def _record(name, email) -> None:
        # Both values must be strings, the email must look like one, and
        # the name must not be blank.
        if not (isinstance(email, str) and isinstance(name, str)):
            return
        if _looks_like_email(email) and name.strip():
            found.add((_normalize_surface(name), _normalize_surface(email)))

    with conn.cursor(row_factory=psycopg.rows.dict_row) as cur:
        cur.execute(
            "SELECT attributes FROM events WHERE arena = %s",
            (arena,),
        )
        for row in cur:
            attrs = row["attributes"] or {}
            if not isinstance(attrs, dict):
                continue

            # Flat `<role>_email` / `<role>_name` attributes.
            for email_key, name_key in PERSON_ROLE_PAIRS:
                _record(attrs.get(name_key), attrs.get(email_key))

            # Structured attendee objects (calendar producers).
            attendees = attrs.get("attendees") or attrs.get("attendee_objects")
            if not isinstance(attendees, list):
                continue
            for attendee in attendees:
                if not isinstance(attendee, dict):
                    continue
                _record(
                    attendee.get("displayName") or attendee.get("name"),
                    attendee.get("email"),
                )
    return found
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
# ----------------------------------------------------------------------
|
|
217
|
+
# Merge proposal building
|
|
218
|
+
# ----------------------------------------------------------------------
|
|
219
|
+
|
|
220
|
+
def build_proposals(
    entities: list[Entity],
    cooccurrence_pairs: set[tuple[str, str]],
    use_heuristic: bool,
) -> list[MergeProposal]:
    """Group entities with union-find and emit one proposal per group.

    Entities are joined by, in priority order:
      1. co-occurrence pairs (a name form and an email form seen together)
      2. alias overlap (two entities sharing any normalized form)
      3. heuristic local-part vs name-token overlap (only if use_heuristic)

    Args:
        entities: candidate entities; `norm_forms` must be populated.
        cooccurrence_pairs: (normalized_name, normalized_email) pairs.
        use_heuristic: enable signal 3.

    Returns:
        One MergeProposal per group with >= 2 entities. `canonical` is the
        richest member (most facts, then most relationships, then most
        provenance events, then smallest id). `signal` is the
        highest-priority signal that contributed a union to the group.

    The result is deterministic for a given input: heuristic ties are
    broken by entity id rather than by set iteration order, so a dry-run
    and a later apply run propose the same merges.
    """
    # parent: entity-id -> root-id (union-find)
    parent: dict[str, str] = {e.id: e.id for e in entities}
    # One entry per successful union, keyed by the two roots it joined.
    edge_signal: dict[frozenset, str] = {}

    def find(x: str) -> str:
        while parent[x] != x:
            parent[x] = parent[parent[x]]  # path halving
            x = parent[x]
        return x

    def union(a: str, b: str, signal: str) -> None:
        ra, rb = find(a), find(b)
        if ra == rb:
            return
        parent[ra] = rb
        edge_signal[frozenset({ra, rb})] = signal

    # ---- Signal 1: co-occurrence pairs ---------------------------------
    # Map each normalized form to one entity (first write wins on a
    # collision; entities sharing a form are unioned by signal 2 anyway),
    # then union the entities behind each (name, email) pair.
    form_to_entity: dict[str, str] = {}
    for e in entities:
        for f in e.norm_forms:
            form_to_entity.setdefault(f, e.id)
    for n_name, n_email in cooccurrence_pairs:
        eid_name = form_to_entity.get(n_name)
        eid_email = form_to_entity.get(n_email)
        if eid_name and eid_email and eid_name != eid_email:
            union(eid_name, eid_email, "co_occurrence")

    # ---- Signal 2: alias overlap ---------------------------------------
    # Entities sharing any normalized form should already be one row.
    form_to_entities: dict[str, set[str]] = defaultdict(set)
    for e in entities:
        for f in e.norm_forms:
            form_to_entities[f].add(e.id)
    for ents in form_to_entities.values():
        if len(ents) <= 1:
            continue
        first, *rest = sorted(ents)
        for other in rest:
            union(first, other, "alias_overlap")

    # ---- Signal 3 (optional): heuristic local-part vs name tokens ------
    if use_heuristic:
        # Last resort; can produce false positives (e.g. 'sam' matches
        # 'Sam Patel' AND 'Sam Jones'). Operator MUST eyeball the dry-run
        # report.
        email_entities: list[Entity] = []
        name_entities_by_token: dict[str, list[Entity]] = defaultdict(list)
        for e in entities:
            if _looks_like_email(e.canonical_name):
                email_entities.append(e)
            else:
                for t in _name_tokens(e.canonical_name):
                    name_entities_by_token[t].append(e)

        # Process in id order: earlier unions influence later candidate
        # sets, so a stable order keeps the outcome reproducible.
        for ee in sorted(email_entities, key=lambda ent: ent.id):
            tokens = _local_part_tokens(_local_part(ee.canonical_name))
            candidates: dict[str, int] = defaultdict(int)
            for t in tokens:
                for ne in name_entities_by_token.get(t, ()):
                    if find(ee.id) != find(ne.id):
                        candidates[ne.id] += 1

            # Accept a candidate with >= 2 overlapping tokens, OR the
            # sole candidate of a single-token local part whose token is
            # >= 3 chars (unambiguous nickname-style match).
            lone_long_token = (
                len(tokens) == 1 and len(next(iter(tokens))) >= 3
            )
            qualified = [
                (-hits, cand_id)
                for cand_id, hits in candidates.items()
                if hits >= 2
                or (hits == 1 and lone_long_token and len(candidates) == 1)
            ]
            if qualified:
                # Most hits first; equal hits resolve to the smallest id.
                _, best_id = min(qualified)
                union(ee.id, best_id, "heuristic")

    # ---- Materialise groups -> proposals -------------------------------
    # Both ends of an edge are in the same final group, so one pass over
    # the edges yields the set of signals per group root.
    signals_by_root: dict[str, set[str]] = defaultdict(set)
    for edge, sig in edge_signal.items():
        signals_by_root[find(next(iter(edge)))].add(sig)

    groups: dict[str, list[Entity]] = defaultdict(list)
    for e in entities:
        groups[find(e.id)].append(e)

    signal_priority = ("co_occurrence", "alias_overlap", "heuristic")
    proposals: list[MergeProposal] = []
    for root, group in groups.items():
        if len(group) < 2:
            continue
        # Canonical = richest (most facts, then most rels, then most
        # provenance events, then lex-smallest id for determinism).
        canonical, *deprecated = sorted(
            group,
            key=lambda e: (-e.fact_count, -e.rel_count,
                           -len(e.provenance_event_ids), e.id),
        )
        signals_in_group = signals_by_root.get(root, set())
        chosen = next((s for s in signal_priority if s in signals_in_group),
                      "alias_overlap")
        proposals.append(MergeProposal(canonical=canonical,
                                       deprecated=deprecated,
                                       signal=chosen))
    return proposals
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
# ----------------------------------------------------------------------
|
|
354
|
+
# Apply (--apply)
|
|
355
|
+
# ----------------------------------------------------------------------
|
|
356
|
+
|
|
357
|
+
def _merge_one_proposal(cur, arena: str, p: MergeProposal, merged_by: str) -> None:
    """Fold every deprecated entity of *p* into its canonical row via *cur*.

    Raises RuntimeError when any entity named by the proposal is no
    longer present, so the caller can abort this proposal's transaction.
    """
    # Lock the canonical row + every deprecated row to avoid concurrent
    # online-resolver writes during the merge. ORDER BY gives a stable
    # lock order.
    ids = [p.canonical.id, *(d.id for d in p.deprecated)]
    cur.execute(
        "SELECT id FROM entities WHERE id = ANY(%s) ORDER BY id FOR UPDATE",
        (ids,),
    )
    if len(cur.fetchall()) != len(set(ids)):
        # A row vanished after the proposals were built. Repointing onto
        # (or away from) a missing row would corrupt references.
        raise RuntimeError(
            "one or more entities in this proposal no longer exist; "
            "rebuild the proposals and retry"
        )

    for dep in p.deprecated:
        repoint_args = (p.canonical.id, arena, dep.id)

        # Repoint facts (subject side, then object side).
        cur.execute(
            """
            UPDATE facts SET subject_entity_id = %s
            WHERE arena = %s AND subject_entity_id = %s
            """,
            repoint_args,
        )
        facts_repointed = cur.rowcount
        cur.execute(
            """
            UPDATE facts SET object_entity_id = %s
            WHERE arena = %s AND object_entity_id = %s
            """,
            repoint_args,
        )
        facts_repointed += cur.rowcount

        # Repoint relationships (from side, then to side).
        cur.execute(
            """
            UPDATE relationships SET from_entity_id = %s
            WHERE arena = %s AND from_entity_id = %s
            """,
            repoint_args,
        )
        rels_repointed = cur.rowcount
        cur.execute(
            """
            UPDATE relationships SET to_entity_id = %s
            WHERE arena = %s AND to_entity_id = %s
            """,
            repoint_args,
        )
        rels_repointed += cur.rowcount

        # Merge aliases + provenance into canonical. The deprecated
        # row's canonical name becomes an alias of the survivor.
        cur.execute(
            """
            UPDATE entities SET
                aliases = ARRAY(SELECT DISTINCT UNNEST(
                    aliases || %s::text[] || ARRAY[%s]
                )),
                provenance_event_ids = ARRAY(SELECT DISTINCT UNNEST(
                    provenance_event_ids || %s::text[]
                )),
                last_seen = NOW()
            WHERE id = %s
            """,
            (dep.aliases, dep.canonical_name,
             dep.provenance_event_ids, p.canonical.id),
        )

        # Audit + rollback payload.
        rollback_payload = {
            "id": dep.id,
            "canonical_name": dep.canonical_name,
            "aliases": dep.aliases,
            "provenance_event_ids": dep.provenance_event_ids,
        }
        # Deterministic id: a retried merge of the same pair hits
        # ON CONFLICT instead of creating a second audit row.
        merge_id = "m_" + hashlib.sha256(
            f"{arena}|{dep.id}|{p.canonical.id}".encode()
        ).hexdigest()[:24]
        cur.execute(
            """
            INSERT INTO entity_merges (
                id, arena, canonical_id, deprecated_id,
                deprecated_canonical_name, deprecated_aliases,
                merge_signal, facts_repointed,
                relationships_repointed, merged_by, rollback_payload
            ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s::jsonb)
            ON CONFLICT (id) DO NOTHING
            """,
            (
                merge_id, arena, p.canonical.id, dep.id,
                dep.canonical_name, dep.aliases,
                p.signal, facts_repointed, rels_repointed,
                merged_by, json.dumps(rollback_payload),
            ),
        )

        # Audit rows of EARLIER merges into `dep` reference it through
        # entity_merges.canonical_id (ON DELETE CASCADE). Re-parent them
        # to the new canonical so deleting `dep` does not erase that
        # history and stale ids still resolve forward.
        cur.execute(
            """
            UPDATE entity_merges SET canonical_id = %s
            WHERE arena = %s AND canonical_id = %s
            """,
            repoint_args,
        )

        # Delete the deprecated row.
        cur.execute(
            "DELETE FROM entities WHERE id = %s",
            (dep.id,),
        )


def apply_proposals(
    conn: psycopg.Connection,
    arena: str,
    proposals: list[MergeProposal],
    merged_by: str,
) -> tuple[int, int, list[str]]:
    """Apply each proposal inside its own `conn.transaction()` block, so a
    failing proposal is rolled back without undoing the others.

    NOTE: psycopg's `conn.transaction()` opens a savepoint rather than a
    top-level transaction when the connection already has a transaction
    in progress; in that case the merges only become durable once the
    caller commits.

    Args:
        conn: open psycopg connection.
        arena: arena whose facts/relationships are repointed.
        proposals: merge proposals to execute.
        merged_by: audit tag stored on every `entity_merges` row.

    Returns (succeeded_count, failed_count, errors); counts are per
    proposal and `errors` holds one message per failed proposal.
    """
    succeeded = 0
    failed = 0
    errors: list[str] = []
    for p in proposals:
        try:
            with conn.transaction():
                with conn.cursor() as cur:
                    _merge_one_proposal(cur, arena, p, merged_by)
            succeeded += 1
        except Exception as e:
            # Per-proposal boundary: record the failure and carry on with
            # the remaining proposals.
            failed += 1
            errors.append(f"{p.canonical.id} <- {[d.id for d in p.deprecated]}: {e}")
    return succeeded, failed, errors
|
|
473
|
+
|
|
474
|
+
|
|
475
|
+
# ----------------------------------------------------------------------
|
|
476
|
+
# Report writer
|
|
477
|
+
# ----------------------------------------------------------------------
|
|
478
|
+
|
|
479
|
+
def write_report(proposals: list[MergeProposal], path: str) -> None:
    """Write the merge report to *path* as JSONL, one object per proposal.

    Each object has `canonical` (id, name, counts), `deprecated` (a list
    with id, name, aliases and counts per row) and `signal`. The operator
    inspects this file before running with --apply. An existing file at
    *path* is overwritten.
    """

    def _counts(ent) -> dict:
        return {"fact_count": ent.fact_count, "rel_count": ent.rel_count}

    with open(path, "w") as sink:
        for proposal in proposals:
            winner = proposal.canonical
            losers = []
            for loser in proposal.deprecated:
                losers.append({
                    "id": loser.id,
                    "canonical_name": loser.canonical_name,
                    "aliases": loser.aliases,
                    **_counts(loser),
                })
            record = {
                "canonical": {
                    "id": winner.id,
                    "canonical_name": winner.canonical_name,
                    **_counts(winner),
                },
                "deprecated": losers,
                "signal": proposal.signal,
            }
            sink.write(json.dumps(record) + "\n")
|
|
502
|
+
|
|
503
|
+
|
|
504
|
+
# ----------------------------------------------------------------------
|
|
505
|
+
# CLI
|
|
506
|
+
# ----------------------------------------------------------------------
|
|
507
|
+
|
|
508
|
+
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the backfill CLI options.

    Args:
        argv: argument list to parse. When None (the default) argparse
            reads `sys.argv[1:]`, which is the original behaviour; passing
            a list lets callers drive the parser programmatically.

    Returns:
        Namespace with `arena`, `pg_dsn`, `entity_type`, `apply`,
        `heuristic_merge`, `out` and `merged_by`.

    On invalid arguments argparse prints usage and exits the process.
    """
    p = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    p.add_argument("--arena", required=True,
                   help="arena id to reconcile (one at a time)")
    # The environment is read at call time, so $PG_DSN set before the
    # call is honoured.
    p.add_argument("--pg-dsn", default=os.environ.get("PG_DSN"),
                   help="postgres DSN; defaults to $PG_DSN")
    p.add_argument("--entity-type", default="person",
                   help="entity type to reconcile (default: person)")
    p.add_argument("--apply", action="store_true",
                   help="actually run the merges; default is dry-run")
    p.add_argument("--heuristic-merge", action="store_true",
                   help="enable signal 3 (off by default; risky)")
    p.add_argument("--out", default=None,
                   help="merge-report JSONL path (default: stdout-only "
                        "summary, no jsonl)")
    p.add_argument("--merged-by", default=None,
                   help="audit tag (default: backfill-YYYY-MM)")
    return p.parse_args(argv)
|
|
529
|
+
|
|
530
|
+
|
|
531
|
+
def main() -> int:
    """Run the backfill: load, propose, report, and optionally apply.

    Returns the process exit code: 0 on success (dry-run finished or all
    merges applied), 1 when at least one merge failed, 2 when no DSN was
    supplied.
    """
    args = parse_args()
    if not args.pg_dsn:
        print("error: --pg-dsn (or $PG_DSN) required", file=sys.stderr)
        return 2

    merged_by = args.merged_by or f"backfill-{datetime.now(timezone.utc):%Y-%m}"

    with psycopg.connect(args.pg_dsn, autocommit=False) as conn:
        print(f"[backfill] arena={args.arena} type={args.entity_type} "
              f"apply={args.apply} heuristic={args.heuristic_merge}")
        entities = load_entities(conn, args.arena, args.entity_type)
        print(f"[backfill] loaded {len(entities)} {args.entity_type} entities")

        # Co-occurrence pairs are only collected for person entities.
        cooc = collect_cooccurrence_pairs(conn, args.arena) \
            if args.entity_type == "person" else set()
        print(f"[backfill] collected {len(cooc)} co-occurrence pairs from events")

        proposals = build_proposals(entities, cooc, args.heuristic_merge)
        print(f"[backfill] built {len(proposals)} merge proposals "
              f"({sum(len(p.deprecated) for p in proposals)} rows would deprecate)")

        # Summarise by signal.
        by_signal: dict[str, int] = defaultdict(int)
        for p in proposals:
            by_signal[p.signal] += 1
        for sig, n in sorted(by_signal.items()):
            print(f" - {sig}: {n} groups")

        if args.out:
            write_report(proposals, args.out)
            print(f"[backfill] wrote merge report → {args.out}")

        if not args.apply:
            print("[backfill] dry-run only; pass --apply to execute")
            return 0

        # The SELECTs above opened an implicit transaction (autocommit is
        # off). End it before applying: otherwise every
        # `conn.transaction()` in apply_proposals is only a savepoint,
        # all row locks are held until the final commit, and a crash
        # part-way through discards every merge already done.
        conn.rollback()

        succeeded, failed, errors = apply_proposals(
            conn, args.arena, proposals, merged_by
        )
        conn.commit()
        print(f"[backfill] applied: {succeeded} succeeded, {failed} failed")
        for err in errors[:20]:
            print(f" ERR: {err}")
        if len(errors) > 20:
            # The --out report lists proposals only, never errors, so do
            # not point the operator there.
            print(f" ... and {len(errors) - 20} more errors not shown")
        return 1 if failed else 0
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
# Script entry point: run the backfill and exit with main()'s return code.
if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)
|