@pentatonic-ai/ai-agent-sdk 0.10.19 → 0.10.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
- package/packages/memory-engine-v2/RFC-decay-and-fusion.md +122 -8
- package/packages/memory-engine-v2/compat/server.py +55 -10
- package/packages/memory-engine-v2/extractor-async/test_email_alias_guard.py +78 -0
- package/packages/memory-engine-v2/extractor-async/worker.py +52 -0
- package/packages/memory-engine-v2/scripts/build_retrain_corpus.py +240 -0
- package/packages/memory-engine-v2/scripts/fusion_defrag.py +440 -0
- package/packages/memory-engine-v2/scripts/redistill.py +236 -0
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Build a student retrain corpus from CLEAN teacher gold.
|
|
3
|
+
|
|
4
|
+
The student (NuExtract-2.0-4B FT) was originally trained on teacher traces
|
|
5
|
+
produced under the *old* distiller prompts (bbdaba / f1e0ff), which had no
|
|
6
|
+
email-discipline or modality rules — so the student learned to (a) promote
|
|
7
|
+
bystander emails into a person's aliases (the Johann/hotel over-merge), (b)
|
|
8
|
+
collapse future/invited roles to established `state` facts, (c) conflate
|
|
9
|
+
"X & Y" into one entity, and (d) mint email-named / generic-infra entities.
|
|
10
|
+
|
|
11
|
+
This builder draws ONLY from traces produced under the clean prompt (#126
|
|
12
|
+
modality/attribution + #129 email-discipline & entity-separation), whose
|
|
13
|
+
`system_prompt_hash` is the deployed clean hash. Building from old-prompt
|
|
14
|
+
traces would just re-teach the defects, so that is explicitly NOT the default
|
|
15
|
+
(you must pass --allow-dirty-hash to override, and you shouldn't).
|
|
16
|
+
|
|
17
|
+
A defect filter runs as a SECOND line of defence: even clean-prompt output is
|
|
18
|
+
screened for the known defect signatures and dropped if any survive. Every
|
|
19
|
+
drop is counted by reason so the corpus's cleanliness is auditable (not a
|
|
20
|
+
black box — see the printed report).
|
|
21
|
+
|
|
22
|
+
INPUT — an NDJSON stream of trace rows, one object per line:
|
|
23
|
+
{"event_id": "...", "user_prompt": "...", "raw_response": "...",
|
|
24
|
+
"system_prompt_hash": "..."}
|
|
25
|
+
Produce it from the engine box's org_model DB with row_to_json (the escaping
|
|
26
|
+
that bit us before — \\copy double-escapes, $-quoting gets eaten by the shell —
|
|
27
|
+
is avoided by -At + row_to_json):
|
|
28
|
+
|
|
29
|
+
sudo docker exec -i pme2-org-model psql -U pme -d org_model -At -c \\
|
|
30
|
+
"SELECT row_to_json(t) FROM (
|
|
31
|
+
SELECT event_id, user_prompt, raw_response, system_prompt_hash
|
|
32
|
+
FROM distillation_traces
|
|
33
|
+
WHERE system_prompt_hash = '6ccfe70f1286a131'
|
|
34
|
+
) t" > traces.ndjson
|
|
35
|
+
|
|
36
|
+
OUTPUT — {"input": <per-event block>, "output": <extraction JSON string>}
|
|
37
|
+
JSONL(.gz), the exact shape train_lora.py's load() consumes (it keeps rows
|
|
38
|
+
where both `input` and `output` are truthy, then trains user=input ->
|
|
39
|
+
assistant=output via the NuExtract chat template; no system prompt in the
|
|
40
|
+
pair). The corpus is PER-EVENT while a trace is a 3-event chunk, so each
|
|
41
|
+
trace's user_prompt is split on the `[event K]` markers and matched to
|
|
42
|
+
raw_response[index == K].
|
|
43
|
+
|
|
44
|
+
Usage:
|
|
45
|
+
python build_retrain_corpus.py --traces traces.ndjson --out retrain_clean.jsonl.gz
|
|
46
|
+
zcat traces.ndjson.gz | python build_retrain_corpus.py --traces - --out c.jsonl.gz
|
|
47
|
+
"""
|
|
48
|
+
from __future__ import annotations
|
|
49
|
+
|
|
50
|
+
import argparse
|
|
51
|
+
import gzip
|
|
52
|
+
import hashlib
|
|
53
|
+
import json
|
|
54
|
+
import re
|
|
55
|
+
import sys
|
|
56
|
+
from collections import Counter
|
|
57
|
+
|
|
58
|
+
# Hash of the clean distiller prompt that shipped as SDK 0.10.19 (#126 + #129).
# Before cutting a real corpus, compare it with the running extractor-async
# (worker.SYSTEM_PROMPT_HASH): any prompt edit advances the hash, and traces
# produced under an older hash must not silently leak in.
CLEAN_PROMPT_HASH = "6ccfe70f1286a131"

# Generic infra / environment words that must never be standalone entities.
# Mirrors the #129 DISTINCT ENTITIES rule and is kept in sync by hand.
INFRA_TOKENS = {
    # deployment environments
    "prod",
    "production",
    "staging",
    "stage",
    "uat",
    "qa",
    "dev",
    "test",
    # generic components
    "warehouse",
    "datalake",
    "data lake",
    "cluster",
    "backend",
    "frontend",
    # "the <thing>" placeholders
    "the system",
    "the platform",
    "the api",
    "the database",
    "the server",
}

# Zero-width split point in front of every line-leading "[event K]" marker.
EVENT_BLOCK_RE = re.compile(r"(?=^\[event \d+\])", re.MULTILINE)
# Captures K from a block that starts with "[event K]".
EVENT_IDX_RE = re.compile(r"^\[event (\d+)\]")
# Whole-string check for something shaped like local@domain.tld.
EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _email_plausibly_belongs(person_name: str, email: str) -> bool:
|
|
77
|
+
"""Keep an email on a person only if it plausibly is theirs: a name token
|
|
78
|
+
appears in the local-part, or the initials match. Same heuristic as the
|
|
79
|
+
write-side guard (#128) so corpus filtering and runtime agree."""
|
|
80
|
+
local = email.split("@", 1)[0].lower()
|
|
81
|
+
local_alnum = re.sub(r"[^a-z0-9]", "", local)
|
|
82
|
+
tokens = [t for t in re.split(r"\s+", person_name.lower()) if t]
|
|
83
|
+
if not tokens:
|
|
84
|
+
return False
|
|
85
|
+
for t in tokens:
|
|
86
|
+
t_alnum = re.sub(r"[^a-z0-9]", "", t)
|
|
87
|
+
if len(t_alnum) >= 3 and t_alnum in local_alnum:
|
|
88
|
+
return True
|
|
89
|
+
initials = "".join(t[0] for t in tokens if t)
|
|
90
|
+
if len(initials) >= 2 and initials in local_alnum:
|
|
91
|
+
return True
|
|
92
|
+
return False
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _entity_defect(ent: dict) -> str | None:
|
|
96
|
+
"""Return a drop-reason if this entity carries a known defect, else None."""
|
|
97
|
+
name = (ent.get("name") or "").strip()
|
|
98
|
+
etype = (ent.get("type") or "").lower()
|
|
99
|
+
if not name:
|
|
100
|
+
return "empty_entity_name"
|
|
101
|
+
if EMAIL_RE.match(name):
|
|
102
|
+
return "email_as_entity"
|
|
103
|
+
if name.lower() in INFRA_TOKENS:
|
|
104
|
+
return "generic_infra_entity"
|
|
105
|
+
# Conflation: "Acme & Globex" / "Alice and Bob" smuggled into one node.
|
|
106
|
+
if re.search(r"\s&\s", name) or re.search(r"\b and \b", name.lower()):
|
|
107
|
+
return "conflated_entity"
|
|
108
|
+
if etype == "person":
|
|
109
|
+
for a in ent.get("aliases") or []:
|
|
110
|
+
if isinstance(a, str) and "@" in a and " " not in a \
|
|
111
|
+
and not _email_plausibly_belongs(name, a):
|
|
112
|
+
return "bystander_email_alias"
|
|
113
|
+
return None
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _output_is_clean(obj: dict) -> str | None:
|
|
117
|
+
"""Screen one per-event extraction object; return a drop-reason or None."""
|
|
118
|
+
if not isinstance(obj, dict):
|
|
119
|
+
return "output_not_object"
|
|
120
|
+
for ent in obj.get("entities") or []:
|
|
121
|
+
r = _entity_defect(ent)
|
|
122
|
+
if r:
|
|
123
|
+
return r
|
|
124
|
+
return None
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def split_events(user_prompt: str) -> dict[int, str]:
    """Map each event index K to its "[event K] ..." block of `user_prompt`.

    The prompt is cut in front of every line-leading "[event K]" marker.
    Trailing whitespace is stripped from each piece, and pieces that do not
    start with a marker (such as any preamble) are ignored. If an index
    repeats, the later block replaces the earlier one.
    """
    indexed: dict[int, str] = {}
    for piece in EVENT_BLOCK_RE.split(user_prompt):
        text = piece.rstrip()
        marker = EVENT_IDX_RE.match(text)
        if marker is None:
            continue
        indexed[int(marker.group(1))] = text
    return indexed
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def main() -> int:
    """Build the per-event retrain corpus from an NDJSON trace stream.

    Reads trace rows from ``--traces`` (a file path, or ``-`` for stdin), keeps
    rows whose ``system_prompt_hash`` equals ``--hash`` (unless
    ``--allow-dirty-hash`` is given), pairs every per-event extraction object
    with its ``[event K]`` block from the row's ``user_prompt``, drops
    duplicate blocks and defective outputs, and writes the surviving
    ``{"input", "output"}`` pairs to ``--out`` as gzip-compressed JSONL.

    Every dropped row or object is counted by reason. The counts are printed as
    a JSON report on stdout and, when ``--report`` is given, also written there.
    Malformed rows (non-object JSON, non-string fields, unhashable indexes) are
    counted as drops rather than raised, so one bad row cannot abort the build.

    Returns:
        0 always; an empty corpus only produces a warning on stderr.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--traces", required=True,
                    help="NDJSON trace rows, or '-' for stdin")
    ap.add_argument("--out", required=True, help="output .jsonl.gz")
    ap.add_argument("--hash", default=CLEAN_PROMPT_HASH,
                    help=f"keep only this system_prompt_hash (default {CLEAN_PROMPT_HASH})")
    ap.add_argument("--allow-dirty-hash", action="store_true",
                    help="do NOT filter by hash — DANGER: re-teaches old-prompt defects")
    ap.add_argument("--report", help="optional path for a JSON stats report")
    args = ap.parse_args()

    stats = Counter()
    seen: set[str] = set()
    examples: list[dict] = []

    from_stdin = args.traces == "-"
    fh = sys.stdin if from_stdin else open(args.traces, encoding="utf-8")
    try:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            stats["trace_rows"] += 1
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                stats["drop_trace_unparseable"] += 1
                continue
            # Valid JSON that is not an object (null, a list, a bare string…)
            # has no fields to read; count it instead of crashing on .get().
            if not isinstance(row, dict):
                stats["drop_trace_not_object"] += 1
                continue

            if not args.allow_dirty_hash and row.get("system_prompt_hash") != args.hash:
                stats["drop_wrong_hash"] += 1
                continue

            raw = row.get("raw_response") or ""
            try:
                parsed = json.loads(raw)
            except (json.JSONDecodeError, TypeError):
                # TypeError: raw_response was not a string (e.g. already an
                # embedded JSON value) — not the documented input shape.
                stats["drop_response_unparseable"] += 1
                continue
            # raw_response is either a single per-event object (current trace
            # format — one row per event) or, for legacy chunked traces, a JSON
            # array / {"events": [...]} of per-event objects.
            if isinstance(parsed, dict):
                objs = parsed.get("events") if isinstance(parsed.get("events"), list) else [parsed]
            elif isinstance(parsed, list):
                objs = parsed
            else:
                stats["drop_response_shape"] += 1
                continue

            prompt = row.get("user_prompt") or ""
            # A non-string prompt has no event blocks; its objects then fall
            # through to drop_no_matching_block below.
            blocks = split_events(prompt) if isinstance(prompt, str) else {}
            for obj in objs:
                if not isinstance(obj, dict):
                    stats["drop_obj_not_object"] += 1
                    continue
                idx = obj.get("index")
                try:
                    block = blocks.get(idx) if idx is not None else None
                except TypeError:
                    # Unhashable index (list/dict) can never name a block.
                    block = None
                # Single-event trace: one block, one object — match by position
                # even if the stored index doesn't line up with the marker.
                if block is None and len(objs) == 1 and len(blocks) == 1:
                    block = next(iter(blocks.values()))
                if not block:
                    stats["drop_no_matching_block"] += 1
                    continue

                # De-duplicate on the input block text.
                key = hashlib.sha1(block.encode("utf-8")).hexdigest()
                if key in seen:
                    stats["drop_dup"] += 1
                    continue

                reason = _output_is_clean(obj)
                if reason:
                    stats[f"drop_{reason}"] += 1
                    continue

                # Only kept examples claim their block, so a later clean output
                # for a block whose first output was defective can still be kept.
                seen.add(key)
                examples.append({"input": block, "output": json.dumps(obj, ensure_ascii=False)})
                stats["kept"] += 1
    finally:
        # Close the trace file even if the loop raised; never close stdin.
        if not from_stdin:
            fh.close()

    with gzip.open(args.out, "wt", encoding="utf-8") as out:
        for ex in examples:
            out.write(json.dumps(ex, ensure_ascii=False) + "\n")

    report = {"out": args.out, "hash": (None if args.allow_dirty_hash else args.hash),
              "stats": dict(sorted(stats.items()))}
    print(json.dumps(report, indent=2))
    if args.report:
        with open(args.report, "w", encoding="utf-8") as rf:
            json.dump(report, rf, indent=2)

    if stats["kept"] == 0:
        print("\nWARNING: 0 examples kept. If you targeted the clean hash, the "
              "clean-prompt teacher has not produced enough gold yet — let it "
              "accumulate (or run a teacher-only re-distill of a curated event "
              "slice through the clean prompt), then re-run.", file=sys.stderr)
    return 0
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
if __name__ == "__main__":
    # Run as a script: main()'s return value becomes the process exit status.
    sys.exit(main())
|
|
@@ -0,0 +1,440 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Fusion de-fragmentation — cluster same-surname PERSON fragments and propose
|
|
3
|
+
merges (RFC-decay-and-fusion A2/A3). DRY-RUN by default.
|
|
4
|
+
|
|
5
|
+
The deterministic upsert resolver (worker.py) converges same-form / shared-alias
|
|
6
|
+
entities, but it CANNOT safely merge surface-form/nickname variants of one real
|
|
7
|
+
person (e.g. "Will Vickers" / "William Vickers" / "William F. Vickers" / bare
|
|
8
|
+
"Vickers") — they have different normalized names and no shared alias, so they
|
|
9
|
+
fragment (209 "Vickers" nodes observed). Merging them is Fusion's job, and it is
|
|
10
|
+
DESTRUCTIVE (repoints facts/relationships, tombstones losers), so the over-merge
|
|
11
|
+
failure mode (folding two DIFFERENT people, or a person into an org) must be
|
|
12
|
+
designed out. This tool is conservative + dry-run-first; --apply is double-gated.
|
|
13
|
+
|
|
14
|
+
CLUSTERING (anti-over-merge by construction):
|
|
15
|
+
- PERSON entities only; never crosses entity_type (so "Vickers Oils" the org is
|
|
16
|
+
never pulled in).
|
|
17
|
+
- Same surname token (the --surname scope).
|
|
18
|
+
- First-name compatibility for the NON-surname tokens: equal, or one an initial
|
|
19
|
+
of the other (W ↔ William), or one a prefix of the other (Will ⊂ William).
|
|
20
|
+
Two DISTINCT full first names (Will vs Jane) are INCOMPATIBLE → never merged.
|
|
21
|
+
- Union-find over compatible NON-bare names → each cluster = one real person.
|
|
22
|
+
- Bare "<surname>" nodes (no first name) are folded in ONLY when there is
|
|
23
|
+
exactly ONE non-bare cluster for the surname (unambiguous); otherwise they
|
|
24
|
+
are left for human review (never used to bridge two distinct people).
|
|
25
|
+
|
|
26
|
+
CANONICAL (A3 scored master, replaces richest-row-wins):
|
|
27
|
+
+ has email (attributes.email or an email alias) strongest identity signal
|
|
28
|
+
+ full name (>=2 name tokens) a real rendering, not a stub
|
|
29
|
+
+ corroboration (provenance event count) grounded in more events
|
|
30
|
+
+ fact count the node that holds the picture
|
|
31
|
+
- bare single-token name penalize stub
|
|
32
|
+
- ID-like (digit ratio > 0.5) penalize 7B numeric-id junk
|
|
33
|
+
|
|
34
|
+
OUTPUT: per cluster — master, losers, why, and the repoint impact (facts +
|
|
35
|
+
relationships that would move onto the master). No DB writes in dry-run
|
|
36
|
+
(the session is forced read-only). --apply would execute via the reviewed
|
|
37
|
+
fusion_drive merge executor + entity_merges audit (NOT enabled here).
|
|
38
|
+
|
|
39
|
+
Usage:
|
|
40
|
+
python fusion_defrag.py --arena 'pentatonic-team%' --surname vickers
|
|
41
|
+
python fusion_defrag.py --arena 'pentatonic-team%' --surname vickers --json out.json
|
|
42
|
+
"""
|
|
43
|
+
from __future__ import annotations
|
|
44
|
+
|
|
45
|
+
import argparse
|
|
46
|
+
import json
|
|
47
|
+
import re
|
|
48
|
+
import sys
|
|
49
|
+
import uuid
|
|
50
|
+
from collections import defaultdict
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _connect(dsn: str):
    """Open a psycopg connection to `dsn` with dict-shaped result rows.

    psycopg is imported inside the function rather than at module level, so
    importing this module does not itself require psycopg.
    """
    import psycopg
    from psycopg.rows import dict_row

    return psycopg.connect(dsn, row_factory=dict_row)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# Separator between name tokens: any run of characters outside [a-z0-9]
# (applied to already-lowercased text by name_tokens).
_TOKEN_RE = re.compile(r"[^a-z0-9]+")
# Whole-string check for something shaped like local@domain.tld.
_EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def name_tokens(name: str) -> list[str]:
    """Lowercase `name` and return its non-empty [a-z0-9] tokens, in order.

    A falsy `name` (None or "") yields an empty list.
    """
    lowered = (name or "").lower()
    # filter(None, ...) discards the empty strings that split() leaves at the
    # edges when the text starts or ends with a separator.
    return list(filter(None, _TOKEN_RE.split(lowered)))
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def digit_ratio(name: str) -> float:
    """Return the fraction of non-whitespace characters in `name` that are digits.

    A falsy or whitespace-only `name` yields 0.0.
    """
    compact = re.sub(r"\s+", "", name or "")
    if not compact:
        return 0.0
    digits = sum(1 for ch in compact if ch.isdigit())
    return digits / len(compact)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# Honorifics/titles are NOT first names — strip them so "Herr Johann Boedecker"
|
|
73
|
+
# matches "Johann Boedecker", and a title-only "Herr Boedecker" reduces to a bare
|
|
74
|
+
# surname (held for review, not merged on the title).
|
|
75
|
+
_TITLES = {"herr", "frau", "fr", "dr", "prof", "mr", "mrs", "ms", "miss",
|
|
76
|
+
"sir", "dame", "mx", "mme", "mlle", "hr"}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def first_name_tokens(name: str, surname: str) -> list[str]:
    """Return the tokens of `name` without the surname and without honorifics.

    Only the first occurrence of `surname` is removed; any repeat stays unless
    it is itself a title. An empty result means the name is a bare surname.
    """
    remaining = name_tokens(name)
    if surname in remaining:
        # list.remove drops just the first match.
        remaining.remove(surname)
    return [tok for tok in remaining if tok not in _TITLES]
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def first_names_compatible(a: list[str], b: list[str]) -> bool:
    """Tell whether two given-name token lists could name the same person.

    Only the leading token of each list is compared. They are compatible when
    equal, when one is a single-letter initial of the other (w / william), or
    when one of at least three characters is a prefix of the other
    (will / william). Two-character prefixes are rejected as too junk-prone,
    and two distinct full names never match.

    An empty list (bare surname) is never compatible here; the caller deals
    with bare names separately.
    """
    if not (a and b):
        return False  # bare names never auto-bridge via this predicate

    lead_a, lead_b = a[0], b[0]
    if lead_a == lead_b:
        return True

    def abbreviates(short: str, full: str) -> bool:
        # An initial (1 char) or a nickname/prefix (>= 3 chars) of `full`.
        return (len(short) == 1 or len(short) >= 3) and full.startswith(short)

    return abbreviates(lead_a, lead_b) or abbreviates(lead_b, lead_a)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class UnionFind:
    """Minimal disjoint-set over a fixed collection of ids.

    `p` maps every id to its parent; a root is its own parent.
    """

    def __init__(self, ids):
        # Every id starts out as the root of its own singleton set.
        self.p = {node: node for node in ids}

    def find(self, i):
        """Return the root of the set containing `i`, shortening the path walked."""
        parents = self.p
        node = i
        while parents[node] != node:
            # Re-hang the node on its grandparent, then continue from there.
            grandparent = parents[parents[node]]
            parents[node] = grandparent
            node = grandparent
        return node

    def union(self, a, b):
        """Merge the sets containing `a` and `b`; b's root becomes the root."""
        root_a = self.find(a)
        root_b = self.find(b)
        if root_a == root_b:
            return
        self.p[root_a] = root_b
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def has_email(ent: dict) -> bool:
    """Report whether the entity carries an email identity signal.

    True when `attributes` is a dict with a truthy "email" entry, or when any
    string alias is, as a whole, shaped like an email address.
    """
    attrs = ent.get("attributes") or {}
    if isinstance(attrs, dict) and attrs.get("email"):
        return True
    for alias in ent.get("aliases") or []:
        if isinstance(alias, str) and _EMAIL_RE.match(alias):
            return True
    return False
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def master_score(ent: dict, surname: str) -> float:
    """Score how well `ent` would serve as the surviving master of a cluster.

    Bonuses:   +3.0 for an email signal; +2.0 for a name of two or more
               tokens; +0.2 per provenance event (capped at 20 events);
               +0.05 per fact (capped at 40 facts).
    Penalties: -3.0 for a bare surname (no given-name tokens left);
               -5.0 when more than half of the name's characters are digits.
    """
    canonical = ent["canonical_name"]
    given = first_name_tokens(canonical, surname)
    provenance_n = len(ent.get("provenance_event_ids") or [])

    total = 0.0
    if has_email(ent):
        total += 3.0
    if len(name_tokens(canonical)) >= 2:
        total += 2.0
    total += min(provenance_n, 20) * 0.2
    total += min(ent.get("fact_n", 0), 40) * 0.05
    if not given:  # bare surname
        total -= 3.0
    if digit_ratio(canonical) > 0.5:
        total -= 5.0
    return total
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _union(*lists):
|
|
153
|
+
seen = {}
|
|
154
|
+
for lst in lists:
|
|
155
|
+
for x in lst or []:
|
|
156
|
+
seen.setdefault(x, None)
|
|
157
|
+
return list(seen.keys())
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def apply_cluster(cur, conn, arena: str, master: dict, losers: list[dict]) -> dict:
    """Fold `losers` into `master` within ONE transaction.

    Faithful inline of the reviewed fusion_drive _execute_entity_plan +
    build_entity_merge_plan: repoint facts (subject/object) and relationships
    (endpoints, summing weight on the post-repoint (from,to,type) collision),
    accrete aliases+provenance onto the master, write one entity_merges audit
    row per loser (rollback_payload = full loser row), then delete the losers.

    Args:
        cur: open cursor whose rows are dict-like (see `_connect`).
        conn: the connection `cur` belongs to; supplies `transaction()`.
        arena: exact arena value the facts/relationships are scoped to.
        master: full entity row that survives the merge.
        losers: full entity rows folded into the master and then deleted.

    Returns:
        {"applied": False, "reason": ...} when a loser or the master no longer
        exists (nothing is written), otherwise {"applied": True, ...} with the
        repoint / collision / tombstone counts.
    """
    loser_ids = [loser["id"] for loser in losers]
    # Load edges/facts touching the losers AND THE MASTER. Including the master is
    # load-bearing for relationships: a loser edge repointed onto the master can
    # collide with an edge the master ALREADY has (or another loser's) on the
    # UNIQUE(arena,from,to,type) key — if we don't see the master's existing edges
    # in collision detection, the repoint UPDATE hits a duplicate-key violation
    # (caught the hard way on the first Vickers apply; the txn rolled back clean).
    # Facts have no such unique key, so master facts are loaded but never repointed
    # (repoint decisions key on the loser set only).
    targets = loser_ids + [master["id"]]
    cur.execute(
        "SELECT id, subject_entity_id, object_entity_id FROM facts "
        "WHERE arena = %s AND (subject_entity_id = ANY(%s) OR object_entity_id = ANY(%s))",
        (arena, targets, targets),
    )
    facts = cur.fetchall()
    cur.execute(
        "SELECT id, from_entity_id, to_entity_id, relationship_type, weight, "
        "provenance_event_ids FROM relationships "
        "WHERE arena = %s AND (from_entity_id = ANY(%s) OR to_entity_id = ANY(%s))",
        (arena, targets, targets),
    )
    rels = cur.fetchall()

    lset = set(loser_ids)
    aliases = _union(master.get("aliases") or [],
                     [loser["canonical_name"] for loser in losers],
                     *[loser.get("aliases") or [] for loser in losers])
    # The master's own canonical name is not an alias of itself.
    aliases = [a for a in aliases if a != master["canonical_name"]]
    provenance = _union(master.get("provenance_event_ids") or [],
                        *[loser.get("provenance_event_ids") or [] for loser in losers])
    fact_subj = [f["id"] for f in facts if f["subject_entity_id"] in lset]
    fact_obj = [f["id"] for f in facts if f["object_entity_id"] in lset]

    def rk(r):
        # The (from, to, type) key this edge will have AFTER repointing.
        frm = master["id"] if r["from_entity_id"] in lset else r["from_entity_id"]
        to = master["id"] if r["to_entity_id"] in lset else r["to_entity_id"]
        return (frm, to, r["relationship_type"])

    # NOTE(review): an edge between a loser and the master (or between two
    # losers) ends up as a master->master self-loop; nothing here filters that.
    by_key, rel_repoints, rel_collisions = {}, [], []
    # Running weight/provenance per surviving ("keep") edge. Three or more edges
    # can land on the same key (master + several losers); each collision must
    # build on the previous total, otherwise the later UPDATE below overwrites
    # the earlier one and the earlier drop's weight/provenance is lost.
    running = {}
    for r in rels:
        touches = r["from_entity_id"] in lset or r["to_entity_id"] in lset
        key = rk(r)
        keep = by_key.get(key)
        if keep is not None:
            acc = running.setdefault(keep["id"], {
                "weight": keep.get("weight") or 1.0,
                "provenance": list(keep.get("provenance_event_ids") or []),
            })
            acc["weight"] += r.get("weight") or 1.0
            acc["provenance"] = _union(acc["provenance"],
                                       r.get("provenance_event_ids") or [])
            rel_collisions.append({
                "keep": keep["id"], "drop": r["id"],
                "summed_weight": round(acc["weight"], 4),
                "provenance": acc["provenance"],
            })
        else:
            by_key[key] = r
            if touches:
                rel_repoints.append(r["id"])

    with conn.transaction():
        # Re-validate inside the transaction: every loser and the master must
        # still exist, otherwise facts would be repointed onto a missing row.
        cur.execute("SELECT id FROM entities WHERE id = ANY(%s)", (targets,))
        live = {r["id"] for r in cur.fetchall()}
        if not lset <= live:
            return {"applied": False, "reason": "stale: some losers already gone"}
        if master["id"] not in live:
            return {"applied": False, "reason": "stale: master already gone"}
        cur.execute("UPDATE entities SET aliases=%s, provenance_event_ids=%s, last_seen=NOW() "
                    "WHERE id=%s", (aliases, provenance, master["id"]))
        for fid in fact_subj:
            cur.execute("UPDATE facts SET subject_entity_id=%s WHERE id=%s", (master["id"], fid))
        for fid in fact_obj:
            cur.execute("UPDATE facts SET object_entity_id=%s WHERE id=%s", (master["id"], fid))
        # DELETE colliding edges BEFORE repointing — else repointing a "keep" edge
        # onto the master collides with the not-yet-deleted "drop" on the UNIQUE
        # (arena,from,to,type) key. Carry each drop's weight+provenance onto its keep.
        for col in rel_collisions:
            cur.execute("UPDATE relationships SET weight=%s, provenance_event_ids=%s WHERE id=%s",
                        (col["summed_weight"], col["provenance"], col["keep"]))
            cur.execute("DELETE FROM relationships WHERE id=%s", (col["drop"],))
        for rid in rel_repoints:
            cur.execute(
                "UPDATE relationships SET "
                "from_entity_id = CASE WHEN from_entity_id = ANY(%s) THEN %s ELSE from_entity_id END, "
                "to_entity_id = CASE WHEN to_entity_id = ANY(%s) THEN %s ELSE to_entity_id END "
                "WHERE id=%s",
                (loser_ids, master["id"], loser_ids, master["id"], rid))
        for loser in losers:
            # One audit row per loser. The two counts written here are the
            # cluster-wide totals, repeated on every row — not per-loser counts.
            cur.execute(
                "INSERT INTO entity_merges (id, arena, canonical_id, deprecated_id, "
                "deprecated_canonical_name, deprecated_aliases, merge_signal, "
                "facts_repointed, relationships_repointed, merged_by, rollback_payload) "
                "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s::jsonb)",
                ("em_" + uuid.uuid4().hex[:20], arena, master["id"], loser["id"],
                 loser["canonical_name"], loser.get("aliases") or [], "heuristic",
                 len(fact_subj) + len(fact_obj), len(rel_repoints), "fusion-defrag",
                 json.dumps(loser, default=str)))
        cur.execute("DELETE FROM entities WHERE id = ANY(%s)", (loser_ids,))
    return {"applied": True, "facts_repointed": len(fact_subj) + len(fact_obj),
            "rels_repointed": len(rel_repoints), "rel_collisions": len(rel_collisions),
            "tombstoned": len(loser_ids)}
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def main() -> int:
    """CLI entry point: propose (and, with --apply, execute) same-surname merges.

    Loads the PERSON entities in the arenas matching --arena whose canonical
    name contains --surname as a token, clusters the non-bare names with
    union-find on first-name compatibility (never across exact arenas), folds
    bare-surname nodes in only when exactly one cluster exists, picks a scored
    master per cluster, and reports the repoint impact. With --apply (and
    --i-have-a-snapshot) each multi-node cluster is merged via `apply_cluster`.
    With --json the proposals are also written to that path.

    Returns:
        0 on success (including "no fragments found"); 2 when --apply is given
        without --i-have-a-snapshot, when no DSN is available, or when
        --surname is not a single [a-z0-9] token.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--arena", required=True, help="arena LIKE filter (REQUIRED)")
    ap.add_argument("--surname", required=True, help="surname token to scope (e.g. vickers)")
    ap.add_argument("--pg-dsn", default="", help="Postgres DSN (or PG_DSN env)")
    ap.add_argument("--json", help="write the proposal as JSON")
    ap.add_argument("--apply", action="store_true",
                    help="EXECUTE the merges (default: dry-run). Requires --i-have-a-snapshot.")
    ap.add_argument("--i-have-a-snapshot", action="store_true",
                    help="operator asserts a DB/row snapshot exists for rollback (required with --apply)")
    args = ap.parse_args()
    if args.apply and not args.i_have_a_snapshot:
        print("REFUSED: --apply requires --i-have-a-snapshot (take a snapshot first; "
              "merges repoint facts/rels + tombstone nodes — entity_merges holds rollback "
              "payloads but a row/DB snapshot is the real safety net).", file=sys.stderr)
        return 2

    import os
    dsn = args.pg_dsn or os.environ.get("PG_DSN", "")
    if not dsn:
        print("FATAL: no --pg-dsn / PG_DSN", file=sys.stderr)
        return 2
    surname = args.surname.strip().lower()
    # The surname is spliced into a regex and must later equal one of
    # name_tokens()'s [a-z0-9] tokens. Anything else (spaces, hyphens,
    # apostrophes, non-ASCII letters) could never match a token and would only
    # ever report "0 person fragments", so refuse it up front.
    if name_tokens(surname) != [surname]:
        print(f"FATAL: --surname must be a single [a-z0-9] token, got {args.surname!r}",
              file=sys.stderr)
        return 2

    with _connect(dsn) as conn:
        with conn.cursor() as cur:
            if not args.apply:
                cur.execute("SET default_transaction_read_only = on")  # dry-run safety
            cur.execute("SET max_parallel_workers_per_gather = 0")
            # default_transaction_read_only only governs transactions that START
            # after it is set, and the statements above already run inside the
            # transaction opened implicitly for them. Commit so the session
            # settings persist and the probe queries below run in a fresh
            # transaction that really is read-only in dry-run mode.
            conn.commit()
            # Person fragments whose CANONICAL NAME carries the surname as a real
            # token. Critically NOT alias-scoped: an @vickers-oil.com email domain
            # in aliases would otherwise drag in unrelated employees (Paul Vann,
            # Matt Tooze) who merely WORK at a Vickers company — the over-merge the
            # first dry-run caught. Email-named stubs (canonical_name has '@') are
            # also excluded: tokenizing an email is not a safe surname signal (they
            # converge via the alias-resolution path instead).
            cur.execute(
                """
                SELECT e.id, e.arena, e.canonical_name, e.aliases, e.provenance_event_ids,
                e.attributes, e.last_seen,
                (SELECT count(*) FROM facts f
                WHERE f.provenance_event_ids && e.provenance_event_ids
                AND (f.subject_entity_id = e.id OR f.object_entity_id = e.id)) AS fact_n
                FROM entities e
                WHERE e.arena LIKE %s AND e.entity_type = 'person'
                AND position('@' in e.canonical_name) = 0
                AND lower(e.canonical_name) ~ %s
                """,
                (args.arena, rf"(^|[^a-z]){surname}([^a-z]|$)"),
            )
            ents = cur.fetchall()
            # Confirm the surname is a real NAME token (regex is a coarse guard).
            ents = [e for e in ents if surname in name_tokens(e["canonical_name"])]
            print(f"[defrag] arena={args.arena} surname={surname!r}: "
                  f"{len(ents)} person fragments")
            if not ents:
                return 0

            # Given-name tokens per entity, computed once instead of per pair.
            given = {e["id"]: first_name_tokens(e["canonical_name"], surname) for e in ents}
            non_bare = [e for e in ents if given[e["id"]]]
            bare = [e for e in ents if not given[e["id"]]]

            # Union-find over compatible non-bare names.
            uf = UnionFind([e["id"] for e in non_bare])
            for i, a in enumerate(non_bare):
                for b in non_bare[i + 1:]:
                    # Never union across exact arenas — entity id = hash(arena|
                    # type|name), so cross-arena same-name nodes are genuinely
                    # different scoped entities; merging them would be wrong.
                    if a["arena"] != b["arena"]:
                        continue
                    if first_names_compatible(given[a["id"]], given[b["id"]]):
                        uf.union(a["id"], b["id"])
            clusters: dict[str, list[dict]] = defaultdict(list)
            for e in non_bare:
                clusters[uf.find(e["id"])].append(e)

            # Bare surnames: fold in ONLY if exactly one non-bare cluster exists.
            bare_note = ""
            if bare:
                if len(clusters) == 1:
                    only = next(iter(clusters))
                    cl_arena = clusters[only][0]["arena"]
                    # Only bare nodes from the cluster's own arena are folded.
                    same = [b for b in bare if b["arena"] == cl_arena]
                    clusters[only].extend(same)
                    bare_note = f"{len(same)} bare-'{surname}' node(s) folded into the single cluster"
                else:
                    bare_note = (f"{len(bare)} bare-'{surname}' node(s) LEFT FOR REVIEW "
                                 f"({len(clusters)} distinct name-clusters — ambiguous which person)")

            proposals = []
            for members in clusters.values():
                if len(members) < 2:
                    continue
                # Highest score wins; ties fall to more provenance, then the
                # longer canonical name.
                master = max(members, key=lambda e: (master_score(e, surname),
                                                     len(e.get("provenance_event_ids") or []),
                                                     len(e["canonical_name"])))
                losers = [e for e in members if e["id"] != master["id"]]
                loser_ids = [loser["id"] for loser in losers]
                # Repoint impact (read-only counts).
                cur.execute(
                    "SELECT count(*) AS n FROM facts WHERE arena LIKE %s AND "
                    "(subject_entity_id = ANY(%s) OR object_entity_id = ANY(%s))",
                    (args.arena, loser_ids, loser_ids),
                )
                facts_repointed = cur.fetchone()["n"]
                cur.execute(
                    "SELECT count(*) AS n FROM relationships WHERE arena LIKE %s AND "
                    "(from_entity_id = ANY(%s) OR to_entity_id = ANY(%s))",
                    (args.arena, loser_ids, loser_ids),
                )
                rels_repointed = cur.fetchone()["n"]
                proposals.append({
                    "arena": master["arena"],
                    "master_row": master,  # full row for apply
                    "loser_rows": losers,
                    "master": {"id": master["id"], "name": master["canonical_name"],
                               "facts": master.get("fact_n", 0),
                               "prov": len(master.get("provenance_event_ids") or []),
                               "email": has_email(master),
                               "score": round(master_score(master, surname), 2)},
                    "losers": [{"id": loser["id"], "name": loser["canonical_name"],
                                "facts": loser.get("fact_n", 0),
                                "prov": len(loser.get("provenance_event_ids") or []),
                                "email": has_email(loser)} for loser in losers],
                    "facts_repointed": facts_repointed,
                    "rels_repointed": rels_repointed,
                })

            if args.apply and proposals:
                conn.rollback()  # end the read-only probe txn cleanly before writes
                print(f"\n[defrag] APPLYING {len(proposals)} cluster(s) — arena-scoped, transactional…")
                for p in proposals:
                    p["apply_result"] = apply_cluster(cur, conn, p["arena"],
                                                      p["master_row"], p["loser_rows"])
                    print(f" master={p['master']['name']!r} ({p['arena']}): {p['apply_result']}")

    # ---- report ----
    mode = "APPLIED" if args.apply else "PROPOSED (dry-run, no writes)"
    print(f"\n=== {mode} MERGES — surname '{surname}' ===")
    if bare_note:
        print(f" note: {bare_note}")
    if not proposals:
        print(" (no multi-node clusters — nothing to merge)")
    tot_dep = tot_f = tot_r = 0
    for i, p in enumerate(proposals, 1):
        m = p["master"]
        print(f"\n[{i}] MASTER ← {m['name']!r} ({m['id'][:10]}…) "
              f"score={m['score']} facts={m['facts']} prov={m['prov']} email={m['email']}")
        for loser in p["losers"]:
            print(f" merge: {loser['name']!r} ({loser['id'][:10]}…) "
                  f"facts={loser['facts']} prov={loser['prov']} email={loser['email']}")
        print(f" → would repoint {p['facts_repointed']} facts, "
              f"{p['rels_repointed']} relationships onto the master; "
              f"{len(p['losers'])} node(s) tombstoned")
        tot_dep += len(p["losers"])
        tot_f += p["facts_repointed"]
        tot_r += p["rels_repointed"]
    tail = ("APPLIED — rollback via entity_merges (merged_by='fusion-defrag') + snapshot."
            if args.apply else "DRY-RUN — nothing written.")
    print(f"\nTOTAL: {len(proposals)} cluster(s), {tot_dep} nodes tombstoned, "
          f"{tot_f} facts + {tot_r} relationships repointed. {tail}")
    if args.json:
        with open(args.json, "w", encoding="utf-8") as f:
            # The full entity rows are only needed for apply; keep them out of
            # the written proposal.
            json.dump({"surname": surname, "arena": args.arena, "applied": args.apply,
                       "bare_note": bare_note,
                       "proposals": [{k: v for k, v in p.items()
                                      if k not in ("master_row", "loser_rows")}
                                     for p in proposals]}, f, indent=2)
    return 0
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
if __name__ == "__main__":
    # Run as a script: main()'s return value becomes the process exit status.
    sys.exit(main())
|