@pentatonic-ai/ai-agent-sdk 0.10.7 → 0.10.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,264 @@
1
+ #!/usr/bin/env python3
2
+ """Fusion Drive — fusion pass (detect duplicate nodes, fuse into a master).
3
+
4
+ Detects duplicate/near-duplicate entities (and exact-triple duplicate facts)
5
+ within an arena and merges each set into a single master, repointing facts +
6
+ relationships, unioning aliases/provenance, and writing reversible audit rows
7
+ (entity_merges / fact_merges).
8
+
9
+ Detection here is the TRACTABLE, no-LLM tier:
10
+ - entities: exact normalized-name dupes (alias overlap is not yet used as
11
+ a grouping key), AND cross-run shared-provenance dupes (two same-type
12
+ entities citing the same event where one is junk-leaning — the 7B
13
+ numeric-ID vs new-teacher real-name case). Embedding-band + LLM
14
+ adjudication detection stays in entity_resolution_v2.py (#82); its
15
+ proposals are not consumed yet: this script defines no --proposals flag.
16
+ - facts: exact (subject, predicate, object) triples.
17
+ Master selection uses fusion_drive.canonical (directory-anchored scoring),
18
+ NOT richest-row-wins.
19
+
20
+ Safety: arena-scoped (required), DRY-RUN default, --apply to merge, each merge
21
+ its own transaction, every deprecated row recoverable from the audit table,
22
+ restricted disclosure never auto-merged.
23
+
24
+ Usage:
25
+ fusion_drive_fuse.py --arena 'X' # dry-run: list merge proposals
26
+ fusion_drive_fuse.py --arena 'X' --apply # execute merges (reversible)
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ import argparse
32
+ import json
33
+ import os
34
+ import sys
35
+ import uuid
36
+
37
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "fusion_drive"))
38
+ import canonical as C # noqa: E402
39
+ from merge import build_entity_merge_plan, build_fact_merge_plan # noqa: E402
40
+
41
+ try:
42
+ import psycopg
43
+ from psycopg.rows import dict_row
44
+ except ModuleNotFoundError:
45
+ print("psycopg required", file=sys.stderr)
46
+ raise
47
+
48
+
49
def _norm(s: str) -> str:
    """Return *s* lower-cased with each whitespace run collapsed to one space.

    Leading and trailing whitespace disappears as well, because the string is
    split on whitespace and re-joined with single spaces.
    """
    tokens = s.lower().split()
    return " ".join(tokens)
51
+
52
+
53
def _entity_dup_sets(cur, arena: str) -> list[list[dict]]:
    """Return disjoint groups (>= 2 rows each) of likely-duplicate entities.

    Loads every entity of *arena* whose disclosure_class is not 'restricted'
    and proposes two kinds of group:

    1. same entity_type + identical normalized canonical_name;
    2. same entity_type + a shared provenance event, where exactly one member
       has a real name and the others look like IDs (``C.looks_like_id``).

    No entity id appears in more than one returned group. The groups are
    computed from a single snapshot, so a group overlapping an earlier one
    would be merged from rows that the earlier merge already rewrote or
    deleted. An overlapping group is dropped here; it is detected again on
    the next run, once the first merge has landed.

    Args:
        cur: open cursor whose rows support ``row["column"]`` access.
        arena: arena to scan.

    Returns:
        Exact-name groups followed by shared-provenance groups. In a
        shared-provenance group the single non-junk member comes first.
    """
    cur.execute(
        """SELECT id, entity_type, canonical_name, aliases, provenance_event_ids, disclosure_class
           FROM entities WHERE arena = %s AND disclosure_class <> 'restricted'""",
        (arena,),
    )
    ents = cur.fetchall()

    # 1. exact normalized-name within (type)
    by_name: dict[tuple, list[dict]] = {}
    for e in ents:
        raw_name = e["canonical_name"]
        name = _norm(raw_name) if raw_name else ""
        if not name:
            # A missing/blank name is absence of evidence, not a match: never
            # treat all unnamed entities of one type as duplicates.
            continue
        by_name.setdefault((e["entity_type"], name), []).append(e)
    exact = [g for g in by_name.values() if len(g) > 1]
    # ids already promised to a group; keeps the returned groups disjoint
    claimed: set = {m["id"] for g in exact for m in g}

    # 2. cross-run shared-provenance: same type + same event in provenance,
    #    where some members are junk-leaning (looks-like-id) — catches
    #    name-divergent dupes like "1716801984" vs "Katie Cooper" that never
    #    block on name.
    #
    #    OVER-MERGE GUARD: a single event can legitimately mention several
    #    distinct same-type entities (an email naming Alice, Bob, AND a
    #    numeric-ID node). Merging the whole co-occurrence group would
    #    conflate Alice and Bob. So the no-LLM tier ONLY proposes when the
    #    group has EXACTLY ONE non-junk member: we fold the junk node(s) into
    #    that unambiguous real master. Groups with 0 or >=2 non-junk members
    #    are ambiguous and deferred to the LLM-adjudicated tier
    #    (entity_resolution_v2.py) rather than auto-merged.
    by_event_type: dict[tuple, list[dict]] = {}
    for e in ents:
        # dict.fromkeys: drop repeated event ids (an entity must not be listed
        # twice in one co-occurrence group) while keeping a stable order.
        for ev in dict.fromkeys(e["provenance_event_ids"] or []):
            by_event_type.setdefault((e["entity_type"], ev), []).append(e)

    cross: list[list[dict]] = []
    for members in by_event_type.values():
        if len(members) < 2:
            continue
        junk: list[dict] = []
        non_junk: list[dict] = []
        for m in members:
            (junk if C.looks_like_id(m["canonical_name"]) else non_junk).append(m)
        if not junk or len(non_junk) != 1:
            continue  # need junk to clean AND exactly one unambiguous master
        group = non_junk + junk
        ids = {m["id"] for m in group}
        if ids & claimed:
            # Also covers the "same group seen via another event" case.
            # NOTE(review): a junk node that co-occurs with different lone
            # masters in different events is folded into whichever group is
            # met first — consider deferring such nodes to the LLM tier.
            continue
        claimed |= ids
        cross.append(group)
    return exact + cross
102
+
103
+
104
def _candidates(group: list[dict]) -> list[C.CanonicalCandidate]:
    """Wrap each entity row of *group* in a ``C.CanonicalCandidate``.

    Only the id, canonical name, alias list and provenance count are passed;
    every other candidate field keeps whatever default the class defines.
    """
    scored = []
    for row in group:
        provenance = row["provenance_event_ids"] or []
        alias_list = row["aliases"] or []
        # in_directory / grounded / from_current_teacher would be resolved
        # from an authority table + provenance content + trace llm_model;
        # left False here (no-LLM tier) so scoring leans on grounding-by-
        # corroboration + the ID/bare penalties. Wire authority in a
        # follow-up — the scoring already supports it.
        scored.append(
            C.CanonicalCandidate(
                entity_id=row["id"],
                canonical_name=row["canonical_name"],
                n_provenance=len(provenance),
                aliases=alias_list,
            )
        )
    return scored
119
+
120
+
121
def _touching(cur, arena: str, loser_ids: list[str]) -> tuple[list[dict], list[dict]]:
    """Fetch the facts and relationships of *arena* that reference a loser.

    Args:
        cur: open cursor.
        arena: arena the rows must belong to.
        loser_ids: ids of the entities about to be merged away.

    Returns:
        ``(facts, relationships)``: facts whose subject or object is a loser,
        and relationships whose from/to endpoint is a loser. Both lists are
        empty, and no query is issued, when *loser_ids* is empty.
    """
    if not loser_ids:
        return [], []
    ids = list(loser_ids)
    cur.execute(
        """SELECT id, subject_entity_id, object_entity_id FROM facts
           WHERE arena = %s AND (subject_entity_id = ANY(%s) OR object_entity_id = ANY(%s))""",
        (arena, ids, ids),
    )
    facts = cur.fetchall()
    # NOTE(review): this predicate used to be written twice with the same id
    # list, which selects exactly the same rows as writing it once. If the
    # second pair was meant to bind the master's id (so the merge planner can
    # see collisions with relationships the master already has), that needs
    # a master id passed in — confirm against build_entity_merge_plan.
    cur.execute(
        """SELECT id, from_entity_id, to_entity_id, relationship_type, weight, provenance_event_ids
           FROM relationships WHERE arena = %s
             AND (from_entity_id = ANY(%s) OR to_entity_id = ANY(%s))""",
        (arena, ids, ids),
    )
    rels = cur.fetchall()
    return facts, rels
137
+
138
+
139
def _execute_entity_plan(cur, plan) -> None:
    """Write one entity merge plan through *cur*.

    Statement order: update the master row, repoint facts, repoint
    relationship endpoints, resolve relationship collisions, insert the
    entity_merges audit rows, then delete the deprecated entities. Nothing is
    committed here; the caller owns the transaction.

    Args:
        cur: open cursor.
        plan: object exposing master_id, master_aliases, master_provenance,
            fact_subject_repoints, fact_object_repoints,
            rel_endpoint_repoints, rel_collisions, audit_rows and
            deprecated_entity_ids.
    """
    # master row
    cur.execute(
        "UPDATE entities SET aliases = %s, provenance_event_ids = %s, last_seen = NOW() WHERE id = %s",
        (plan.master_aliases, plan.master_provenance, plan.master_id),
    )

    # One set-based UPDATE per side instead of one round trip per fact.
    subject_ids = list(plan.fact_subject_repoints)
    if subject_ids:
        cur.execute("UPDATE facts SET subject_entity_id = %s WHERE id = ANY(%s)",
                    (plan.master_id, subject_ids))
    object_ids = list(plan.fact_object_repoints)
    if object_ids:
        cur.execute("UPDATE facts SET object_entity_id = %s WHERE id = ANY(%s)",
                    (plan.master_id, object_ids))

    # Either endpoint (or both) may be a deprecated entity; the CASE leaves
    # the other endpoint untouched.
    for rid in plan.rel_endpoint_repoints:
        cur.execute(
            """UPDATE relationships SET
                 from_entity_id = CASE WHEN from_entity_id = ANY(%s) THEN %s ELSE from_entity_id END,
                 to_entity_id = CASE WHEN to_entity_id = ANY(%s) THEN %s ELSE to_entity_id END
               WHERE id = %s""",
            (plan.deprecated_entity_ids, plan.master_id,
             plan.deprecated_entity_ids, plan.master_id, rid),
        )

    # Colliding relationships: the kept row takes the summed weight and the
    # planned provenance, the other row is removed.
    for col in plan.rel_collisions:
        cur.execute("UPDATE relationships SET weight = %s, provenance_event_ids = %s WHERE id = %s",
                    (col["summed_weight"], col["provenance"], col["keep"]))
        cur.execute("DELETE FROM relationships WHERE id = %s", (col["drop"],))

    # NOTE(review): facts_repointed is the plan-wide total and is written on
    # every audit row, not a per-deprecated-entity count.
    facts_repointed = len(subject_ids) + len(object_ids)
    for a in plan.audit_rows:
        cur.execute(
            """INSERT INTO entity_merges (id, arena, canonical_id, deprecated_id,
                 deprecated_canonical_name, deprecated_aliases, merge_signal,
                 facts_repointed, rollback_payload)
               VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s::jsonb)""",
            ("em_" + uuid.uuid4().hex[:20], a["arena"], a["canonical_id"], a["deprecated_id"],
             a["deprecated_canonical_name"], a["deprecated_aliases"], a["merge_signal"],
             facts_repointed,
             json.dumps(a["rollback_payload"], default=str)),
        )

    # Audit rows are written before the rows they describe are deleted.
    cur.execute("DELETE FROM entities WHERE id = ANY(%s)", (plan.deprecated_entity_ids,))
172
+
173
+
174
def _dedup_master_facts(cur, arena: str, master_id: str) -> int:
    """Fuse facts touching the master that share one (subject, predicate, object).

    After repointing facts onto the master, several facts can carry the same
    triple but different statements (fact id is content_id(arena, statement),
    so they didn't collapse on insert). Each such triple-group is fused via
    build_fact_merge_plan: the plan's master fact receives the unioned
    provenance, the plan's deprecated facts are deleted, and one fact_merges
    row is written per audit row.

    Facts are grouped on their REAL subject. The query also returns facts
    where the master is only the object, so anchoring every key on the master
    would fuse distinct facts such as (Alice, p, master) and (Bob, p, master).

    Args:
        cur: open cursor; nothing is committed here.
        arena: arena the facts belong to.
        master_id: entity that just absorbed its duplicates.

    Returns:
        Number of fact rows deleted as duplicates.
    """
    cur.execute(
        """SELECT id, subject_entity_id, predicate, object_entity_id, statement,
                  confidence, provenance_event_ids
           FROM facts
           WHERE arena = %s AND (subject_entity_id = %s OR object_entity_id = %s)""",
        (arena, master_id, master_id),
    )
    rows = cur.fetchall()
    groups: dict[tuple, list[dict]] = {}
    for r in rows:
        # NOTE(review): facts with a NULL object_entity_id still group on
        # (subject, predicate, None) — confirm such facts really are
        # duplicates and not distinct literal-valued facts.
        key = (r["subject_entity_id"], r["predicate"], r["object_entity_id"])
        groups.setdefault(key, []).append(r)
    deduped = 0
    for dup in groups.values():
        if len(dup) < 2:
            continue  # a triple held by a single fact has nothing to fuse
        plan = build_fact_merge_plan(arena=arena, dup_facts=dup)
        if not plan:
            continue
        cur.execute("UPDATE facts SET provenance_event_ids = %s WHERE id = %s",
                    (plan["master_provenance"], plan["master_id"]))
        for a in plan["audit_rows"]:
            cur.execute(
                """INSERT INTO fact_merges (id, arena, canonical_id, deprecated_id,
                     deprecated_statement, merge_signal, provenance_unioned, rollback_payload)
                   VALUES (%s,%s,%s,%s,%s,%s,%s,%s::jsonb)""",
                ("fm_" + uuid.uuid4().hex[:20], a["arena"], a["canonical_id"], a["deprecated_id"],
                 a["deprecated_statement"], a["merge_signal"], a["provenance_unioned"],
                 json.dumps(a["rollback_payload"], default=str)),
            )
        cur.execute("DELETE FROM facts WHERE id = ANY(%s)", (plan["deprecated_ids"],))
        deduped += len(plan["deprecated_ids"])
    return deduped
210
+
211
+
212
def main() -> int:
    """CLI entry point: propose duplicate-entity merges and, with --apply, run them.

    Without --apply each proposal is only printed. With --apply each proposal
    is executed and committed on its own, followed by a fact dedup on the
    master. Either way one fusion_drive_runs ledger row is written at the end.

    All groups are detected from one snapshot taken before any merge. A group
    that shares an entity with a proposal already handled in this run is
    therefore deferred: executing it would plan from rows the earlier merge
    has rewritten or deleted. A later run re-detects it from fresh data.

    Returns:
        0 on success, 2 when no DSN is given via --pg-dsn or PG_DSN.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--arena", required=True)
    ap.add_argument("--pg-dsn", default=os.environ.get("PG_DSN"))
    ap.add_argument("--apply", action="store_true", help="execute merges (default: dry-run)")
    args = ap.parse_args()
    if not args.pg_dsn:
        print("PG_DSN required", file=sys.stderr)
        return 2

    proposals = 0
    merged = 0
    deferred = 0
    touched: set = set()  # entity ids belonging to a proposal already handled
    with psycopg.connect(args.pg_dsn, row_factory=dict_row) as conn:
        with conn.cursor() as cur:
            groups = _entity_dup_sets(cur, args.arena)
            for group in groups:
                group_ids = {e["id"] for e in group}
                if group_ids & touched:
                    # Applied in dry-run too, so the listing matches what
                    # --apply would actually execute.
                    deferred += 1
                    print(f" DEFER → group {sorted(group_ids)} overlaps an earlier "
                          f"proposal in this run; re-run to pick it up")
                    continue
                master_c, losers_c = C.pick_master(_candidates(group))
                loser_ids = [loser.entity_id for loser in losers_c]
                if not loser_ids:
                    continue
                touched |= group_ids
                proposals += 1
                by_id = {e["id"]: e for e in group}
                master = by_id[master_c.entity_id]
                losers = [by_id[i] for i in loser_ids]
                facts, rels = _touching(cur, args.arena, loser_ids)
                plan = build_entity_merge_plan(
                    arena=args.arena, master=master, losers=losers, facts=facts, relationships=rels)
                print(f" MERGE → master '{master['canonical_name']}' ({master['id']}) "
                      f"absorbs {[loser['canonical_name'] for loser in losers]} "
                      f"[facts:{len(plan.fact_subject_repoints)+len(plan.fact_object_repoints)} "
                      f"rels:{len(plan.rel_endpoint_repoints)} collisions:{len(plan.rel_collisions)}]")
                if args.apply:
                    _execute_entity_plan(cur, plan)
                    _dedup_master_facts(cur, args.arena, master["id"])
                    merged += len(loser_ids)
                    conn.commit()  # per-merge: a bad merge can't roll back the good ones, and locks stay short
            run_id = "fdr_" + uuid.uuid4().hex[:20]
            # The ledger row is written for dry runs as well (mode 'dry_run').
            cur.execute(
                """INSERT INTO fusion_drive_runs (id, arena, pass_kind, mode, scanned, changed, detail, finished_at)
                   VALUES (%s,%s,'fusion',%s,%s,%s,%s::jsonb,NOW())""",
                (run_id, args.arena, "apply" if args.apply else "dry_run",
                 proposals, merged,
                 json.dumps({"proposals": proposals, "merged": merged,
                             "deferred_overlap": deferred})),
            )
            conn.commit()

    label = "APPLY (merged, reversible via entity_merges)" if args.apply else "DRY-RUN"
    print(f"[fusion-drive:fuse] {label} arena={args.arena}: {proposals} proposal(s), {merged} entities merged")
    print(f" ledger: {run_id}")
    return 0
261
+
262
+
263
if __name__ == "__main__":
    # Hand main()'s integer result to the interpreter as the exit status.
    sys.exit(main())