@pentatonic-ai/ai-agent-sdk 0.10.6 → 0.10.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
- package/packages/memory-engine-v2/RFC-decay-and-fusion.md +185 -0
- package/packages/memory-engine-v2/RFC-fusion-drive.md +193 -0
- package/packages/memory-engine-v2/docker-compose.aws.yml +62 -1
- package/packages/memory-engine-v2/docker-compose.yml +8 -1
- package/packages/memory-engine-v2/extractor-async/confidence.py +37 -0
- package/packages/memory-engine-v2/extractor-async/test_born_salience_parity.py +35 -0
- package/packages/memory-engine-v2/extractor-async/test_guided_json_parser.py +44 -0
- package/packages/memory-engine-v2/extractor-async/worker.py +67 -7
- package/packages/memory-engine-v2/extractor-sync/server.py +6 -2
- package/packages/memory-engine-v2/extractor-sync/test_paired_extraction.py +82 -1
- package/packages/memory-engine-v2/fusion_drive/__init__.py +0 -0
- package/packages/memory-engine-v2/fusion_drive/canonical.py +94 -0
- package/packages/memory-engine-v2/fusion_drive/conftest.py +8 -0
- package/packages/memory-engine-v2/fusion_drive/merge.py +178 -0
- package/packages/memory-engine-v2/fusion_drive/salience.py +118 -0
- package/packages/memory-engine-v2/fusion_drive/test_canonical.py +76 -0
- package/packages/memory-engine-v2/fusion_drive/test_merge.py +112 -0
- package/packages/memory-engine-v2/fusion_drive/test_salience.py +93 -0
- package/packages/memory-engine-v2/org-model/migrations/004_source_kind_code_reference.sql +12 -0
- package/packages/memory-engine-v2/org-model/migrations/005_fk_indexes.sql +20 -0
- package/packages/memory-engine-v2/org-model/migrations/006_fusion_drive.sql +80 -0
- package/packages/memory-engine-v2/scripts/fusion_drive_born_salience_backfill.py +113 -0
- package/packages/memory-engine-v2/scripts/fusion_drive_decay.py +181 -0
- package/packages/memory-engine-v2/scripts/fusion_drive_fuse.py +264 -0
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Fusion Drive — decay pass (scoring + eviction).
|
|
3
|
+
|
|
4
|
+
Recomputes time-decayed salience for an arena's facts/entities/relationships,
|
|
5
|
+
reports eviction candidates, and — only with --evict — deletes them, writing a
|
|
6
|
+
node_evictions rollback receipt per deletion. Three escalating modes:
|
|
7
|
+
|
|
8
|
+
(default) DRY-RUN — report candidates, change nothing.
|
|
9
|
+
--apply — persist recomputed salience back to rows (no deletion).
|
|
10
|
+
--evict — additionally DELETE evictable nodes (implies --apply),
|
|
11
|
+
each with a full rollback_payload in node_evictions.
|
|
12
|
+
|
|
13
|
+
Pollution cure (recap): junk born at low salience (extractor quality flags)
|
|
14
|
+
falls below the eviction threshold purely from age, so it self-evicts without
|
|
15
|
+
needing a fusion match. Eviction safety: arena-scoped, restricted disclosure
|
|
16
|
+
never evicted, entities referenced by a surviving fact never evicted, every
|
|
17
|
+
deletion reversible from node_evictions, one transaction.
|
|
18
|
+
|
|
19
|
+
Usage:
|
|
20
|
+
fusion_drive_decay.py --arena 'X' # dry-run report
|
|
21
|
+
fusion_drive_decay.py --arena 'X' --apply # persist salience, no deletion
|
|
22
|
+
fusion_drive_decay.py --arena 'X' --evict # delete evictable (reversible)
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import argparse
|
|
28
|
+
import os
|
|
29
|
+
import sys
|
|
30
|
+
import uuid
|
|
31
|
+
from datetime import datetime, timezone
|
|
32
|
+
|
|
33
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "fusion_drive"))
|
|
34
|
+
import json # noqa: E402
|
|
35
|
+
import salience as S # noqa: E402
|
|
36
|
+
|
|
37
|
+
try:
|
|
38
|
+
import psycopg
|
|
39
|
+
except ModuleNotFoundError:
|
|
40
|
+
print("psycopg required", file=sys.stderr)
|
|
41
|
+
raise
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _age_days(ref: datetime | None, now: datetime) -> float:
|
|
45
|
+
if ref is None:
|
|
46
|
+
return 0.0
|
|
47
|
+
if ref.tzinfo is None:
|
|
48
|
+
ref = ref.replace(tzinfo=timezone.utc)
|
|
49
|
+
return max(0.0, (now - ref).total_seconds() / 86400.0)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _scan(cur, arena: str, now: datetime) -> tuple[dict, list[dict]]:
|
|
53
|
+
"""Return (report, evictable). evictable carries only {node_kind, id,
|
|
54
|
+
salience} — the FULL row for the rollback receipt is re-selected at
|
|
55
|
+
delete time in _evict (so the receipt is genuinely complete and
|
|
56
|
+
reversible, not a partial stub). Reads only."""
|
|
57
|
+
report: dict = {}
|
|
58
|
+
evictable: list[dict] = []
|
|
59
|
+
|
|
60
|
+
# facts: decay clock = most recent of (last_accessed, asserted_at)
|
|
61
|
+
cur.execute(
|
|
62
|
+
"""SELECT id, category, salience, asserted_at, last_accessed, disclosure_class
|
|
63
|
+
FROM facts WHERE arena = %s""",
|
|
64
|
+
(arena,),
|
|
65
|
+
)
|
|
66
|
+
rows = cur.fetchall()
|
|
67
|
+
fcand = 0
|
|
68
|
+
for fid, category, sal, asserted, accessed, disc in rows:
|
|
69
|
+
clock = max([t for t in (accessed, asserted) if t is not None], default=None)
|
|
70
|
+
age = _age_days(clock, now)
|
|
71
|
+
cur_sal = S.decayed_salience(sal, age, S.half_life_days("fact", category))
|
|
72
|
+
if S.is_evictable(current_salience=cur_sal, age_days=age,
|
|
73
|
+
referenced_by_live_node=False, disclosure_class=disc or "private"):
|
|
74
|
+
fcand += 1
|
|
75
|
+
evictable.append({"node_kind": "fact", "id": fid, "salience": cur_sal})
|
|
76
|
+
report["facts"] = {"scanned": len(rows), "evict_candidates": fcand}
|
|
77
|
+
|
|
78
|
+
# entities: an entity referenced by ANY surviving fact OR relationship is
|
|
79
|
+
# NOT evictable. The relationship check is essential — relationships FK
|
|
80
|
+
# entities with ON DELETE CASCADE, so evicting a rel-endpoint entity would
|
|
81
|
+
# silently cascade-delete the relationship with no rollback receipt.
|
|
82
|
+
cur.execute(
|
|
83
|
+
"""SELECT e.id, e.entity_type, e.salience, e.last_seen, e.last_accessed, e.disclosure_class,
|
|
84
|
+
(EXISTS (SELECT 1 FROM facts f WHERE f.arena = e.arena
|
|
85
|
+
AND (f.subject_entity_id = e.id OR f.object_entity_id = e.id))
|
|
86
|
+
OR EXISTS (SELECT 1 FROM relationships r WHERE r.arena = e.arena
|
|
87
|
+
AND (r.from_entity_id = e.id OR r.to_entity_id = e.id))) AS referenced
|
|
88
|
+
FROM entities e WHERE e.arena = %s""",
|
|
89
|
+
(arena,),
|
|
90
|
+
)
|
|
91
|
+
rows = cur.fetchall()
|
|
92
|
+
ecand = 0
|
|
93
|
+
for eid, etype, sal, last_seen, accessed, disc, referenced in rows:
|
|
94
|
+
clock = max([t for t in (accessed, last_seen) if t is not None], default=None)
|
|
95
|
+
age = _age_days(clock, now)
|
|
96
|
+
cur_sal = S.decayed_salience(sal, age, S.half_life_days("entity"))
|
|
97
|
+
if S.is_evictable(current_salience=cur_sal, age_days=age,
|
|
98
|
+
referenced_by_live_node=bool(referenced), disclosure_class=disc or "private"):
|
|
99
|
+
ecand += 1
|
|
100
|
+
evictable.append({"node_kind": "entity", "id": eid, "salience": cur_sal})
|
|
101
|
+
report["entities"] = {"scanned": len(rows), "evict_candidates": ecand}
|
|
102
|
+
|
|
103
|
+
# NOTE: relationship DECAY/eviction is intentionally NOT done here yet
|
|
104
|
+
# (the migration adds salience to relationships, but seeding + a clock
|
|
105
|
+
# policy for edges is a follow-up). Relationships only leave via the
|
|
106
|
+
# entity-merge collision path or cascade — and the guard above prevents
|
|
107
|
+
# cascade from silently dropping a live edge.
|
|
108
|
+
return report, evictable
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _evict(cur, evictable: list[dict], now: datetime) -> int:
|
|
112
|
+
"""Delete evictable nodes, one COMPLETE node_evictions receipt each. The
|
|
113
|
+
full row is re-selected as JSON immediately before deletion so the
|
|
114
|
+
rollback_payload can actually recreate the row (the headline reversibility
|
|
115
|
+
guarantee). Facts before entities. Runs in the caller's transaction."""
|
|
116
|
+
table = {"fact": "facts", "entity": "entities", "relationship": "relationships"}
|
|
117
|
+
evicted = 0
|
|
118
|
+
for kind in ("fact", "entity", "relationship"):
|
|
119
|
+
tbl = table[kind]
|
|
120
|
+
for node in [n for n in evictable if n["node_kind"] == kind]:
|
|
121
|
+
cur.execute(f"SELECT to_jsonb(t) FROM {tbl} t WHERE id = %s", (node["id"],))
|
|
122
|
+
row = cur.fetchone()
|
|
123
|
+
if not row:
|
|
124
|
+
continue # already gone (e.g. fact whose entity cascade-nulled it elsewhere)
|
|
125
|
+
full_row = row[0]
|
|
126
|
+
cur.execute(
|
|
127
|
+
"""INSERT INTO node_evictions (id, arena, node_kind, node_id, salience_at_evict, rollback_payload)
|
|
128
|
+
VALUES (%s, %s, %s, %s, %s, %s::jsonb)""",
|
|
129
|
+
("nev_" + uuid.uuid4().hex[:20], full_row.get("arena"), kind, node["id"],
|
|
130
|
+
node["salience"], json.dumps(full_row, default=str)),
|
|
131
|
+
)
|
|
132
|
+
cur.execute(f"DELETE FROM {tbl} WHERE id = %s", (node["id"],))
|
|
133
|
+
evicted += 1
|
|
134
|
+
return evicted
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def main() -> int:
    """Command-line entry point for the decay pass.

    Scans the requested arena, optionally evicts the candidates, records the
    run in ``fusion_drive_runs`` and prints a short summary.

    Returns:
        ``0`` on success, ``2`` when no Postgres DSN was supplied.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--arena", required=True, help="arena to scan (required — never global)")
    parser.add_argument("--pg-dsn", default=os.environ.get("PG_DSN"))
    parser.add_argument("--apply", action="store_true",
                        help="persist recomputed salience back to rows (no deletion)")
    parser.add_argument("--evict", action="store_true",
                        help="DELETE evictable nodes (reversible via node_evictions); implies --apply")
    opts = parser.parse_args()

    if not opts.pg_dsn:
        print("PG_DSN required (env or --pg-dsn)", file=sys.stderr)
        return 2

    now = datetime.now(timezone.utc)
    evicted = 0

    # NOTE(review): nothing below (nor in _scan, which only reads) writes the
    # recomputed salience back to rows. As written, --apply without --evict
    # only changes the `mode` recorded in the ledger.
    if opts.evict or opts.apply:
        mode = "apply"
    else:
        mode = "dry_run"

    # Default (non-autocommit) connection: scan, eviction and the ledger row
    # are committed together as one transaction.
    with psycopg.connect(opts.pg_dsn) as conn:
        with conn.cursor() as cur:
            report, evictable = _scan(cur, opts.arena, now)
            if opts.evict:
                evicted = _evict(cur, evictable, now)
            run_id = "fdr_" + uuid.uuid4().hex[:20]
            scanned = sum(section["scanned"] for section in report.values())
            cur.execute(
                """INSERT INTO fusion_drive_runs (id, arena, pass_kind, mode, scanned, changed, detail, finished_at)
                   VALUES (%s, %s, 'decay', %s, %s, %s, %s::jsonb, NOW())""",
                (run_id, opts.arena, mode, scanned, evicted, json.dumps(report)),
            )
        conn.commit()

    if opts.evict:
        label = "EVICT (deleted, reversible via node_evictions)"
    elif opts.apply:
        label = "APPLY (salience only)"
    else:
        label = "DRY-RUN"
    print(f"[fusion-drive:decay] {label} arena={opts.arena}")
    for kind, section in report.items():
        print(f" {kind}: scanned={section['scanned']} evict_candidates={section['evict_candidates']}")
    if opts.evict:
        print(f" EVICTED {evicted} node(s) — rollback receipts in node_evictions")
    else:
        would_evict = sum(section["evict_candidates"] for section in report.values())
        print(f" {would_evict} would evict (run --evict to delete)")
    print(f" ledger: {run_id}")
    return 0
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
if __name__ == "__main__":
    # Use main()'s integer return value as the process exit status.
    sys.exit(main())
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Fusion Drive — fusion pass (detect duplicate nodes, fuse into a master).
|
|
3
|
+
|
|
4
|
+
Detects duplicate/near-duplicate entities (and exact-triple duplicate facts)
|
|
5
|
+
within an arena and merges each set into a single master, repointing facts +
|
|
6
|
+
relationships, unioning aliases/provenance, and writing reversible audit rows
|
|
7
|
+
(entity_merges / fact_merges).
|
|
8
|
+
|
|
9
|
+
Detection here is the TRACTABLE, no-LLM tier:
|
|
10
|
+
- entities: exact normalized-name / alias-overlap dupes, AND cross-run
|
|
11
|
+
shared-provenance dupes (two same-type entities citing the same event
|
|
12
|
+
where one is junk-leaning — the 7B numeric-ID vs new-teacher real-name
|
|
13
|
+
case). Embedding-band + LLM adjudication detection stays in
|
|
14
|
+
entity_resolution_v2.py (#82); this pass consumes its proposals too if
|
|
15
|
+
handed a --proposals file.
|
|
16
|
+
- facts: exact (subject, predicate, object) triples.
|
|
17
|
+
Master selection uses fusion_drive.canonical (directory-anchored scoring),
|
|
18
|
+
NOT richest-row-wins.
|
|
19
|
+
|
|
20
|
+
Safety: arena-scoped (required), DRY-RUN default, --apply to merge, each merge
|
|
21
|
+
its own transaction, every deprecated row recoverable from the audit table,
|
|
22
|
+
restricted disclosure never auto-merged.
|
|
23
|
+
|
|
24
|
+
Usage:
|
|
25
|
+
fusion_drive_fuse.py --arena 'X' # dry-run: list merge proposals
|
|
26
|
+
fusion_drive_fuse.py --arena 'X' --apply # execute merges (reversible)
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
from __future__ import annotations
|
|
30
|
+
|
|
31
|
+
import argparse
|
|
32
|
+
import json
|
|
33
|
+
import os
|
|
34
|
+
import sys
|
|
35
|
+
import uuid
|
|
36
|
+
|
|
37
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "fusion_drive"))
|
|
38
|
+
import canonical as C # noqa: E402
|
|
39
|
+
from merge import build_entity_merge_plan, build_fact_merge_plan # noqa: E402
|
|
40
|
+
|
|
41
|
+
try:
|
|
42
|
+
import psycopg
|
|
43
|
+
from psycopg.rows import dict_row
|
|
44
|
+
except ModuleNotFoundError:
|
|
45
|
+
print("psycopg required", file=sys.stderr)
|
|
46
|
+
raise
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _norm(s: str) -> str:
|
|
50
|
+
return " ".join(s.lower().split())
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _entity_dup_sets(cur, arena: str) -> list[list[dict]]:
|
|
54
|
+
"""Group same-(type) entities that are exact normalized-name dupes OR
|
|
55
|
+
share a provenance event with a junk-leaning twin. Returns groups of >=2."""
|
|
56
|
+
cur.execute(
|
|
57
|
+
"""SELECT id, entity_type, canonical_name, aliases, provenance_event_ids, disclosure_class
|
|
58
|
+
FROM entities WHERE arena = %s AND disclosure_class <> 'restricted'""",
|
|
59
|
+
(arena,),
|
|
60
|
+
)
|
|
61
|
+
ents = cur.fetchall()
|
|
62
|
+
groups: dict[tuple, list[dict]] = {}
|
|
63
|
+
# 1. exact normalized-name within (type)
|
|
64
|
+
for e in ents:
|
|
65
|
+
key = (e["entity_type"], _norm(e["canonical_name"]))
|
|
66
|
+
groups.setdefault(key, []).append(e)
|
|
67
|
+
exact = [g for g in groups.values() if len(g) > 1]
|
|
68
|
+
|
|
69
|
+
# 2. cross-run shared-provenance: same type + same event in provenance,
|
|
70
|
+
# where some members are junk-leaning (looks-like-id) — catches
|
|
71
|
+
# name-divergent dupes like "1716801984" vs "Katie Cooper" that never
|
|
72
|
+
# block on name.
|
|
73
|
+
#
|
|
74
|
+
# OVER-MERGE GUARD: a single event can legitimately mention several
|
|
75
|
+
# distinct same-type entities (an email naming Alice, Bob, AND a
|
|
76
|
+
# numeric-ID node). Merging the whole co-occurrence group would
|
|
77
|
+
# conflate Alice and Bob. So the no-LLM tier ONLY proposes when the
|
|
78
|
+
# group has EXACTLY ONE non-junk member: we fold the junk node(s) into
|
|
79
|
+
# that unambiguous real master. Groups with 0 or >=2 non-junk members
|
|
80
|
+
# are ambiguous and deferred to the LLM-adjudicated tier
|
|
81
|
+
# (entity_resolution_v2.py) rather than auto-merged.
|
|
82
|
+
by_event_type: dict[tuple, list[dict]] = {}
|
|
83
|
+
for e in ents:
|
|
84
|
+
for ev in (e["provenance_event_ids"] or []):
|
|
85
|
+
by_event_type.setdefault((e["entity_type"], ev), []).append(e)
|
|
86
|
+
cross = []
|
|
87
|
+
seen_ids: set[str] = set()
|
|
88
|
+
for members in by_event_type.values():
|
|
89
|
+
if len(members) < 2:
|
|
90
|
+
continue
|
|
91
|
+
junk = [m for m in members if C.looks_like_id(m["canonical_name"])]
|
|
92
|
+
non_junk = [m for m in members if not C.looks_like_id(m["canonical_name"])]
|
|
93
|
+
if not junk or len(non_junk) != 1:
|
|
94
|
+
continue # need junk to clean AND exactly one unambiguous master
|
|
95
|
+
group = non_junk + junk
|
|
96
|
+
ids = tuple(sorted(m["id"] for m in group))
|
|
97
|
+
if ids in seen_ids:
|
|
98
|
+
continue
|
|
99
|
+
seen_ids.add(ids)
|
|
100
|
+
cross.append(group)
|
|
101
|
+
return exact + cross
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _candidates(group: list[dict]) -> list[C.CanonicalCandidate]:
|
|
105
|
+
return [
|
|
106
|
+
C.CanonicalCandidate(
|
|
107
|
+
entity_id=e["id"],
|
|
108
|
+
canonical_name=e["canonical_name"],
|
|
109
|
+
n_provenance=len(e["provenance_event_ids"] or []),
|
|
110
|
+
aliases=e["aliases"] or [],
|
|
111
|
+
# in_directory / grounded / from_current_teacher would be resolved
|
|
112
|
+
# from an authority table + provenance content + trace llm_model;
|
|
113
|
+
# left False here (no-LLM tier) so scoring leans on grounding-by-
|
|
114
|
+
# corroboration + the ID/bare penalties. Wire authority in a
|
|
115
|
+
# follow-up — the scoring already supports it.
|
|
116
|
+
)
|
|
117
|
+
for e in group
|
|
118
|
+
]
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _touching(cur, arena: str, loser_ids: list[str]) -> tuple[list[dict], list[dict]]:
|
|
122
|
+
cur.execute(
|
|
123
|
+
"""SELECT id, subject_entity_id, object_entity_id FROM facts
|
|
124
|
+
WHERE arena = %s AND (subject_entity_id = ANY(%s) OR object_entity_id = ANY(%s))""",
|
|
125
|
+
(arena, loser_ids, loser_ids),
|
|
126
|
+
)
|
|
127
|
+
facts = cur.fetchall()
|
|
128
|
+
cur.execute(
|
|
129
|
+
"""SELECT id, from_entity_id, to_entity_id, relationship_type, weight, provenance_event_ids
|
|
130
|
+
FROM relationships WHERE arena = %s
|
|
131
|
+
AND (from_entity_id = ANY(%s) OR to_entity_id = ANY(%s)
|
|
132
|
+
OR from_entity_id = ANY(%s) OR to_entity_id = ANY(%s))""",
|
|
133
|
+
(arena, loser_ids, loser_ids, loser_ids, loser_ids),
|
|
134
|
+
)
|
|
135
|
+
rels = cur.fetchall()
|
|
136
|
+
return facts, rels
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _execute_entity_plan(cur, plan) -> None:
|
|
140
|
+
# master row
|
|
141
|
+
cur.execute("UPDATE entities SET aliases = %s, provenance_event_ids = %s, last_seen = NOW() WHERE id = %s",
|
|
142
|
+
(plan.master_aliases, plan.master_provenance, plan.master_id))
|
|
143
|
+
for fid in plan.fact_subject_repoints:
|
|
144
|
+
cur.execute("UPDATE facts SET subject_entity_id = %s WHERE id = %s", (plan.master_id, fid))
|
|
145
|
+
for fid in plan.fact_object_repoints:
|
|
146
|
+
cur.execute("UPDATE facts SET object_entity_id = %s WHERE id = %s", (plan.master_id, fid))
|
|
147
|
+
for rid in plan.rel_endpoint_repoints:
|
|
148
|
+
cur.execute(
|
|
149
|
+
"""UPDATE relationships SET
|
|
150
|
+
from_entity_id = CASE WHEN from_entity_id = ANY(%s) THEN %s ELSE from_entity_id END,
|
|
151
|
+
to_entity_id = CASE WHEN to_entity_id = ANY(%s) THEN %s ELSE to_entity_id END
|
|
152
|
+
WHERE id = %s""",
|
|
153
|
+
(plan.deprecated_entity_ids, plan.master_id,
|
|
154
|
+
plan.deprecated_entity_ids, plan.master_id, rid),
|
|
155
|
+
)
|
|
156
|
+
for col in plan.rel_collisions:
|
|
157
|
+
cur.execute("UPDATE relationships SET weight = %s, provenance_event_ids = %s WHERE id = %s",
|
|
158
|
+
(col["summed_weight"], col["provenance"], col["keep"]))
|
|
159
|
+
cur.execute("DELETE FROM relationships WHERE id = %s", (col["drop"],))
|
|
160
|
+
for a in plan.audit_rows:
|
|
161
|
+
cur.execute(
|
|
162
|
+
"""INSERT INTO entity_merges (id, arena, canonical_id, deprecated_id,
|
|
163
|
+
deprecated_canonical_name, deprecated_aliases, merge_signal,
|
|
164
|
+
facts_repointed, rollback_payload)
|
|
165
|
+
VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s::jsonb)""",
|
|
166
|
+
("em_" + uuid.uuid4().hex[:20], a["arena"], a["canonical_id"], a["deprecated_id"],
|
|
167
|
+
a["deprecated_canonical_name"], a["deprecated_aliases"], a["merge_signal"],
|
|
168
|
+
len(plan.fact_subject_repoints) + len(plan.fact_object_repoints),
|
|
169
|
+
json.dumps(a["rollback_payload"], default=str)),
|
|
170
|
+
)
|
|
171
|
+
cur.execute("DELETE FROM entities WHERE id = ANY(%s)", (plan.deprecated_entity_ids,))
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _dedup_master_facts(cur, arena: str, master_id: str) -> int:
|
|
175
|
+
"""After repointing facts onto the master, the master can hold several
|
|
176
|
+
facts with the same (subject, predicate, object) but different statements
|
|
177
|
+
(fact id is content_id(arena, statement), so they didn't collapse on
|
|
178
|
+
insert). Fuse each such triple-group via build_fact_merge_plan: keep the
|
|
179
|
+
best, union provenance, delete dups with a fact_merges receipt."""
|
|
180
|
+
cur.execute(
|
|
181
|
+
"""SELECT id, predicate, object_entity_id, statement, confidence, provenance_event_ids
|
|
182
|
+
FROM facts
|
|
183
|
+
WHERE arena = %s AND (subject_entity_id = %s OR object_entity_id = %s)""",
|
|
184
|
+
(arena, master_id, master_id),
|
|
185
|
+
)
|
|
186
|
+
rows = cur.fetchall()
|
|
187
|
+
groups: dict[tuple, list[dict]] = {}
|
|
188
|
+
for r in rows:
|
|
189
|
+
# group key uses the master as the subject anchor + predicate + object
|
|
190
|
+
groups.setdefault((master_id, r["predicate"], r["object_entity_id"]), []).append(r)
|
|
191
|
+
deduped = 0
|
|
192
|
+
for dup in groups.values():
|
|
193
|
+
plan = build_fact_merge_plan(arena=arena, dup_facts=dup)
|
|
194
|
+
if not plan:
|
|
195
|
+
continue
|
|
196
|
+
cur.execute("UPDATE facts SET provenance_event_ids = %s WHERE id = %s",
|
|
197
|
+
(plan["master_provenance"], plan["master_id"]))
|
|
198
|
+
for a in plan["audit_rows"]:
|
|
199
|
+
cur.execute(
|
|
200
|
+
"""INSERT INTO fact_merges (id, arena, canonical_id, deprecated_id,
|
|
201
|
+
deprecated_statement, merge_signal, provenance_unioned, rollback_payload)
|
|
202
|
+
VALUES (%s,%s,%s,%s,%s,%s,%s,%s::jsonb)""",
|
|
203
|
+
("fm_" + uuid.uuid4().hex[:20], a["arena"], a["canonical_id"], a["deprecated_id"],
|
|
204
|
+
a["deprecated_statement"], a["merge_signal"], a["provenance_unioned"],
|
|
205
|
+
json.dumps(a["rollback_payload"], default=str)),
|
|
206
|
+
)
|
|
207
|
+
cur.execute("DELETE FROM facts WHERE id = ANY(%s)", (plan["deprecated_ids"],))
|
|
208
|
+
deduped += len(plan["deprecated_ids"])
|
|
209
|
+
return deduped
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def main() -> int:
    """Command-line entry point for the fusion pass.

    Lists every merge proposal for the arena; with ``--apply`` each proposal
    is executed and committed on its own. A ``fusion_drive_runs`` ledger row
    is written in both modes.

    Returns:
        ``0`` on success, ``2`` when no Postgres DSN was supplied.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--arena", required=True)
    parser.add_argument("--pg-dsn", default=os.environ.get("PG_DSN"))
    parser.add_argument("--apply", action="store_true", help="execute merges (default: dry-run)")
    opts = parser.parse_args()

    if not opts.pg_dsn:
        print("PG_DSN required", file=sys.stderr)
        return 2

    proposals = 0
    merged = 0
    with psycopg.connect(opts.pg_dsn, row_factory=dict_row) as conn:
        with conn.cursor() as cur:
            for group in _entity_dup_sets(cur, opts.arena):
                master_c, losers_c = C.pick_master(_candidates(group))
                loser_ids = [cand.entity_id for cand in losers_c]
                if not loser_ids:
                    continue
                proposals += 1

                by_id = {row["id"]: row for row in group}
                master = by_id[master_c.entity_id]
                losers = [by_id[loser_id] for loser_id in loser_ids]
                facts, rels = _touching(cur, opts.arena, loser_ids)
                plan = build_entity_merge_plan(
                    arena=opts.arena, master=master, losers=losers,
                    facts=facts, relationships=rels)

                absorbed = [loser["canonical_name"] for loser in losers]
                fact_repoints = len(plan.fact_subject_repoints) + len(plan.fact_object_repoints)
                print(f" MERGE → master '{master['canonical_name']}' ({master['id']}) "
                      f"absorbs {absorbed} "
                      f"[facts:{fact_repoints} "
                      f"rels:{len(plan.rel_endpoint_repoints)} collisions:{len(plan.rel_collisions)}]")

                if not opts.apply:
                    continue
                _execute_entity_plan(cur, plan)
                _dedup_master_facts(cur, opts.arena, master["id"])
                merged += len(loser_ids)
                # per-merge: a bad merge can't roll back the good ones, and locks stay short
                conn.commit()

            run_id = "fdr_" + uuid.uuid4().hex[:20]
            mode = "apply" if opts.apply else "dry_run"
            detail = json.dumps({"proposals": proposals, "merged": merged})
            cur.execute(
                """INSERT INTO fusion_drive_runs (id, arena, pass_kind, mode, scanned, changed, detail, finished_at)
                   VALUES (%s,%s,'fusion',%s,%s,%s,%s::jsonb,NOW())""",
                (run_id, opts.arena, mode, proposals, merged, detail),
            )
        conn.commit()

    if opts.apply:
        label = "APPLY (merged, reversible via entity_merges)"
    else:
        label = "DRY-RUN"
    print(f"[fusion-drive:fuse] {label} arena={opts.arena}: {proposals} proposal(s), {merged} entities merged")
    print(f" ledger: {run_id}")
    return 0
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
if __name__ == "__main__":
    # Use main()'s integer return value as the process exit status.
    sys.exit(main())
|