@pentatonic-ai/ai-agent-sdk 0.10.6 → 0.10.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
- package/packages/memory-engine-v2/RFC-decay-and-fusion.md +185 -0
- package/packages/memory-engine-v2/RFC-fusion-drive.md +193 -0
- package/packages/memory-engine-v2/docker-compose.aws.yml +62 -1
- package/packages/memory-engine-v2/docker-compose.yml +8 -1
- package/packages/memory-engine-v2/extractor-async/confidence.py +37 -0
- package/packages/memory-engine-v2/extractor-async/test_born_salience_parity.py +35 -0
- package/packages/memory-engine-v2/extractor-async/test_guided_json_parser.py +44 -0
- package/packages/memory-engine-v2/extractor-async/worker.py +67 -7
- package/packages/memory-engine-v2/extractor-sync/server.py +6 -2
- package/packages/memory-engine-v2/extractor-sync/test_paired_extraction.py +82 -1
- package/packages/memory-engine-v2/fusion_drive/__init__.py +0 -0
- package/packages/memory-engine-v2/fusion_drive/canonical.py +94 -0
- package/packages/memory-engine-v2/fusion_drive/conftest.py +8 -0
- package/packages/memory-engine-v2/fusion_drive/merge.py +178 -0
- package/packages/memory-engine-v2/fusion_drive/salience.py +118 -0
- package/packages/memory-engine-v2/fusion_drive/test_canonical.py +76 -0
- package/packages/memory-engine-v2/fusion_drive/test_merge.py +112 -0
- package/packages/memory-engine-v2/fusion_drive/test_salience.py +93 -0
- package/packages/memory-engine-v2/org-model/migrations/004_source_kind_code_reference.sql +12 -0
- package/packages/memory-engine-v2/org-model/migrations/005_fk_indexes.sql +20 -0
- package/packages/memory-engine-v2/org-model/migrations/006_fusion_drive.sql +80 -0
- package/packages/memory-engine-v2/scripts/fusion_drive_born_salience_backfill.py +113 -0
- package/packages/memory-engine-v2/scripts/fusion_drive_decay.py +181 -0
- package/packages/memory-engine-v2/scripts/fusion_drive_fuse.py +264 -0
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""Unit tests for Fusion Drive scored canonical selection (pure, no DB)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from canonical import CanonicalCandidate, canonical_score, pick_master, looks_like_id
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class TestLooksLikeId:
    """`looks_like_id` must flag digit-dominated names and pass ordinary ones."""

    def test_pure_digits(self):
        # A name made only of digits is the clearest ID-shaped input.
        numeric_name = "1716801984"
        assert looks_like_id(numeric_name)

    def test_real_name(self):
        # No digits at all: must not be treated as an ID.
        human_name = "Katie Cooper"
        assert not looks_like_id(human_name)

    def test_mostly_digits(self):
        # Seven digits against four letters — still expected to read as an ID.
        mixed_name = "user 90210 33"
        assert looks_like_id(mixed_name)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class TestCanonicalScore:
    """Pairwise ordering checks for `canonical_score`.

    Every test builds two candidates that differ in the signal under test and
    asserts which of the two scores strictly higher.
    """

    def test_directory_match_dominates(self):
        # A directory hit must outrank a near-miss spelling even when the
        # near-miss has far more provenance and is grounded.
        in_directory = CanonicalCandidate("e1", "Philip Mossop", in_directory=True)
        well_sourced_typo = CanonicalCandidate(
            "e2", "Phil Mossop", n_provenance=50, grounded=True
        )
        directory_score = canonical_score(in_directory)
        typo_score = canonical_score(well_sourced_typo)
        assert directory_score > typo_score

    def test_numeric_id_person_heavily_penalised(self):
        # Twenty provenance events must not rescue an all-digit name against
        # a real name seen once.
        id_shaped = CanonicalCandidate("e1", "1716801984", n_provenance=20)
        named = CanonicalCandidate("e2", "Katie Cooper", n_provenance=1)
        named_score = canonical_score(named)
        id_score = canonical_score(id_shaped)
        assert named_score > id_score

    def test_current_teacher_beats_superseded_when_otherwise_equal(self):
        # Same name on both sides; only the teacher-currency flag differs.
        current = CanonicalCandidate("e1", "Acme Corp", from_current_teacher=True)
        superseded = CanonicalCandidate("e2", "Acme Corp", from_current_teacher=False)
        current_score = canonical_score(current)
        superseded_score = canonical_score(superseded)
        assert current_score > superseded_score

    def test_hallucinated_email_penalised(self):
        # Same name on both sides; only the hallucinated-email flag differs.
        untainted = CanonicalCandidate("e1", "Sam Patel")
        tainted = CanonicalCandidate("e2", "Sam Patel", hallucinated_email=True)
        untainted_score = canonical_score(untainted)
        tainted_score = canonical_score(tainted)
        assert untainted_score > tainted_score
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class TestPickMaster:
    """Behaviour of `pick_master`, which returns a ``(master, losers)`` pair."""

    def test_the_phil_mossop_typo_case(self):
        # Regression named in the RFC: the directory-known correct spelling
        # has to be chosen over a typo row that carries more provenance.
        typo_row = CanonicalCandidate("typo", "Phil Mossop", n_provenance=40, grounded=True)
        real_row = CanonicalCandidate("real", "Philip Mossop", n_provenance=3, in_directory=True)
        master, losers = pick_master([typo_row, real_row])
        assert master.entity_id == "real"
        loser_ids = [candidate.entity_id for candidate in losers]
        assert loser_ids == ["typo"]

    def test_numeric_id_loses_to_real_name(self):
        id_row = CanonicalCandidate("idp", "1716801984", n_provenance=30)
        named_row = CanonicalCandidate("named", "Katie Cooper", n_provenance=2, grounded=True)
        master, _ = pick_master([id_row, named_row])
        assert master.entity_id == "named"

    def test_single_candidate_is_its_own_master(self):
        only = CanonicalCandidate("solo", "Solo Entity")
        master, losers = pick_master([only])
        # The very same object comes back, and nothing is left to deprecate.
        assert master is only
        assert losers == []

    def test_deterministic_tie_break(self):
        # Two candidates that differ only by id: input order must not decide.
        first = CanonicalCandidate("a", "Acme", n_provenance=5)
        second = CanonicalCandidate("b", "Acme", n_provenance=5)
        forward_master, _ = pick_master([first, second])
        reversed_master, _ = pick_master([second, first])
        assert forward_master.entity_id == reversed_master.entity_id  # order-independent

    def test_empty_raises(self):
        # pytest is imported locally because this module has no top-level
        # pytest import.
        import pytest

        no_candidates = []
        with pytest.raises(ValueError):
            pick_master(no_candidates)
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""Unit tests for Fusion Drive merge & eviction plan builders (pure, no DB)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
from merge import build_entity_merge_plan, build_fact_merge_plan, build_eviction_receipt
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _ent(id, name, aliases=None, prov=None):
|
|
10
|
+
return {"id": id, "canonical_name": name, "aliases": aliases or [], "provenance_event_ids": prov or []}
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class TestEntityMergePlan:
    """Checks on the plan returned by `build_entity_merge_plan`.

    All rows are plain dicts built with `_ent` or written inline; nothing here
    touches a database.
    """

    def test_aliases_and_provenance_union(self):
        keeper = _ent("m", "Philip Mossop", aliases=["P. Mossop"], prov=["e1"])
        dupe = _ent("l", "Phil Mossop", aliases=["phil"], prov=["e2", "e1"])
        plan = build_entity_merge_plan(
            arena="a", master=keeper, losers=[dupe], facts=[], relationships=[]
        )
        # The loser's canonical name and its aliases fold into the master's
        # aliases; the master's own canonical name is not an alias of itself.
        for folded_alias in ("Phil Mossop", "phil"):
            assert folded_alias in plan.master_aliases
        assert "Philip Mossop" not in plan.master_aliases
        # "e1" is on both rows and must appear once in the union.
        assert sorted(plan.master_provenance) == ["e1", "e2"]
        assert plan.deprecated_entity_ids == ["l"]

    def test_fact_subject_and_object_repoints(self):
        keeper = _ent("m", "Acme")
        dupe = _ent("l", "ACME Inc")
        subject_hit = {"id": "f1", "subject_entity_id": "l", "object_entity_id": None}
        object_hit = {"id": "f2", "subject_entity_id": "x", "object_entity_id": "l"}
        unrelated = {"id": "f3", "subject_entity_id": "x", "object_entity_id": "y"}  # untouched
        plan = build_entity_merge_plan(
            arena="a",
            master=keeper,
            losers=[dupe],
            facts=[subject_hit, object_hit, unrelated],
            relationships=[],
        )
        assert plan.fact_subject_repoints == ["f1"]
        assert plan.fact_object_repoints == ["f2"]

    def test_relationship_repoint_without_collision(self):
        keeper = _ent("m", "Bob")
        dupe = _ent("l", "Bobby")
        loser_edge = {
            "id": "r1",
            "from_entity_id": "l",
            "to_entity_id": "z",
            "relationship_type": "works_for",
            "weight": 2.0,
            "provenance_event_ids": ["e1"],
        }
        plan = build_entity_merge_plan(
            arena="a", master=keeper, losers=[dupe], facts=[], relationships=[loser_edge]
        )
        # The master has no edge of its own, so the loser's edge is simply
        # repointed and nothing collides.
        assert plan.rel_endpoint_repoints == ["r1"]
        assert plan.rel_collisions == []

    def test_relationship_collision_sums_weight_and_unions_provenance(self):
        # Master already holds (m -> z, works_for) and the loser holds
        # (l -> z, works_for). Repointing l -> m would duplicate that edge, so
        # one edge is kept, weights are summed, provenance is unioned and the
        # other edge is dropped.
        keeper = _ent("m", "Bob")
        dupe = _ent("l", "Bobby")
        master_edge = {
            "id": "r_master",
            "from_entity_id": "m",
            "to_entity_id": "z",
            "relationship_type": "works_for",
            "weight": 3.0,
            "provenance_event_ids": ["e1"],
        }
        loser_edge = {
            "id": "r_loser",
            "from_entity_id": "l",
            "to_entity_id": "z",
            "relationship_type": "works_for",
            "weight": 2.0,
            "provenance_event_ids": ["e2"],
        }
        plan = build_entity_merge_plan(
            arena="a",
            master=keeper,
            losers=[dupe],
            facts=[],
            relationships=[master_edge, loser_edge],
        )
        assert len(plan.rel_collisions) == 1
        collision = plan.rel_collisions[0]
        assert collision["keep"] == "r_master"
        assert collision["drop"] == "r_loser"
        assert collision["summed_weight"] == 5.0
        assert sorted(collision["provenance"]) == ["e1", "e2"]

    def test_audit_rows_carry_rollback_payload(self):
        keeper = _ent("m", "Real")
        dupe = _ent("l", "Dupe", prov=["e9"])
        plan = build_entity_merge_plan(
            arena="a", master=keeper, losers=[dupe], facts=[], relationships=[]
        )
        assert len(plan.audit_rows) == 1
        audit = plan.audit_rows[0]
        assert audit["canonical_id"] == "m"
        assert audit["deprecated_id"] == "l"
        assert audit["rollback_payload"] == dupe  # full row preserved

    def test_master_cannot_be_loser(self):
        # Passing the master itself in `losers` must be rejected.
        entity = _ent("m", "X")
        with pytest.raises(ValueError):
            build_entity_merge_plan(
                arena="a", master=entity, losers=[entity], facts=[], relationships=[]
            )

    def test_multi_loser_merge(self):
        keeper = _ent("m", "Katie Cooper", prov=["e1"])
        dupes = [
            _ent("l1", "1716801984", prov=["e2"]),
            _ent("l2", "K. Cooper", prov=["e3"]),
        ]
        plan = build_entity_merge_plan(
            arena="a", master=keeper, losers=dupes, facts=[], relationships=[]
        )
        assert plan.deprecated_entity_ids == ["l1", "l2"]
        assert sorted(plan.master_provenance) == ["e1", "e2", "e3"]
        # One audit row per deprecated entity.
        assert len(plan.audit_rows) == 2
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class TestFactMergePlan:
    """Checks on `build_fact_merge_plan`, which returns a dict plan or ``None``."""

    def test_picks_highest_confidence_master(self):
        weaker = {"id": "f1", "confidence": 0.5, "statement": "short", "provenance_event_ids": ["e1"]}
        stronger = {
            "id": "f2",
            "confidence": 0.9,
            "statement": "the better one",
            "provenance_event_ids": ["e2"],
        }
        plan = build_fact_merge_plan(arena="a", dup_facts=[weaker, stronger])
        assert plan["master_id"] == "f2"
        assert plan["deprecated_ids"] == ["f1"]
        # Provenance from both duplicates ends up on the master.
        assert sorted(plan["master_provenance"]) == ["e1", "e2"]

    def test_single_fact_no_merge(self):
        # A lone fact has nothing to merge with, so no plan is produced.
        lone_fact = {"id": "f1"}
        plan = build_fact_merge_plan(arena="a", dup_facts=[lone_fact])
        assert plan is None

    def test_tie_breaks_on_statement_length(self):
        # Equal confidence: the fact with the longer statement should win.
        terse = {"id": "f1", "confidence": 0.7, "statement": "x", "provenance_event_ids": []}
        verbose = {
            "id": "f2",
            "confidence": 0.7,
            "statement": "longer statement",
            "provenance_event_ids": [],
        }
        plan = build_fact_merge_plan(arena="a", dup_facts=[terse, verbose])
        assert plan["master_id"] == "f2"
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class TestEvictionReceipt:
    """Checks on the receipt dict built by `build_eviction_receipt`."""

    def test_carries_full_row(self):
        evicted_row = {"id": "e1", "arena": "a", "canonical_name": "Ghost"}
        receipt = build_eviction_receipt("entity", evicted_row)
        assert receipt["node_kind"] == "entity"
        assert receipt["node_id"] == "e1"
        # The whole row is kept so the eviction can be undone.
        assert receipt["rollback_payload"] == evicted_row
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""Unit tests for Fusion Drive salience scoring + decay (pure, no DB)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import salience as S
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class TestBornSalience:
    """Checks on `S.born_salience`: base value, corroboration cap, junk flags, bounds."""

    def test_base_for_single_uncorroborated_clean_node(self):
        # One source and no flags: exactly the base value.
        seeded = S.born_salience(n_sources=1)
        assert seeded == S.BASE_SALIENCE

    def test_corroboration_raises_but_caps(self):
        two_sources = S.born_salience(n_sources=2)
        assert two_sources > S.BASE_SALIENCE
        many_sources = S.born_salience(n_sources=100)
        ceiling_for_clean = round(S.BASE_SALIENCE + S.CORROB_CAP, 4)
        assert many_sources == ceiling_for_clean

    def test_junk_is_born_below_decay_sweep_threshold(self):
        # Each hard-junk flag, on its own, has to push the node under the 0.3
        # sweep line, so decay can target pollution without a fusion match.
        hard_junk_flags = ("noise_name", "numeric_id_person", "hallucinated_email")
        for flag in hard_junk_flags:
            assert S.born_salience(n_sources=1, quality_flags=[flag]) < 0.3, flag

    def test_combined_flags_drive_to_floor(self):
        stacked_flags = ["numeric_id_person", "hallucinated_email", "ungrounded"]
        seeded = S.born_salience(n_sources=1, quality_flags=stacked_flags)
        assert seeded == S.SALIENCE_FLOOR

    def test_corroboration_cannot_rescue_hard_junk(self):
        # Five sources on a numeric-ID person must still land below base.
        seeded = S.born_salience(n_sources=5, quality_flags=["numeric_id_person"])
        assert seeded < S.BASE_SALIENCE

    def test_never_exceeds_ceiling_or_below_floor(self):
        heavily_corroborated = S.born_salience(n_sources=1000)
        assert heavily_corroborated <= S.SALIENCE_CEIL
        # Ten copies of the same flag must clamp at the floor, not go under it.
        repeated_flags = ["noise_name"] * 10
        heavily_flagged = S.born_salience(n_sources=1, quality_flags=repeated_flags)
        assert heavily_flagged == S.SALIENCE_FLOOR
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class TestHalfLife:
    """Checks on `S.half_life_days` per node kind and fact category."""

    def test_durable_categories_barely_decay(self):
        # Decisions and commitments need a half-life of at least 3650 days.
        for category in ("decision", "commitment"):
            assert S.half_life_days("fact", category) >= 3650

    def test_ephemeral_categories_fade_fast(self):
        # Mentions and observations need a half-life of at most 30 days.
        for category in ("mention", "observation"):
            assert S.half_life_days("fact", category) <= 30

    def test_unknown_category_uses_default(self):
        fallback = S.half_life_days("fact", "made_up")
        assert fallback == S.FACT_HALF_LIFE_DEFAULT

    def test_entity_and_relationship_kinds(self):
        # Non-fact kinds take no category and map straight to their constants.
        entity_hl = S.half_life_days("entity")
        assert entity_hl == S.ENTITY_HALF_LIFE_DAYS
        relationship_hl = S.half_life_days("relationship")
        assert relationship_hl == S.RELATIONSHIP_HALF_LIFE_DAYS
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class TestDecay:
    """Checks on `S.decayed_salience` against hand-computed half-life values."""

    def test_one_half_life_halves_salience(self):
        # age == half-life: 0.8 -> 0.4
        expected = round(0.4, 4)
        decayed = S.decayed_salience(0.8, age_days=30, hl_days=30)
        assert decayed == expected

    def test_two_half_lives_quarters(self):
        # age == 2 x half-life: 0.8 -> 0.2
        expected = round(0.2, 4)
        decayed = S.decayed_salience(0.8, age_days=60, hl_days=30)
        assert decayed == expected

    def test_no_age_no_decay(self):
        untouched = S.decayed_salience(0.6, age_days=0, hl_days=30)
        assert untouched == 0.6

    def test_durable_fact_after_a_year_barely_moves(self):
        decision_hl = S.half_life_days("fact", "decision")
        after_a_year = S.decayed_salience(0.7, age_days=365, hl_days=decision_hl)
        assert after_a_year > 0.65

    def test_mention_after_three_months_is_near_floor(self):
        # NOTE(review): the original comment gives this half-life as 30d —
        # confirm against salience.py.
        mention_hl = S.half_life_days("fact", "mention")
        after_three_months = S.decayed_salience(0.5, age_days=90, hl_days=mention_hl)
        assert after_three_months < 0.1
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class TestEvictable:
    """Checks on `S.is_evictable` across salience, age, reference and disclosure."""

    def test_low_salience_old_unreferenced_is_evictable(self):
        verdict = S.is_evictable(
            current_salience=0.02, age_days=60, referenced_by_live_node=False
        )
        assert verdict

    def test_referenced_node_survives(self):
        # A live reference protects the node even at zero salience.
        verdict = S.is_evictable(
            current_salience=0.0, age_days=999, referenced_by_live_node=True
        )
        assert not verdict

    def test_restricted_never_auto_evicted(self):
        # Otherwise identical to an evictable node, but marked "restricted".
        verdict = S.is_evictable(
            current_salience=0.0,
            age_days=999,
            referenced_by_live_node=False,
            disclosure_class="restricted",
        )
        assert not verdict

    def test_recent_low_salience_not_yet_evictable(self):
        # Salience is under the threshold, but the node is too young: the
        # grace window keeps it.
        verdict = S.is_evictable(
            current_salience=0.01, age_days=5, referenced_by_live_node=False
        )
        assert not verdict

    def test_healthy_salience_never_evictable(self):
        # Age alone is not enough when salience is healthy.
        verdict = S.is_evictable(
            current_salience=0.5, age_days=9999, referenced_by_live_node=False
        )
        assert not verdict
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
-- 004: accept 'code_reference' source events (SDK corpus ingest).
--
-- The SDK corpus module (packages/memory/src/corpus/) emits events with
-- source_kind='code_reference' (code-signature ingest, adapters.js).
-- The enum predates that feature, so those events bounced with
-- InvalidTextRepresentation and could never be stored — observed in
-- prod 2026-06-11 as persistent /extract 500s + producer retry loops.
--
-- Before PostgreSQL 12, ALTER TYPE ... ADD VALUE cannot run inside a
-- transaction block at all; from 12 onwards it can, but the new value is
-- not usable until that transaction has committed. Either way, apply with
-- autocommit (psql's default per-statement behaviour).
-- IF NOT EXISTS makes a re-run a no-op when the value is already present.
-- Applied manually to prod (pme2-org-model) on 2026-06-11.
ALTER TYPE source_kind ADD VALUE IF NOT EXISTS 'code_reference';
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
-- 005: index every column that references events(id).
--
-- Four constraints point at events:
--   distillation_queue.event_id   ON DELETE CASCADE
--   vector_provenance.event_id    ON DELETE CASCADE
--   distillation_traces.event_id  ON DELETE CASCADE
--   events.forgets (self)         ON DELETE SET NULL
--
-- Postgres does NOT create an index on the referencing side of a foreign
-- key. Without one, every DELETE on events seq-scans each referencing
-- table once per deleted row to enforce the constraint. The 2026-06-11
-- arena-scoped nuke of ~70k events ran for HOURS until the missing indexes
-- were created on-box.
--
-- All statements are idempotent (IF NOT EXISTS); applied manually to prod
-- (pme2-org-model) 2026-06-12.

-- Already created by 003 as idx_distillation_event_id; repeated here only
-- so this file lists the complete set.
CREATE INDEX IF NOT EXISTS idx_distillation_event_id
    ON distillation_queue (event_id);

-- distillation_traces.event_id (ON DELETE CASCADE).
CREATE INDEX IF NOT EXISTS idx_traces_event_id
    ON distillation_traces (event_id);

-- vector_provenance.event_id (ON DELETE CASCADE).
CREATE INDEX IF NOT EXISTS idx_vector_provenance_event_id
    ON vector_provenance (event_id);

-- events.forgets, the self-reference (ON DELETE SET NULL).
CREATE INDEX IF NOT EXISTS idx_events_forgets
    ON events (forgets);
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
-- 006: Fusion Drive foundations — salience + audit/ledger tables.
--
-- See RFC-fusion-drive.md. This is Phase 1 (scoring only, NO eviction): it
-- adds the columns the decay pass scores against, and the audit/ledger
-- tables that fusion and decay write to. Nothing in this migration deletes
-- or evicts anything; the decay pass ships dry-run first.
--
-- KEY DESIGN CALL: salience is SEPARATE from confidence.
--   * `confidence` = "how corroborated/true is this fact". It must only
--     ever move up with corroboration — decaying it would lie about truth.
--   * `salience`   = "retention priority". It decays with time and rises
--     with access.
-- Eviction (a later phase) keys on salience, never on confidence.
--
-- Every statement is idempotent (IF NOT EXISTS) — safe to re-run.

-- ---------------------------------------------------------------------
-- Retention scoring columns.
-- salience defaults to 0.5 (neutral); real values are set by the decay
-- pass and by the extractor's born-salience seeding.
-- ---------------------------------------------------------------------
ALTER TABLE entities      ADD COLUMN IF NOT EXISTS salience REAL NOT NULL DEFAULT 0.5;
ALTER TABLE facts         ADD COLUMN IF NOT EXISTS salience REAL NOT NULL DEFAULT 0.5;
ALTER TABLE relationships ADD COLUMN IF NOT EXISTS salience REAL NOT NULL DEFAULT 0.5;

-- last_accessed is bumped by /search whenever a node is returned (this
-- keeps used memories alive); NULL means the node was never retrieved.
ALTER TABLE entities      ADD COLUMN IF NOT EXISTS last_accessed TIMESTAMPTZ;
ALTER TABLE facts         ADD COLUMN IF NOT EXISTS last_accessed TIMESTAMPTZ;
ALTER TABLE relationships ADD COLUMN IF NOT EXISTS last_accessed TIMESTAMPTZ;

-- Partial indexes for the decay sweep, which scans low-salience nodes only.
CREATE INDEX IF NOT EXISTS idx_entities_salience
    ON entities (arena, salience)
    WHERE salience < 0.3;
CREATE INDEX IF NOT EXISTS idx_facts_salience
    ON facts (arena, salience)
    WHERE salience < 0.3;
CREATE INDEX IF NOT EXISTS idx_relationships_salience
    ON relationships (arena, salience)
    WHERE salience < 0.3;

-- ---------------------------------------------------------------------
-- fact_merges — the fact-fusion mirror of entity_merges (002). Same
-- receipt + rollback substrate: the losing fact row is deleted on merge,
-- and this table records what absorbed it plus enough to recreate it.
-- ---------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS fact_merges (
    id                   TEXT        PRIMARY KEY,
    arena                TEXT        NOT NULL,
    canonical_id         TEXT        NOT NULL REFERENCES facts(id) ON DELETE CASCADE,
    deprecated_id        TEXT        NOT NULL,  -- no FK; row is deleted
    deprecated_statement TEXT        NOT NULL,  -- preserve for forensics
    merge_signal         TEXT        NOT NULL CHECK (
        merge_signal IN ('exact_triple', 'statement_embedding', 'llm_adjudication')
    ),
    provenance_unioned   INTEGER     NOT NULL DEFAULT 0,
    rollback_payload     JSONB       NOT NULL,
    merged_at            TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_fact_merges_canonical
    ON fact_merges (canonical_id);

-- ---------------------------------------------------------------------
-- fusion_drive_runs — run ledger / observability. Each pass (fusion or
-- decay) writes one row: what it scanned, what it changed, and whether it
-- was a dry-run.
-- ---------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS fusion_drive_runs (
    id          TEXT        PRIMARY KEY,
    arena       TEXT        NOT NULL,
    pass_kind   TEXT        NOT NULL CHECK (pass_kind IN ('fusion', 'decay')),
    mode        TEXT        NOT NULL CHECK (mode IN ('dry_run', 'apply')),
    scanned     INTEGER     NOT NULL DEFAULT 0,
    changed     INTEGER     NOT NULL DEFAULT 0,  -- merged (fusion) / evicted (decay)
    detail      JSONB       NOT NULL DEFAULT '{}'::jsonb,
    started_at  TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    finished_at TIMESTAMPTZ
);
CREATE INDEX IF NOT EXISTS idx_fusion_drive_runs_arena
    ON fusion_drive_runs (arena, started_at DESC);

-- ---------------------------------------------------------------------
-- node_evictions — rollback receipts for decay eviction. The decay pass
-- deletes only nodes its salience math already classified as evictable
-- (low salience, aged past the grace floor, unreferenced, non-restricted).
-- The full deleted row is stored here so an eviction can be undone.
-- Mirrors the merge-audit pattern (002 / fact_merges).
-- ---------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS node_evictions (
    id                TEXT        PRIMARY KEY,
    arena             TEXT        NOT NULL,
    node_kind         TEXT        NOT NULL CHECK (node_kind IN ('entity', 'fact', 'relationship')),
    node_id           TEXT        NOT NULL,
    salience_at_evict REAL,
    rollback_payload  JSONB       NOT NULL,
    evicted_at        TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_node_evictions_arena
    ON node_evictions (arena, evicted_at DESC);
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Fusion Drive — born-salience backfill for EXISTING rows.
|
|
3
|
+
|
|
4
|
+
Migration 006 defaults every pre-existing entity/fact to salience 0.5,
|
|
5
|
+
including accumulated 7B-era junk (e.g. the 87k pip-agents events). At 0.5 a
|
|
6
|
+
junk entity would take ~3+ years to decay under the entity half-life — so the
|
|
7
|
+
"junk self-evicts" cure is inert for existing data until their salience is
|
|
8
|
+
re-seeded from the same quality flags the worker now stamps at insert.
|
|
9
|
+
|
|
10
|
+
This pass recomputes born-salience for existing entities + facts in an arena
|
|
11
|
+
and writes it back, so the decay pass can act on historical pollution. It only
|
|
12
|
+
LOWERS salience where the quality flags fire (never raises a node above what
|
|
13
|
+
it already has — re-corroboration, not this pass, raises salience). Reads the
|
|
14
|
+
same digit-ratio / subject-undeclared signals the worker uses; deeper signals
|
|
15
|
+
(ungrounded vs source content) are a follow-up.
|
|
16
|
+
|
|
17
|
+
Arena-scoped (required), dry-run default, --apply to write.
|
|
18
|
+
|
|
19
|
+
Usage:
|
|
20
|
+
fusion_drive_born_salience_backfill.py --arena 'X' # report
|
|
21
|
+
fusion_drive_born_salience_backfill.py --arena 'X' --apply # write salience
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import argparse
|
|
27
|
+
import os
|
|
28
|
+
import sys
|
|
29
|
+
|
|
30
|
+
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "fusion_drive"))
|
|
31
|
+
import salience as S # noqa: E402
|
|
32
|
+
|
|
33
|
+
try:
|
|
34
|
+
import psycopg
|
|
35
|
+
from psycopg.rows import dict_row
|
|
36
|
+
except ModuleNotFoundError:
|
|
37
|
+
print("psycopg required", file=sys.stderr)
|
|
38
|
+
raise
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _digit_ratio(s: str) -> float:
|
|
42
|
+
stripped = "".join((s or "").split())
|
|
43
|
+
return sum(c.isdigit() for c in stripped) / len(stripped) if stripped else 0.0
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# The salience columns are REAL (float32). Read back as Python floats they
# carry widening noise (a stored 0.2 arrives as ~0.2000000030), so a plain
# `new < stored` check is true again on every re-run for rows this pass has
# already lowered. Gaps smaller than this are storage noise, not a lowering.
_SALIENCE_EPS = 1e-6


def _entity_quality_flags(row) -> list[str]:
    """Return the born-salience quality flags that fire for one entity row."""
    flags = []
    # A "person" whose name is mostly digits is an ID that was extracted as a name.
    if row["entity_type"] == "person" and _digit_ratio(row["canonical_name"]) > 0.5:
        flags.append("numeric_id_person")
    return flags


def _fact_quality_flags(row) -> list[str]:
    """Return the born-salience quality flags that fire for one fact row."""
    flags = []
    if row["subject_entity_id"] is None:
        flags.append("subject_undeclared")
    if len(row["statement"] or "") < 60:
        flags.append("low_signal")
    return flags


def _lowered_salience(row, flags):
    """Return the re-seeded salience for *row* if it is genuinely lower, else None.

    Rows with no flags are left alone. The source count is the number of
    provenance event ids, with a minimum of 1.
    """
    if not flags:
        return None
    n_sources = len(row["provenance_event_ids"] or []) or 1
    candidate = S.born_salience(n_sources=n_sources, quality_flags=flags)
    # Never raise: only accept a value that sits below the stored one by more
    # than float32 round-trip noise.
    if row["salience"] - candidate > _SALIENCE_EPS:
        return candidate
    return None


def main() -> int:
    """Re-seed born-salience for flagged entities and facts in one arena.

    Command-line options: ``--arena`` (required), ``--pg-dsn`` (defaults to
    the ``PG_DSN`` environment variable) and ``--apply``. Without ``--apply``
    no UPDATE is issued; the pass only counts what it would lower.

    Returns:
        ``2`` when no DSN was supplied, otherwise ``0``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--arena", required=True)
    ap.add_argument("--pg-dsn", default=os.environ.get("PG_DSN"))
    ap.add_argument("--apply", action="store_true")
    args = ap.parse_args()
    if not args.pg_dsn:
        print("PG_DSN required", file=sys.stderr)
        return 2

    ent_lowered = fact_lowered = ent_scanned = fact_scanned = 0
    with psycopg.connect(args.pg_dsn, row_factory=dict_row) as conn:
        # `cur` reads, `w` writes; the write cursor is reused for every UPDATE
        # instead of being opened once per row.
        with conn.cursor() as cur, conn.cursor() as w:
            # entities: numeric-ID-as-person born low
            cur.execute(
                "SELECT id, entity_type, canonical_name, salience, provenance_event_ids "
                "FROM entities WHERE arena = %s", (args.arena,))
            for e in cur.fetchall():
                ent_scanned += 1
                new_sal = _lowered_salience(e, _entity_quality_flags(e))
                if new_sal is None:
                    continue
                ent_lowered += 1
                if args.apply:
                    w.execute("UPDATE entities SET salience = %s WHERE id = %s",
                              (new_sal, e["id"]))

            # facts: subject-undeclared / low-signal born low
            cur.execute(
                "SELECT id, subject_entity_id, statement, salience, provenance_event_ids "
                "FROM facts WHERE arena = %s", (args.arena,))
            for f in cur.fetchall():
                fact_scanned += 1
                new_sal = _lowered_salience(f, _fact_quality_flags(f))
                if new_sal is None:
                    continue
                fact_lowered += 1
                if args.apply:
                    w.execute("UPDATE facts SET salience = %s WHERE id = %s",
                              (new_sal, f["id"]))
        if args.apply:
            conn.commit()

    mode = "APPLY" if args.apply else "DRY-RUN"
    print(f"[fusion-drive:born-salience-backfill] {mode} arena={args.arena}")
    print(f" entities: scanned={ent_scanned} lowered={ent_lowered}")
    print(f" facts: scanned={fact_scanned} lowered={fact_lowered}")
    if not args.apply:
        print(" (dry-run — run --apply to write. Lowers junk salience only; never raises.)")
    return 0
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
# Script entry point: run the backfill and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
|