@pentatonic-ai/ai-agent-sdk 0.10.6 → 0.10.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27) hide show
  1. package/dist/index.cjs +1 -1
  2. package/dist/index.js +1 -1
  3. package/package.json +1 -1
  4. package/packages/memory-engine-v2/RFC-decay-and-fusion.md +185 -0
  5. package/packages/memory-engine-v2/RFC-fusion-drive.md +193 -0
  6. package/packages/memory-engine-v2/docker-compose.aws.yml +62 -1
  7. package/packages/memory-engine-v2/docker-compose.yml +8 -1
  8. package/packages/memory-engine-v2/extractor-async/confidence.py +37 -0
  9. package/packages/memory-engine-v2/extractor-async/test_born_salience_parity.py +35 -0
  10. package/packages/memory-engine-v2/extractor-async/test_guided_json_parser.py +44 -0
  11. package/packages/memory-engine-v2/extractor-async/worker.py +67 -7
  12. package/packages/memory-engine-v2/extractor-sync/server.py +6 -2
  13. package/packages/memory-engine-v2/extractor-sync/test_paired_extraction.py +82 -1
  14. package/packages/memory-engine-v2/fusion_drive/__init__.py +0 -0
  15. package/packages/memory-engine-v2/fusion_drive/canonical.py +94 -0
  16. package/packages/memory-engine-v2/fusion_drive/conftest.py +8 -0
  17. package/packages/memory-engine-v2/fusion_drive/merge.py +178 -0
  18. package/packages/memory-engine-v2/fusion_drive/salience.py +118 -0
  19. package/packages/memory-engine-v2/fusion_drive/test_canonical.py +76 -0
  20. package/packages/memory-engine-v2/fusion_drive/test_merge.py +112 -0
  21. package/packages/memory-engine-v2/fusion_drive/test_salience.py +93 -0
  22. package/packages/memory-engine-v2/org-model/migrations/004_source_kind_code_reference.sql +12 -0
  23. package/packages/memory-engine-v2/org-model/migrations/005_fk_indexes.sql +20 -0
  24. package/packages/memory-engine-v2/org-model/migrations/006_fusion_drive.sql +80 -0
  25. package/packages/memory-engine-v2/scripts/fusion_drive_born_salience_backfill.py +113 -0
  26. package/packages/memory-engine-v2/scripts/fusion_drive_decay.py +181 -0
  27. package/packages/memory-engine-v2/scripts/fusion_drive_fuse.py +264 -0
@@ -0,0 +1,76 @@
1
+ """Unit tests for Fusion Drive scored canonical selection (pure, no DB)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from canonical import CanonicalCandidate, canonical_score, pick_master, looks_like_id
6
+
7
+
8
class TestLooksLikeId:
    """`looks_like_id` accepts digit-heavy strings and rejects a real name."""

    def test_pure_digits(self):
        # An all-digit string is the most obvious ID-shaped name.
        verdict = looks_like_id("1716801984")
        assert verdict

    def test_real_name(self):
        verdict = looks_like_id("Katie Cooper")
        assert not verdict

    def test_mostly_digits(self):
        # Seven digits against four letters once whitespace is ignored.
        verdict = looks_like_id("user 90210 33")
        assert verdict
17
+
18
+
19
class TestCanonicalScore:
    """Pairwise ordering expectations for `canonical_score`."""

    def test_directory_match_dominates(self):
        # Directory membership alone must outrank a typo row that has far
        # more provenance and is grounded.
        directoried = CanonicalCandidate("e1", "Philip Mossop", in_directory=True)
        rich_typo = CanonicalCandidate(
            "e2", "Phil Mossop", n_provenance=50, grounded=True
        )
        winner_score = canonical_score(directoried)
        loser_score = canonical_score(rich_typo)
        assert winner_score > loser_score

    def test_numeric_id_person_heavily_penalised(self):
        # Twenty provenance events must not rescue an ID-shaped name.
        id_person = CanonicalCandidate("e1", "1716801984", n_provenance=20)
        real_person = CanonicalCandidate("e2", "Katie Cooper", n_provenance=1)
        winner_score = canonical_score(real_person)
        loser_score = canonical_score(id_person)
        assert winner_score > loser_score

    def test_current_teacher_beats_superseded_when_otherwise_equal(self):
        # Identical candidates except for the from_current_teacher flag.
        current = CanonicalCandidate("e1", "Acme Corp", from_current_teacher=True)
        superseded = CanonicalCandidate("e2", "Acme Corp", from_current_teacher=False)
        winner_score = canonical_score(current)
        loser_score = canonical_score(superseded)
        assert winner_score > loser_score

    def test_hallucinated_email_penalised(self):
        # Identical candidates except for the hallucinated_email flag.
        clean = CanonicalCandidate("e1", "Sam Patel")
        hallucinated = CanonicalCandidate("e2", "Sam Patel", hallucinated_email=True)
        winner_score = canonical_score(clean)
        loser_score = canonical_score(hallucinated)
        assert winner_score > loser_score
39
+
40
+
41
class TestPickMaster:
    """`pick_master` splits a candidate list into one master plus losers."""

    def test_the_phil_mossop_typo_case(self):
        # The exact regression the RFC calls out: the directory-known correct
        # spelling must win over a richer typo row.
        typo_row = CanonicalCandidate("typo", "Phil Mossop", n_provenance=40, grounded=True)
        real_row = CanonicalCandidate("real", "Philip Mossop", n_provenance=3, in_directory=True)
        master, losers = pick_master([typo_row, real_row])
        assert master.entity_id == "real"
        loser_ids = [cand.entity_id for cand in losers]
        assert loser_ids == ["typo"]

    def test_numeric_id_loses_to_real_name(self):
        id_row = CanonicalCandidate("idp", "1716801984", n_provenance=30)
        named_row = CanonicalCandidate("named", "Katie Cooper", n_provenance=2, grounded=True)
        master, _ = pick_master([id_row, named_row])
        assert master.entity_id == "named"

    def test_single_candidate_is_its_own_master(self):
        solo = CanonicalCandidate("solo", "Solo Entity")
        master, losers = pick_master([solo])
        # The very same object comes back, and nothing is left to deprecate.
        assert master is solo
        assert losers == []

    def test_deterministic_tie_break(self):
        # Two candidates that differ only in entity_id, presented in both orders.
        first = CanonicalCandidate("a", "Acme", n_provenance=5)
        second = CanonicalCandidate("b", "Acme", n_provenance=5)
        forward_master, _ = pick_master([first, second])
        reverse_master, _ = pick_master([second, first])
        assert forward_master.entity_id == reverse_master.entity_id  # order-independent

    def test_empty_raises(self):
        # pytest is imported locally; this module has no top-level pytest import.
        import pytest

        with pytest.raises(ValueError):
            pick_master([])
@@ -0,0 +1,112 @@
1
+ """Unit tests for Fusion Drive merge & eviction plan builders (pure, no DB)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pytest
6
+ from merge import build_entity_merge_plan, build_fact_merge_plan, build_eviction_receipt
7
+
8
+
9
def _ent(id, name, aliases=None, prov=None):
    """Build a minimal entity row dict for the merge-plan tests.

    Falsy ``aliases`` / ``prov`` (including ``None``) become a fresh empty list.
    """
    row = dict(id=id, canonical_name=name)
    row["aliases"] = aliases if aliases else []
    row["provenance_event_ids"] = prov if prov else []
    return row
11
+
12
+
13
class TestEntityMergePlan:
    """Checks on the plan `build_entity_merge_plan` produces for a merge."""

    @staticmethod
    def _plan(master, losers, facts=None, relationships=None):
        # Every test uses arena "a"; only the graph inputs vary.
        return build_entity_merge_plan(
            arena="a",
            master=master,
            losers=losers,
            facts=[] if facts is None else facts,
            relationships=[] if relationships is None else relationships,
        )

    def test_aliases_and_provenance_union(self):
        master = _ent("m", "Philip Mossop", aliases=["P. Mossop"], prov=["e1"])
        loser = _ent("l", "Phil Mossop", aliases=["phil"], prov=["e2", "e1"])
        plan = self._plan(master, [loser])
        # The loser's canonical name and aliases fold into the master's
        # aliases; the master's own name is excluded.
        assert "Phil Mossop" in plan.master_aliases
        assert "phil" in plan.master_aliases
        assert "Philip Mossop" not in plan.master_aliases
        # Provenance is a de-duplicated union ("e1" appears on both sides).
        merged_provenance = sorted(plan.master_provenance)
        assert merged_provenance == ["e1", "e2"]
        assert plan.deprecated_entity_ids == ["l"]

    def test_fact_subject_and_object_repoints(self):
        master = _ent("m", "Acme")
        loser = _ent("l", "ACME Inc")
        loser_as_subject = {"id": "f1", "subject_entity_id": "l", "object_entity_id": None}
        loser_as_object = {"id": "f2", "subject_entity_id": "x", "object_entity_id": "l"}
        unrelated = {"id": "f3", "subject_entity_id": "x", "object_entity_id": "y"}  # untouched
        plan = self._plan(
            master, [loser], facts=[loser_as_subject, loser_as_object, unrelated]
        )
        assert plan.fact_subject_repoints == ["f1"]
        assert plan.fact_object_repoints == ["f2"]

    def test_relationship_repoint_without_collision(self):
        master = _ent("m", "Bob")
        loser = _ent("l", "Bobby")
        loser_edge = {
            "id": "r1",
            "from_entity_id": "l",
            "to_entity_id": "z",
            "relationship_type": "works_for",
            "weight": 2.0,
            "provenance_event_ids": ["e1"],
        }
        plan = self._plan(master, [loser], relationships=[loser_edge])
        assert plan.rel_endpoint_repoints == ["r1"]
        assert plan.rel_collisions == []

    def test_relationship_collision_sums_weight_and_unions_provenance(self):
        # The master already has (m -> z, works_for); the loser has
        # (l -> z, works_for). Repointing l -> m collides: keep one edge,
        # sum the weights, union the provenance, drop the other.
        master = _ent("m", "Bob")
        loser = _ent("l", "Bobby")
        master_edge = {
            "id": "r_master",
            "from_entity_id": "m",
            "to_entity_id": "z",
            "relationship_type": "works_for",
            "weight": 3.0,
            "provenance_event_ids": ["e1"],
        }
        loser_edge = {
            "id": "r_loser",
            "from_entity_id": "l",
            "to_entity_id": "z",
            "relationship_type": "works_for",
            "weight": 2.0,
            "provenance_event_ids": ["e2"],
        }
        plan = self._plan(master, [loser], relationships=[master_edge, loser_edge])
        assert len(plan.rel_collisions) == 1
        collision = plan.rel_collisions[0]
        assert collision["keep"] == "r_master"
        assert collision["drop"] == "r_loser"
        assert collision["summed_weight"] == 5.0
        assert sorted(collision["provenance"]) == ["e1", "e2"]

    def test_audit_rows_carry_rollback_payload(self):
        master = _ent("m", "Real")
        loser = _ent("l", "Dupe", prov=["e9"])
        plan = self._plan(master, [loser])
        assert len(plan.audit_rows) == 1
        audit = plan.audit_rows[0]
        assert audit["canonical_id"] == "m"
        assert audit["deprecated_id"] == "l"
        assert audit["rollback_payload"] == loser  # full row preserved

    def test_master_cannot_be_loser(self):
        same_row = _ent("m", "X")
        with pytest.raises(ValueError):
            self._plan(same_row, [same_row])

    def test_multi_loser_merge(self):
        master = _ent("m", "Katie Cooper", prov=["e1"])
        id_shaped = _ent("l1", "1716801984", prov=["e2"])
        abbreviated = _ent("l2", "K. Cooper", prov=["e3"])
        plan = self._plan(master, [id_shaped, abbreviated])
        assert plan.deprecated_entity_ids == ["l1", "l2"]
        assert sorted(plan.master_provenance) == ["e1", "e2", "e3"]
        assert len(plan.audit_rows) == 2
82
+
83
+
84
class TestFactMergePlan:
    """Master selection and provenance union for duplicate facts."""

    def test_picks_highest_confidence_master(self):
        weaker = {"id": "f1", "confidence": 0.5, "statement": "short", "provenance_event_ids": ["e1"]}
        stronger = {"id": "f2", "confidence": 0.9, "statement": "the better one", "provenance_event_ids": ["e2"]}
        plan = build_fact_merge_plan(arena="a", dup_facts=[weaker, stronger])
        assert plan["master_id"] == "f2"
        assert plan["deprecated_ids"] == ["f1"]
        merged_provenance = sorted(plan["master_provenance"])
        assert merged_provenance == ["e1", "e2"]

    def test_single_fact_no_merge(self):
        # A lone fact has nothing to merge with, so no plan is produced.
        plan = build_fact_merge_plan(arena="a", dup_facts=[{"id": "f1"}])
        assert plan is None

    def test_tie_breaks_on_statement_length(self):
        # Equal confidence; only the statement length differs.
        terse = {"id": "f1", "confidence": 0.7, "statement": "x", "provenance_event_ids": []}
        verbose = {"id": "f2", "confidence": 0.7, "statement": "longer statement", "provenance_event_ids": []}
        plan = build_fact_merge_plan(arena="a", dup_facts=[terse, verbose])
        assert plan["master_id"] == "f2"
105
+
106
+
107
class TestEvictionReceipt:
    """`build_eviction_receipt` must keep enough to undo an eviction."""

    def test_carries_full_row(self):
        evicted_row = {"id": "e1", "arena": "a", "canonical_name": "Ghost"}
        receipt = build_eviction_receipt("entity", evicted_row)
        assert receipt["node_kind"] == "entity"
        assert receipt["node_id"] == "e1"
        # The whole row is retained as the rollback payload.
        assert receipt["rollback_payload"] == evicted_row
@@ -0,0 +1,93 @@
1
+ """Unit tests for Fusion Drive salience scoring + decay (pure, no DB)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import salience as S
6
+
7
+
8
class TestBornSalience:
    """Born-salience seeding: base value, corroboration cap, junk penalties."""

    def test_base_for_single_uncorroborated_clean_node(self):
        seeded = S.born_salience(n_sources=1)
        assert seeded == S.BASE_SALIENCE

    def test_corroboration_raises_but_caps(self):
        two_sources = S.born_salience(n_sources=2)
        assert two_sources > S.BASE_SALIENCE
        many_sources = S.born_salience(n_sources=100)
        expected_cap = round(S.BASE_SALIENCE + S.CORROB_CAP, 4)
        assert many_sources == expected_cap

    def test_junk_is_born_below_decay_sweep_threshold(self):
        # Any single hard-junk flag must sink the node under the 0.3 sweep
        # line so decay targets pollution without needing a fusion match.
        hard_junk_flags = ("noise_name", "numeric_id_person", "hallucinated_email")
        for flag in hard_junk_flags:
            seeded = S.born_salience(n_sources=1, quality_flags=[flag])
            assert seeded < 0.3, flag

    def test_combined_flags_drive_to_floor(self):
        stacked_flags = ["numeric_id_person", "hallucinated_email", "ungrounded"]
        seeded = S.born_salience(n_sources=1, quality_flags=stacked_flags)
        assert seeded == S.SALIENCE_FLOOR

    def test_corroboration_cannot_rescue_hard_junk(self):
        # A heavily-corroborated numeric-ID person is still junk-leaning.
        seeded = S.born_salience(n_sources=5, quality_flags=["numeric_id_person"])
        assert seeded < S.BASE_SALIENCE

    def test_never_exceeds_ceiling_or_below_floor(self):
        heavily_corroborated = S.born_salience(n_sources=1000)
        assert heavily_corroborated <= S.SALIENCE_CEIL
        repeated_flag = ["noise_name"] * 10
        heavily_flagged = S.born_salience(n_sources=1, quality_flags=repeated_flag)
        assert heavily_flagged == S.SALIENCE_FLOOR
37
+
38
+
39
class TestHalfLife:
    """Half-life lookup by node kind and, for facts, by category."""

    def test_durable_categories_barely_decay(self):
        decision_hl = S.half_life_days("fact", "decision")
        commitment_hl = S.half_life_days("fact", "commitment")
        assert decision_hl >= 3650
        assert commitment_hl >= 3650

    def test_ephemeral_categories_fade_fast(self):
        mention_hl = S.half_life_days("fact", "mention")
        observation_hl = S.half_life_days("fact", "observation")
        assert mention_hl <= 30
        assert observation_hl <= 30

    def test_unknown_category_uses_default(self):
        fallback_hl = S.half_life_days("fact", "made_up")
        assert fallback_hl == S.FACT_HALF_LIFE_DEFAULT

    def test_entity_and_relationship_kinds(self):
        entity_hl = S.half_life_days("entity")
        relationship_hl = S.half_life_days("relationship")
        assert entity_hl == S.ENTITY_HALF_LIFE_DAYS
        assert relationship_hl == S.RELATIONSHIP_HALF_LIFE_DAYS
54
+
55
+
56
class TestDecay:
    """`decayed_salience` against hand-computed half-life expectations."""

    def test_one_half_life_halves_salience(self):
        expected = round(0.4, 4)
        decayed = S.decayed_salience(0.8, age_days=30, hl_days=30)
        assert decayed == expected

    def test_two_half_lives_quarters(self):
        expected = round(0.2, 4)
        decayed = S.decayed_salience(0.8, age_days=60, hl_days=30)
        assert decayed == expected

    def test_no_age_no_decay(self):
        decayed = S.decayed_salience(0.6, age_days=0, hl_days=30)
        assert decayed == 0.6

    def test_durable_fact_after_a_year_barely_moves(self):
        decision_hl = S.half_life_days("fact", "decision")
        decayed = S.decayed_salience(0.7, age_days=365, hl_days=decision_hl)
        assert decayed > 0.65

    def test_mention_after_three_months_is_near_floor(self):
        mention_hl = S.half_life_days("fact", "mention")  # 30d
        decayed = S.decayed_salience(0.5, age_days=90, hl_days=mention_hl)
        assert decayed < 0.1
73
+
74
+
75
class TestEvictable:
    """Eviction gate: each protective condition must veto on its own."""

    def test_low_salience_old_unreferenced_is_evictable(self):
        verdict = S.is_evictable(
            current_salience=0.02, age_days=60, referenced_by_live_node=False
        )
        assert verdict

    def test_referenced_node_survives(self):
        verdict = S.is_evictable(
            current_salience=0.0, age_days=999, referenced_by_live_node=True
        )
        assert not verdict

    def test_restricted_never_auto_evicted(self):
        verdict = S.is_evictable(
            current_salience=0.0,
            age_days=999,
            referenced_by_live_node=False,
            disclosure_class="restricted",
        )
        assert not verdict

    def test_recent_low_salience_not_yet_evictable(self):
        # Below threshold but too young — the grace window protects it.
        verdict = S.is_evictable(
            current_salience=0.01, age_days=5, referenced_by_live_node=False
        )
        assert not verdict

    def test_healthy_salience_never_evictable(self):
        verdict = S.is_evictable(
            current_salience=0.5, age_days=9999, referenced_by_live_node=False
        )
        assert not verdict
@@ -0,0 +1,12 @@
1
+ -- 004: accept 'code_reference' source events (SDK corpus ingest).
2
+ --
3
+ -- The SDK corpus module (packages/memory/src/corpus/) emits events with
4
+ -- source_kind='code_reference' (code-signature ingest, adapters.js).
5
+ -- The enum predates that feature, so those events bounced with
6
+ -- InvalidTextRepresentation and could never be stored — observed in
7
+ -- prod 2026-06-11 as persistent /extract 500s + producer retry loops.
8
+ --
9
+ -- ALTER TYPE ... ADD VALUE cannot run inside a transaction block on
10
+ -- PostgreSQL < 12, and on 12+ the new value cannot be used until the transaction commits; apply with autocommit (psql's default per-statement behaviour).
11
+ -- Applied manually to prod (pme2-org-model) on 2026-06-11.
12
+ ALTER TYPE source_kind ADD VALUE IF NOT EXISTS 'code_reference';
@@ -0,0 +1,20 @@
1
+ -- 005: index every column that references events(id).
2
+ --
3
+ -- events has four referencing constraints:
4
+ -- distillation_queue.event_id ON DELETE CASCADE
5
+ -- vector_provenance.event_id ON DELETE CASCADE
6
+ -- distillation_traces.event_id ON DELETE CASCADE
7
+ -- events.forgets (self) ON DELETE SET NULL
8
+ --
9
+ -- Postgres does NOT auto-index FK referencing columns. Without these,
10
+ -- every DELETE on events seq-scans each referencing table per deleted
11
+ -- row to enforce the constraint — the 2026-06-11 arena-scoped nuke of
12
+ -- ~70k events ran for HOURS until the missing indexes were created
13
+ -- on-box. (distillation_queue.event_id already had idx_distillation_
14
+ -- event_id from 003; listed here for completeness via IF NOT EXISTS.)
15
+ --
16
+ -- All idempotent; applied manually to prod (pme2-org-model) 2026-06-12.
17
+ CREATE INDEX IF NOT EXISTS idx_distillation_event_id ON distillation_queue(event_id);
18
+ CREATE INDEX IF NOT EXISTS idx_traces_event_id ON distillation_traces(event_id);
19
+ CREATE INDEX IF NOT EXISTS idx_vector_provenance_event_id ON vector_provenance(event_id);
20
+ CREATE INDEX IF NOT EXISTS idx_events_forgets ON events(forgets);
@@ -0,0 +1,80 @@
1
+ -- 006: Fusion Drive foundations — salience + audit/ledger tables.
2
+ --
3
+ -- See RFC-fusion-drive.md. This migration is Phase 1 (scoring only, NO
4
+ -- eviction): it adds the columns the decay pass scores against and the
5
+ -- audit/ledger tables fusion + decay write to. Nothing in this migration
6
+ -- deletes or evicts; the decay pass ships dry-run first.
7
+ --
8
+ -- KEY DESIGN CALL: salience is SEPARATE from confidence. `confidence`
9
+ -- means "how corroborated/true is this fact" and must only ever move up
10
+ -- with corroboration — decaying it would lie about truth. `salience` is
11
+ -- "retention priority" and is what decays with time + rises with access.
12
+ -- Eviction (a later phase) keys on salience, never on confidence.
13
+ --
14
+ -- All idempotent (IF NOT EXISTS) — safe to re-run.
15
+
16
+ -- Retention scoring columns. Default 0.5 (neutral); the decay pass and
17
+ -- the extractor's born-salience seeding set real values. last_accessed
18
+ -- is bumped by /search when a node is returned (keeps used memories
19
+ -- alive); NULL means never retrieved.
20
+ ALTER TABLE entities ADD COLUMN IF NOT EXISTS salience REAL NOT NULL DEFAULT 0.5;
21
+ ALTER TABLE entities ADD COLUMN IF NOT EXISTS last_accessed TIMESTAMPTZ;
22
+ ALTER TABLE facts ADD COLUMN IF NOT EXISTS salience REAL NOT NULL DEFAULT 0.5;
23
+ ALTER TABLE facts ADD COLUMN IF NOT EXISTS last_accessed TIMESTAMPTZ;
24
+ ALTER TABLE relationships ADD COLUMN IF NOT EXISTS salience REAL NOT NULL DEFAULT 0.5;
25
+ ALTER TABLE relationships ADD COLUMN IF NOT EXISTS last_accessed TIMESTAMPTZ;
26
+
27
+ -- Partial indexes for the decay sweep: it scans low-salience nodes only.
28
+ CREATE INDEX IF NOT EXISTS idx_entities_salience ON entities(arena, salience) WHERE salience < 0.3;
29
+ CREATE INDEX IF NOT EXISTS idx_facts_salience ON facts(arena, salience) WHERE salience < 0.3;
30
+ CREATE INDEX IF NOT EXISTS idx_relationships_salience ON relationships(arena, salience) WHERE salience < 0.3;
31
+
32
+ -- fact_merges — mirror of entity_merges (002) for fact fusion. Same
33
+ -- receipt + rollback substrate: the losing fact row is deleted on merge,
34
+ -- this records what absorbed it and enough to recreate it.
35
+ CREATE TABLE IF NOT EXISTS fact_merges (
36
+ id TEXT PRIMARY KEY,
37
+ arena TEXT NOT NULL,
38
+ canonical_id TEXT NOT NULL REFERENCES facts(id) ON DELETE CASCADE,
39
+ deprecated_id TEXT NOT NULL, -- no FK; row is deleted
40
+ deprecated_statement TEXT NOT NULL, -- preserve for forensics
41
+ merge_signal TEXT NOT NULL CHECK (
42
+ merge_signal IN ('exact_triple', 'statement_embedding', 'llm_adjudication')
43
+ ),
44
+ provenance_unioned INTEGER NOT NULL DEFAULT 0,
45
+ rollback_payload JSONB NOT NULL,
46
+ merged_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
47
+ );
48
+ CREATE INDEX IF NOT EXISTS idx_fact_merges_canonical ON fact_merges(canonical_id);
49
+
50
+ -- Run ledgers — observability for the Fusion Drive passes. Each pass
51
+ -- (fusion or decay) writes one row: what it scanned, what it changed,
52
+ -- and whether it was a dry-run.
53
+ CREATE TABLE IF NOT EXISTS fusion_drive_runs (
54
+ id TEXT PRIMARY KEY,
55
+ arena TEXT NOT NULL,
56
+ pass_kind TEXT NOT NULL CHECK (pass_kind IN ('fusion', 'decay')),
57
+ mode TEXT NOT NULL CHECK (mode IN ('dry_run', 'apply')),
58
+ scanned INTEGER NOT NULL DEFAULT 0,
59
+ changed INTEGER NOT NULL DEFAULT 0, -- merged (fusion) / evicted (decay)
60
+ detail JSONB NOT NULL DEFAULT '{}'::jsonb,
61
+ started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
62
+ finished_at TIMESTAMPTZ
63
+ );
64
+ CREATE INDEX IF NOT EXISTS idx_fusion_drive_runs_arena ON fusion_drive_runs(arena, started_at DESC);
65
+
66
+ -- node_evictions — rollback receipts for decay eviction. The decay pass
67
+ -- only deletes nodes its salience math already classified evictable
68
+ -- (low salience, aged past the grace floor, unreferenced, non-restricted);
69
+ -- this records the full deleted row so an eviction can be undone. Mirrors
70
+ -- the merge-audit pattern (002 / fact_merges).
71
+ CREATE TABLE IF NOT EXISTS node_evictions (
72
+ id TEXT PRIMARY KEY,
73
+ arena TEXT NOT NULL,
74
+ node_kind TEXT NOT NULL CHECK (node_kind IN ('entity', 'fact', 'relationship')),
75
+ node_id TEXT NOT NULL,
76
+ salience_at_evict REAL,
77
+ rollback_payload JSONB NOT NULL,
78
+ evicted_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
79
+ );
80
+ CREATE INDEX IF NOT EXISTS idx_node_evictions_arena ON node_evictions(arena, evicted_at DESC);
@@ -0,0 +1,113 @@
1
+ #!/usr/bin/env python3
2
+ """Fusion Drive — born-salience backfill for EXISTING rows.
3
+
4
+ Migration 006 defaults every pre-existing entity/fact to salience 0.5,
5
+ including accumulated 7B-era junk (e.g. the 87k pip-agents events). At 0.5 a
6
+ junk entity would take ~3+ years to decay under the entity half-life — so the
7
+ "junk self-evicts" cure is inert for existing data until their salience is
8
+ re-seeded from the same quality flags the worker now stamps at insert.
9
+
10
+ This pass recomputes born-salience for existing entities + facts in an arena
11
+ and writes it back, so the decay pass can act on historical pollution. It only
12
+ LOWERS salience where the quality flags fire (never raises a node above what
13
+ it already has — re-corroboration, not this pass, raises salience). Reads the
14
+ same digit-ratio / subject-undeclared signals the worker uses; deeper signals
15
+ (ungrounded vs source content) are a follow-up.
16
+
17
+ Arena-scoped (required), dry-run default, --apply to write.
18
+
19
+ Usage:
20
+ fusion_drive_born_salience_backfill.py --arena 'X' # report
21
+ fusion_drive_born_salience_backfill.py --arena 'X' --apply # write salience
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ import argparse
27
+ import os
28
+ import sys
29
+
30
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "fusion_drive"))
31
+ import salience as S # noqa: E402
32
+
33
+ try:
34
+ import psycopg
35
+ from psycopg.rows import dict_row
36
+ except ModuleNotFoundError:
37
+ print("psycopg required", file=sys.stderr)
38
+ raise
39
+
40
+
41
def _digit_ratio(s: str) -> float:
    """Return the fraction of non-whitespace characters in *s* that are digits.

    ``None``, an empty string, or an all-whitespace string yields ``0.0``.
    """
    compact = "".join((s or "").split())
    if not compact:
        return 0.0
    digit_count = 0
    for ch in compact:
        if ch.isdigit():
            digit_count += 1
    return digit_count / len(compact)
44
+
45
+
46
# Row sources and guarded write-backs, one pair per node table. They are kept
# as literals; every value is passed as a bound parameter.
_ENTITY_SELECT_SQL = (
    "SELECT id, entity_type, canonical_name, salience, provenance_event_ids "
    "FROM entities WHERE arena = %s"
)
_FACT_SELECT_SQL = (
    "SELECT id, subject_entity_id, statement, salience, provenance_event_ids "
    "FROM facts WHERE arena = %s"
)
# `salience > %s::real` re-checks the lower-only rule at write time, in the
# column's own precision, so a row that changed after it was read is never
# raised and an already-backfilled row is not rewritten. `arena = %s` keeps
# the write inside the arena the operator asked for.
_ENTITY_UPDATE_SQL = (
    "UPDATE entities SET salience = %s "
    "WHERE id = %s AND arena = %s AND salience > %s::real"
)
_FACT_UPDATE_SQL = (
    "UPDATE facts SET salience = %s "
    "WHERE id = %s AND arena = %s AND salience > %s::real"
)

# The salience column is REAL (single precision), so a value written by a
# previous run reads back slightly off (0.15 -> 0.15000000596...). Without a
# tolerance those rows would be reported and rewritten as "lowered" on every
# re-run. Tolerance assumes born_salience is rounded to 4 decimals — TODO confirm.
_SALIENCE_EPS = 1e-6


def _entity_quality_flags(row) -> list[str]:
    """Return the quality flags this pass can detect on an entity row."""
    # Numeric-ID-as-person: a person whose name is more than half digits.
    if row["entity_type"] == "person" and _digit_ratio(row["canonical_name"]) > 0.5:
        return ["numeric_id_person"]
    return []


def _fact_quality_flags(row) -> list[str]:
    """Return the quality flags this pass can detect on a fact row."""
    flags = []
    if row["subject_entity_id"] is None:
        flags.append("subject_undeclared")
    if len(row["statement"] or "") < 60:
        flags.append("low_signal")
    return flags


def _rescore_table(conn, select_sql, update_sql, flags_for, arena, apply):
    """Recompute born-salience for one table's rows in *arena*.

    Rows with no quality flags are skipped. A flagged row is counted as lowered
    only when the recomputed salience is below the stored one by more than
    ``_SALIENCE_EPS``. Writes are batched and issued only when *apply* is true.

    Returns:
        ``(scanned, lowered)`` row counts. ``lowered`` is computed from the
        values read, so it is the same in dry-run and apply mode.
    """
    scanned = lowered = 0
    pending = []
    with conn.cursor() as cur:
        cur.execute(select_sql, (arena,))
        for row in cur.fetchall():
            scanned += 1
            flags = flags_for(row)
            if not flags:
                continue
            # A row with no recorded provenance still counts as one source.
            n_sources = len(row["provenance_event_ids"] or []) or 1
            new_sal = S.born_salience(n_sources=n_sources, quality_flags=flags)
            if new_sal < row["salience"] - _SALIENCE_EPS:
                lowered += 1
                pending.append((new_sal, row["id"], arena, new_sal))
        if apply and pending:
            # One batched call instead of a fresh cursor per updated row.
            cur.executemany(update_sql, pending)
    return scanned, lowered


def main() -> int:
    """Run the born-salience backfill for one arena and print a report.

    Reads ``--arena`` (required), ``--pg-dsn`` (defaults to the ``PG_DSN``
    environment variable) and ``--apply``. Without ``--apply`` nothing is
    written; the counts report what would be lowered.

    Returns:
        ``2`` if no DSN was supplied, otherwise ``0``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--arena", required=True)
    ap.add_argument("--pg-dsn", default=os.environ.get("PG_DSN"))
    ap.add_argument("--apply", action="store_true")
    args = ap.parse_args()
    if not args.pg_dsn:
        print("PG_DSN required", file=sys.stderr)
        return 2

    with psycopg.connect(args.pg_dsn, row_factory=dict_row) as conn:
        # entities: numeric-ID-as-person born low
        ent_scanned, ent_lowered = _rescore_table(
            conn, _ENTITY_SELECT_SQL, _ENTITY_UPDATE_SQL,
            _entity_quality_flags, args.arena, args.apply,
        )
        # facts: subject-undeclared / low-signal born low
        fact_scanned, fact_lowered = _rescore_table(
            conn, _FACT_SELECT_SQL, _FACT_UPDATE_SQL,
            _fact_quality_flags, args.arena, args.apply,
        )
        if args.apply:
            conn.commit()

    mode = "APPLY" if args.apply else "DRY-RUN"
    print(f"[fusion-drive:born-salience-backfill] {mode} arena={args.arena}")
    print(f"  entities: scanned={ent_scanned} lowered={ent_lowered}")
    print(f"  facts:    scanned={fact_scanned} lowered={fact_lowered}")
    if not args.apply:
        print("  (dry-run — run --apply to write. Lowers junk salience only; never raises.)")
    return 0
110
+
111
+
112
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the exit status.
    sys.exit(main())