@pentatonic-ai/ai-agent-sdk 0.10.7 → 0.10.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -878,7 +878,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
878
878
  }
879
879
 
880
880
  // src/telemetry.js
881
- var VERSION = "0.10.7";
881
+ var VERSION = "0.10.8";
882
882
  var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
883
883
  function machineId() {
884
884
  const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/dist/index.js CHANGED
@@ -847,7 +847,7 @@ function fireAndForgetEmit(clientConfig, sessionOpts, messages, result, model) {
847
847
  }
848
848
 
849
849
  // src/telemetry.js
850
- var VERSION = "0.10.7";
850
+ var VERSION = "0.10.8";
851
851
  var TELEMETRY_URL = "https://sdk-telemetry.philip-134.workers.dev";
852
852
  function machineId() {
853
853
  const raw = typeof process !== "undefined" ? `${process.env?.USER || process.env?.USERNAME || "u"}:${process.platform || "x"}:${process.arch || "x"}` : "browser";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@pentatonic-ai/ai-agent-sdk",
3
- "version": "0.10.7",
3
+ "version": "0.10.8",
4
4
  "description": "TES SDK — LLM observability and lifecycle tracking via Pentatonic Thing Event System. Track token usage, tool calls, and conversations. Manage things through event-sourced lifecycle stages with AI enrichment and vector search.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.cjs",
@@ -0,0 +1,185 @@
1
+ # RFC: the Fusion Drive — v2 memory self-healing (cross-run node fusion + decay)
2
+
3
+ > **Fusion Drive** = the continuous, arena-scoped background engine that keeps the v2
4
+ > memory graph self-healing: it *fuses* duplicate/near-duplicate nodes from different
5
+ > distillation runs into a single master node (horizontal convergence) and *decays* stale,
6
+ > low-value, and junk nodes out of existence (vertical aging). Named for the drive that
7
+ > does the fusing — the decay pass rides the same engine.
8
+
9
+ **Status:** draft / spec — 2026-06-12
10
+ **Builds on:** `RFC-entity-reconciliation.md`, `scripts/entity_resolution_v2.py` (#82),
11
+ `org-model/migrations/002_entity_merges_audit.sql`.
12
+ **Motivated by:** the v2 store is currently **pure-accretion** — three independent
13
+ properties, all verified in code, mean nothing ever leaves or improves in place:
14
+
15
+ 1. **No supersede by source_id** — event identity is `sha256(arena:content)`; re-emitting
16
+ edited content appends a new event, the old persists.
17
+ 2. **Accrete-only graph writes** — entity/fact upserts are `ON CONFLICT (id) DO UPDATE`
18
+ that only merge aliases/provenance and bump confidence; a *corrected* extraction has a
19
+ different deterministic id, so it lands **beside** the polluted node, never replacing it.
20
+ 3. **No decay/eviction** — v2 has no GC; fact confidence only moves up; recency affects
21
+ search ranking only, never retention.
22
+
23
+ Net: improving the extractor/teacher only helps **new** content. Accumulated 7B-era
24
+ pollution (hallucinated emails, numeric-ID-as-person, ungrounded entities) is immortal.
25
+ `pentatonic-team` had to be **nuked** rather than re-distilled because of this; `pip-agents`
26
+ (87k events) still carries all of it.
27
+
28
+ This RFC makes the store **self-healing** via two complementary mechanisms:
29
+ **fusion** (horizontal — converge duplicate/near-duplicate nodes from different
30
+ distillation runs into one *master* node) and **decay** (vertical — age out stale and
31
+ low-value nodes). Both are gated, arena-scoped, audited, and reversible.
32
+
33
+ ---
34
+
35
+ ## Part A — Fusion: converge near-duplicate nodes into a master
36
+
37
+ Extends the existing entity-resolution machinery along five axes (A1–A5).
38
+
39
+ ### A1. Online + continuous (today it's dry-run batch)
40
+ Run fusion as a scheduled per-arena pass (systemd timer on the engine box, same pattern as
41
+ the distiller autoscaler) **and** opportunistically after a distillation run touches an
42
+ arena's entities. Keep #82's invariants: dry-run default, `--apply` gate, arena scoping,
43
+ `entity_merges` rollback. Add a `fusion_runs` ledger (arena, started_at, candidates,
44
+ merged, mode) for observability.
45
+
46
+ ### A2. Cross-distillation-run detection (the actual pollution cure)
47
+ The hard case #82 misses: 7B `"1716801984"` (numeric-ID person) and Qwen3.6 `"Katie Cooper"`
48
+ are the same real entity but share **no name similarity**, so name-blocking never compares
49
+ them. New candidate signals beyond name trigrams / embedding-on-name:
50
+
51
+ - **Shared-provenance co-reference** — two entities of the same `entity_type` citing the
52
+ same `event_id` in `provenance_event_ids`, where one is low-quality (numeric / ungrounded
53
+ / single-token). The shared event's content is the adjudication context ("does this event
54
+ support these being the same person?").
55
+ - **Context embedding** — embed the *facts/statements about* an entity (not just its name),
56
+ so name-divergent dupes still cluster. Reuses the bulk-embed lane.
57
+ - **Teacher-version signal** — provenance maps to `distillation_traces.llm_model` /
58
+ `system_prompt_hash`. Prefer the newer-teacher extraction as master; an entity *only* ever
59
+ produced by the superseded teacher and never re-confirmed by the new one is both a fusion
60
+ candidate (likely a worse rendering of a node the new teacher got right) and a decay
61
+ candidate (stale-teacher orphan — see B).
62
+
63
+ ### A3. Master-node selection — replace richest-row-wins
64
+ #82 uses "richest-row-wins", which (flagged in review) would crown the typo **"Phil Mossop"**
65
+ over **"Philip Mossop"**. Replace with a **scored** canonical pick:
66
+
67
+ | Signal | Effect |
68
+ |---|---|
69
+ | **Directory/authority anchor** (name matches an org-directory / HubSpot contact / Pip `contact_email`+`contact_name`) | dominant + → canonical |
70
+ | Grounding (name appears verbatim in a provenance event's content) | + |
71
+ | Teacher recency (newer `llm_model`) | + |
72
+ | Corroboration (`cardinality(provenance_event_ids)`) | + |
73
+ | Looks-like-ID (digit-ratio > 0.5) / hallucinated-email flag / single-token bare name | − − |
74
+
75
+ Master = highest score. Losers' surface forms become **aliases** on the master (so existing
76
+ lookups still resolve), facts/relationships are repointed, losers tombstoned in
77
+ `entity_merges` with `rollback_payload`. Directory-anchored selection is the key fix: an
78
+ authoritative source, when present, beats any heuristic.
79
+
80
+ ### A4. Fact + relationship fusion (today only entities fuse)
81
+ After entity fusion (so subject/object ids are canonical):
82
+ - **Facts** — exact `(arena, subject, predicate, object)` dupes already collapse via the
83
+ content-id. **Semantic** dupes (same assertion, different surface — "joined Acme" vs "works
84
+ at Acme") need statement-embedding similarity + LLM adjudication ("same assertion?").
85
+ Master fact = max confidence + best-grounded statement; union provenance; tombstone dupes.
86
+ New `fact_merges` audit mirroring `entity_merges`.
87
+ - **Relationships** — `(from,to,type)` already collapses; a controlled rel-type vocabulary
88
+ ("works at" ≡ "employed by") is a later optional canonicalization.
89
+
90
+ ### A5. Audit, reversibility, safety rails
91
+ Reuse `entity_merges`; add `fact_merges`. Every fusion carries `rollback_payload`.
92
+ LLM-adjudicated merges store prompt+verdict. **Disclosure rail:** never send
93
+ `disclosure_class='restricted'` rows to the LLM adjudicator (data-egress; the #82 review
94
+ item). Auto-merge only above a high confidence band; everything else → human-review queue.
95
+
96
+ ---
97
+
98
+ ## Part B — Decay: age out stale and low-value nodes
99
+
100
+ ### B1. Separate `salience` from `confidence` (important)
101
+ Do **not** decay `confidence` — it means "how corroborated/true is this", and decaying it
102
+ would lie about corroboration. Add a separate **`salience`** (retention priority) to
103
+ entities/facts/relationships. Decay acts on salience; eviction keys on salience.
104
+
105
+ `salience(t) = salience₀ · exp(−ln2 · Δt / half_life[category])`, bumped on access or
106
+ re-corroboration. Per-category half-life:
107
+
108
+ | category | half-life | rationale |
109
+ |---|---|---|
110
+ | decision, commitment | very long / ∞ | durable record |
111
+ | state, preference | medium | changes but matters |
112
+ | mention, observation | short | ephemeral |
113
+
114
+ `Δt` = time since the more recent of `last_seen` and a new `last_accessed` (bumped when a node is returned by
115
+ `/search` — cheap write, makes retrieval keep memories alive). Re-corroboration (new
116
+ provenance) resets the clock and bumps salience.
117
+
118
+ ### B2. Born-salience — the cheap partial cure
119
+ Seed `salience₀` from extraction-quality signals already computed (the trap detectors:
120
+ ungrounded, numeric-ID-person, hallucinated-email, `noise_filter` hits). **Junk is born
121
+ low**, so it decays below threshold and self-evicts fast — pollution cleans itself even
122
+ without a fusion match.
123
+
124
+ ### B3. Eviction (GC)
125
+ Node is evictable when: `salience < min_threshold` **AND** `last_seen`/`last_accessed`
126
+ older than a floor **AND** not referenced by a surviving higher-salience node (an entity
127
+ that's the subject/object of a live fact survives). Eviction = **tombstone** (soft-delete +
128
+ retention window) → hard-delete after grace, cascading to the node's Qdrant points +
129
+ `vector_provenance`. Never evict `disclosure_class='restricted'` without sign-off.
130
+
131
+ ### B4. Capacity bound (optional)
132
+ Per-arena soft cap; when exceeded, evict lowest-salience first. Backstop against unbounded
133
+ arenas.
134
+
135
+ ### B5. Cadence + safety
136
+ Background per-arena pass (timer on the engine box), dry-run → `--apply` in a quiet window,
137
+ counts logged, fully arena-scoped. Same operational shape as the distiller autoscaler /
138
+ sparse backfill.
139
+
140
+ ---
141
+
142
+ ## Part C — Ordering & how they combine
143
+
144
+ Per arena, on schedule: **(1) fusion → (2) decay.** Fusion first so a master node absorbs
145
+ its duplicates' provenance/salience *before* decay judges it (else a real node split across
146
+ two weak dupes could wrongly decay out). Then decay ages + evicts the survivors.
147
+
148
+ **This is what finally cures immortal pollution:**
149
+ - 7B polluted node *with* a correct Qwen3.6 counterpart → **fused**, correct one as master,
150
+ polluted demoted to alias / tombstoned.
151
+ - 7B pure-junk node with *no* correct counterpart (numeric-ID-person, ungrounded) → born-low
152
+ salience + no corroboration + never accessed → **decays out and is evicted**.
153
+
154
+ Together they convert the accrete-only store into a self-healing one. `pip-agents` could
155
+ then self-clean over time instead of requiring a nuke (a nuke is still faster for a one-shot
156
+ reset, but no longer the *only* path).
157
+
158
+ ---
159
+
160
+ ## Part D — Schema changes
161
+
162
+ - `entities`: `+ salience REAL DEFAULT …`, `+ last_accessed TIMESTAMPTZ`.
163
+ - `facts`: `+ salience REAL`, `+ last_accessed TIMESTAMPTZ` (keep `confidence` as-is =
164
+ corroboration truth; `asserted_at`/`expires_at` already exist).
165
+ - `relationships`: `+ salience REAL`, `+ last_accessed` (already has `weight`,
166
+ `first/last_seen`).
167
+ - new `fact_merges` audit (mirror `entity_merges` incl. `rollback_payload`).
168
+ - new `fusion_runs` + `decay_runs` ledgers for observability.
169
+ - `/search` gains a `last_accessed = NOW()` bump on returned nodes (batched).
170
+
171
+ ## Part E — Rollout (each flag-gated, arena-scoped, dry-run-first, audited)
172
+
173
+ 1. **Salience scoring only** — add columns, born-salience + decay math, NO eviction.
174
+ Observe distributions; confirm junk scores low and durable facts stay high.
175
+ 2. **Eviction** — dry-run (count what *would* evict) → `--apply` in a quiet window.
176
+ 3. **Fusion extension** — scored canonical selection (fix typo-crowning) + cross-run
177
+ detection + fact fusion, dry-run → apply.
178
+ 4. **Online/continuous** — wire fusion+decay to run after distillation per arena.
179
+
180
+ ## Open questions
181
+ - Half-life constants per category — needs a calibration pass against real arenas.
182
+ - `last_accessed` write amplification on hot search paths — batch/throttle the bump.
183
+ - Directory authority source for canonical anchoring — HubSpot contacts? a curated table?
184
+ - Interaction with the (still-open) source_id supersede mode — fusion partly subsumes it,
185
+ but explicit supersede is cheaper for known-mutable sources.
@@ -0,0 +1,193 @@
1
+ # RFC: the Fusion Drive — v2 memory self-healing (cross-run node fusion + decay)
2
+
3
+ > **Fusion Drive** = the continuous, arena-scoped background engine that keeps the v2
4
+ > memory graph self-healing: it *fuses* duplicate/near-duplicate nodes from different
5
+ > distillation runs into a single master node (horizontal convergence) and *decays* stale,
6
+ > low-value, and junk nodes out of existence (vertical aging). Named for the drive that
7
+ > does the fusing — the decay pass rides the same engine.
8
+
9
+ **Status:** spec + initial implementation (PR #92) — 2026-06-12. Implemented: salience
10
+ scoring + decay, **eviction** (`fusion_drive_decay.py --evict`, reversible via
11
+ `node_evictions`), and **fusion** of exact + cross-run-shared-provenance entity dupes and
12
+ exact-triple fact dupes (`fusion_drive_fuse.py --apply`, reversible via `entity_merges`/
13
+ `fact_merges`), with scored directory-anchored master selection. All arena-scoped,
14
+ dry-run-default, transactional, audited. TODO (later PRs): embedding-band + LLM-adjudicated
15
+ detection (in `entity_resolution_v2.py`), semantic fact fusion, authority-table wiring for
16
+ canonical scoring, continuous scheduling, and a half-life/threshold calibration pass before
17
+ `--evict` runs in prod.
18
+ **Builds on:** `RFC-entity-reconciliation.md`, `scripts/entity_resolution_v2.py` (#82),
19
+ `org-model/migrations/002_entity_merges_audit.sql`.
20
+ **Motivated by:** the v2 store is currently **pure-accretion** — three independent
21
+ properties, all verified in code, mean nothing ever leaves or improves in place:
22
+
23
+ 1. **No supersede by source_id** — event identity is `sha256(arena:content)`; re-emitting
24
+ edited content appends a new event, the old persists.
25
+ 2. **Accrete-only graph writes** — entity/fact upserts are `ON CONFLICT (id) DO UPDATE`
26
+ that only merge aliases/provenance and bump confidence; a *corrected* extraction has a
27
+ different deterministic id, so it lands **beside** the polluted node, never replacing it.
28
+ 3. **No decay/eviction** — v2 has no GC; fact confidence only moves up; recency affects
29
+ search ranking only, never retention.
30
+
31
+ Net: improving the extractor/teacher only helps **new** content. Accumulated 7B-era
32
+ pollution (hallucinated emails, numeric-ID-as-person, ungrounded entities) is immortal.
33
+ `pentatonic-team` had to be **nuked** rather than re-distilled because of this; `pip-agents`
34
+ (87k events) still carries all of it.
35
+
36
+ This RFC makes the store **self-healing** via two complementary mechanisms:
37
+ **fusion** (horizontal — converge duplicate/near-duplicate nodes from different
38
+ distillation runs into one *master* node) and **decay** (vertical — age out stale and
39
+ low-value nodes). Both are gated, arena-scoped, audited, and reversible.
40
+
41
+ ---
42
+
43
+ ## Part A — Fusion: converge near-duplicate nodes into a master
44
+
45
+ Extends the existing entity-resolution machinery along five axes (A1–A5).
46
+
47
+ ### A1. Online + continuous (today it's dry-run batch)
48
+ Run fusion as a scheduled per-arena pass (systemd timer on the engine box, same pattern as
49
+ the distiller autoscaler) **and** opportunistically after a distillation run touches an
50
+ arena's entities. Keep #82's invariants: dry-run default, `--apply` gate, arena scoping,
51
+ `entity_merges` rollback. Add a `fusion_runs` ledger (arena, started_at, candidates,
52
+ merged, mode) for observability.
53
+
54
+ ### A2. Cross-distillation-run detection (the actual pollution cure)
55
+ The hard case #82 misses: 7B `"1716801984"` (numeric-ID person) and Qwen3.6 `"Katie Cooper"`
56
+ are the same real entity but share **no name similarity**, so name-blocking never compares
57
+ them. New candidate signals beyond name trigrams / embedding-on-name:
58
+
59
+ - **Shared-provenance co-reference** — two entities of the same `entity_type` citing the
60
+ same `event_id` in `provenance_event_ids`, where one is low-quality (numeric / ungrounded
61
+ / single-token). The shared event's content is the adjudication context ("does this event
62
+ support these being the same person?").
63
+ - **Context embedding** — embed the *facts/statements about* an entity (not just its name),
64
+ so name-divergent dupes still cluster. Reuses the bulk-embed lane.
65
+ - **Teacher-version signal** — provenance maps to `distillation_traces.llm_model` /
66
+ `system_prompt_hash`. Prefer the newer-teacher extraction as master; an entity *only* ever
67
+ produced by the superseded teacher and never re-confirmed by the new one is both a fusion
68
+ candidate (likely a worse rendering of a node the new teacher got right) and a decay
69
+ candidate (stale-teacher orphan — see B).
70
+
71
+ ### A3. Master-node selection — replace richest-row-wins
72
+ #82 uses "richest-row-wins", which (flagged in review) would crown the typo **"Phil Mossop"**
73
+ over **"Philip Mossop"**. Replace with a **scored** canonical pick:
74
+
75
+ | Signal | Effect |
76
+ |---|---|
77
+ | **Directory/authority anchor** (name matches an org-directory / HubSpot contact / Pip `contact_email`+`contact_name`) | dominant + → canonical |
78
+ | Grounding (name appears verbatim in a provenance event's content) | + |
79
+ | Teacher recency (newer `llm_model`) | + |
80
+ | Corroboration (`cardinality(provenance_event_ids)`) | + |
81
+ | Looks-like-ID (digit-ratio > 0.5) / hallucinated-email flag / single-token bare name | − − |
82
+
83
+ Master = highest score. Losers' surface forms become **aliases** on the master (so existing
84
+ lookups still resolve), facts/relationships are repointed, losers tombstoned in
85
+ `entity_merges` with `rollback_payload`. Directory-anchored selection is the key fix: an
86
+ authoritative source, when present, beats any heuristic.
87
+
88
+ ### A4. Fact + relationship fusion (today only entities fuse)
89
+ After entity fusion (so subject/object ids are canonical):
90
+ - **Facts** — exact `(arena, subject, predicate, object)` dupes already collapse via the
91
+ content-id. **Semantic** dupes (same assertion, different surface — "joined Acme" vs "works
92
+ at Acme") need statement-embedding similarity + LLM adjudication ("same assertion?").
93
+ Master fact = max confidence + best-grounded statement; union provenance; tombstone dupes.
94
+ New `fact_merges` audit mirroring `entity_merges`.
95
+ - **Relationships** — `(from,to,type)` already collapses; a controlled rel-type vocabulary
96
+ ("works at" ≡ "employed by") is a later optional canonicalization.
97
+
98
+ ### A5. Audit, reversibility, safety rails
99
+ Reuse `entity_merges`; add `fact_merges`. Every fusion carries `rollback_payload`.
100
+ LLM-adjudicated merges store prompt+verdict. **Disclosure rail:** never send
101
+ `disclosure_class='restricted'` rows to the LLM adjudicator (data-egress; the #82 review
102
+ item). Auto-merge only above a high confidence band; everything else → human-review queue.
103
+
104
+ ---
105
+
106
+ ## Part B — Decay: age out stale and low-value nodes
107
+
108
+ ### B1. Separate `salience` from `confidence` (important)
109
+ Do **not** decay `confidence` — it means "how corroborated/true is this", and decaying it
110
+ would lie about corroboration. Add a separate **`salience`** (retention priority) to
111
+ entities/facts/relationships. Decay acts on salience; eviction keys on salience.
112
+
113
+ `salience(t) = salience₀ · exp(−ln2 · Δt / half_life[category])`, bumped on access or
114
+ re-corroboration. Per-category half-life:
115
+
116
+ | category | half-life | rationale |
117
+ |---|---|---|
118
+ | decision, commitment | very long / ∞ | durable record |
119
+ | state, preference | medium | changes but matters |
120
+ | mention, observation | short | ephemeral |
121
+
122
+ `Δt` = time since the more recent of `last_seen` and a new `last_accessed` (bumped when a node is returned by
123
+ `/search` — cheap write, makes retrieval keep memories alive). Re-corroboration (new
124
+ provenance) resets the clock and bumps salience.
125
+
126
+ ### B2. Born-salience — the cheap partial cure
127
+ Seed `salience₀` from extraction-quality signals already computed (the trap detectors:
128
+ ungrounded, numeric-ID-person, hallucinated-email, `noise_filter` hits). **Junk is born
129
+ low**, so it decays below threshold and self-evicts fast — pollution cleans itself even
130
+ without a fusion match.
131
+
132
+ ### B3. Eviction (GC)
133
+ Node is evictable when: `salience < min_threshold` **AND** `last_seen`/`last_accessed`
134
+ older than a floor **AND** not referenced by a surviving higher-salience node (an entity
135
+ that's the subject/object of a live fact survives). Eviction = **tombstone** (soft-delete +
136
+ retention window) → hard-delete after grace, cascading to the node's Qdrant points +
137
+ `vector_provenance`. Never evict `disclosure_class='restricted'` without sign-off.
138
+
139
+ ### B4. Capacity bound (optional)
140
+ Per-arena soft cap; when exceeded, evict lowest-salience first. Backstop against unbounded
141
+ arenas.
142
+
143
+ ### B5. Cadence + safety
144
+ Background per-arena pass (timer on the engine box), dry-run → `--apply` in a quiet window,
145
+ counts logged, fully arena-scoped. Same operational shape as the distiller autoscaler /
146
+ sparse backfill.
147
+
148
+ ---
149
+
150
+ ## Part C — Ordering & how they combine
151
+
152
+ Per arena, on schedule: **(1) fusion → (2) decay.** Fusion first so a master node absorbs
153
+ its duplicates' provenance/salience *before* decay judges it (else a real node split across
154
+ two weak dupes could wrongly decay out). Then decay ages + evicts the survivors.
155
+
156
+ **This is what finally cures immortal pollution:**
157
+ - 7B polluted node *with* a correct Qwen3.6 counterpart → **fused**, correct one as master,
158
+ polluted demoted to alias / tombstoned.
159
+ - 7B pure-junk node with *no* correct counterpart (numeric-ID-person, ungrounded) → born-low
160
+ salience + no corroboration + never accessed → **decays out and is evicted**.
161
+
162
+ Together they convert the accrete-only store into a self-healing one. `pip-agents` could
163
+ then self-clean over time instead of requiring a nuke (a nuke is still faster for a one-shot
164
+ reset, but no longer the *only* path).
165
+
166
+ ---
167
+
168
+ ## Part D — Schema changes
169
+
170
+ - `entities`: `+ salience REAL DEFAULT …`, `+ last_accessed TIMESTAMPTZ`.
171
+ - `facts`: `+ salience REAL`, `+ last_accessed TIMESTAMPTZ` (keep `confidence` as-is =
172
+ corroboration truth; `asserted_at`/`expires_at` already exist).
173
+ - `relationships`: `+ salience REAL`, `+ last_accessed` (already has `weight`,
174
+ `first/last_seen`).
175
+ - new `fact_merges` audit (mirror `entity_merges` incl. `rollback_payload`).
176
+ - new `fusion_runs` + `decay_runs` ledgers for observability.
177
+ - `/search` gains a `last_accessed = NOW()` bump on returned nodes (batched).
178
+
179
+ ## Part E — Rollout (each flag-gated, arena-scoped, dry-run-first, audited)
180
+
181
+ 1. **Salience scoring only** — add columns, born-salience + decay math, NO eviction.
182
+ Observe distributions; confirm junk scores low and durable facts stay high.
183
+ 2. **Eviction** — dry-run (count what *would* evict) → `--apply` in a quiet window.
184
+ 3. **Fusion extension** — scored canonical selection (fix typo-crowning) + cross-run
185
+ detection + fact fusion, dry-run → apply.
186
+ 4. **Online/continuous** — wire fusion+decay to run after distillation per arena.
187
+
188
+ ## Open questions
189
+ - Half-life constants per category — needs a calibration pass against real arenas.
190
+ - `last_accessed` write amplification on hot search paths — batch/throttle the bump.
191
+ - Directory authority source for canonical anchoring — HubSpot contacts? a curated table?
192
+ - Interaction with the (still-open) source_id supersede mode — fusion partly subsumes it,
193
+ but explicit supersede is cheaper for known-mutable sources.
@@ -60,3 +60,40 @@ def corroborated_confidence(n_sources: int) -> float:
60
60
  if bumped > _CONF_CAP:
61
61
  return _CONF_CAP
62
62
  return round(bumped, 2)
63
+
64
+
65
+ # ── born salience (Fusion Drive) ─────────────────────────────────────
66
+ # Retention priority a node is stamped with at extraction time, SEPARATE
67
+ # from confidence (confidence = corroboration/truth; salience = how long
68
+ # it's worth keeping). Junk — flagged by the extractor's own quality
69
+ # detectors (noise name, numeric-ID-as-person, hallucinated email,
70
+ # ungrounded, etc.) — is born near the floor so the Fusion Drive decay
71
+ # pass evicts it on a short clock instead of the multi-year default.
72
+ #
73
+ # This MUST stay output-identical to fusion_drive/salience.py:born_salience
74
+ # (the decay side uses the same scale). test_born_salience_parity.py
75
+ # guards the two against drift — same pattern as entity_id.py's parity
76
+ # test across the sync/async build contexts.
77
+ _SAL_BASE = 0.50
78
+ _SAL_CORROB_PER_SOURCE = 0.10
79
+ _SAL_CORROB_CAP = 0.30
80
+ _SAL_FLOOR = 0.01
81
+ _SAL_CEIL = 1.00
82
+ _SAL_PENALTIES = {
83
+ "noise_name": 0.45,
84
+ "numeric_id_person": 0.45,
85
+ "hallucinated_email": 0.40,
86
+ "ungrounded": 0.35,
87
+ "subject_undeclared": 0.25,
88
+ "low_signal": 0.15,
89
+ }
90
+
91
+
92
+ def born_salience(n_sources: int = 1, quality_flags: list[str] | None = None) -> float:
93
+ """Salience to stamp on a freshly extracted node. See the module note."""
94
+ s = _SAL_BASE
95
+ if n_sources > 1:
96
+ s += min(_SAL_CORROB_CAP, _SAL_CORROB_PER_SOURCE * (n_sources - 1))
97
+ for flag in quality_flags or []:
98
+ s -= _SAL_PENALTIES.get(flag, 0.0)
99
+ return round(max(_SAL_FLOOR, min(_SAL_CEIL, s)), 4)
@@ -0,0 +1,35 @@
1
+ """Parity guard: confidence.born_salience (worker, copied into the container)
2
+ must stay byte-equivalent to fusion_drive/salience.born_salience (the decay
3
+ side). Same pattern as test_entity_id_parity.py — the two live across a Docker
4
+ build-context boundary and would silently drift otherwise."""
5
+
6
+ from __future__ import annotations
7
+
8
+ import os
9
+ import sys
10
+
11
+ import confidence as worker
12
+
13
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "fusion_drive"))
14
+ import salience as drive # noqa: E402
15
+
16
+
17
+ def test_constants_match():
18
+ assert worker._SAL_BASE == drive.BASE_SALIENCE
19
+ assert worker._SAL_CORROB_PER_SOURCE == drive.CORROB_PER_SOURCE
20
+ assert worker._SAL_CORROB_CAP == drive.CORROB_CAP
21
+ assert worker._SAL_FLOOR == drive.SALIENCE_FLOOR
22
+ assert worker._SAL_CEIL == drive.SALIENCE_CEIL
23
+ assert worker._SAL_PENALTIES == drive.QUALITY_PENALTIES
24
+
25
+
26
+ def test_output_matches_across_input_matrix():
27
+ flagsets = [
28
+ None, [], ["noise_name"], ["numeric_id_person"], ["hallucinated_email"],
29
+ ["ungrounded"], ["subject_undeclared"], ["low_signal"],
30
+ ["numeric_id_person", "hallucinated_email", "ungrounded"],
31
+ ["noise_name"] * 5,
32
+ ]
33
+ for n in (1, 2, 3, 5, 100):
34
+ for flags in flagsets:
35
+ assert worker.born_salience(n, flags) == drive.born_salience(n_sources=n, quality_flags=flags), (n, flags)
@@ -39,7 +39,7 @@ import httpx
39
39
  import psycopg
40
40
  import psycopg.rows
41
41
 
42
- from confidence import corroborated_confidence
42
+ from confidence import born_salience, corroborated_confidence
43
43
  from entity_id import entity_id, normalize_surface_form
44
44
  from extraction_schema import (
45
45
  ALLOWED_ENT_TYPES,
@@ -782,6 +782,15 @@ def _content_id(*parts: str) -> str:
782
782
  return hashlib.sha256("\x1f".join(parts).encode()).hexdigest()[:32]
783
783
 
784
784
 
785
+ def _digit_ratio(s: str) -> float:
786
+ """Fraction of non-whitespace chars that are digits. Used to flag
787
+ numeric-ID-as-person junk for Fusion Drive born-salience."""
788
+ stripped = "".join(s.split())
789
+ if not stripped:
790
+ return 0.0
791
+ return sum(c.isdigit() for c in stripped) / len(stripped)
792
+
793
+
785
794
  def upsert_entities(
786
795
  conn: psycopg.Connection,
787
796
  arena: str,
@@ -883,12 +892,20 @@ def upsert_entities(
883
892
  else:
884
893
  # 3b. No match — insert new.
885
894
  eid = entity_id(arena, etype, name)
895
+ # Fusion Drive born-salience: a numeric-ID-as-person (classic
896
+ # 7B junk that slips past noise_filter, e.g. "1716801984") is
897
+ # born near the floor so the decay pass can evict it on a short
898
+ # clock instead of the multi-year entity default.
899
+ _qflags = []
900
+ if etype == "person" and _digit_ratio(name) > 0.5:
901
+ _qflags.append("numeric_id_person")
902
+ _sal = born_salience(1, _qflags)
886
903
  cur.execute(
887
904
  """
888
905
  INSERT INTO entities (
889
906
  id, arena, entity_type, canonical_name, aliases,
890
- provenance_event_ids, participant_set, disclosure_class
891
- ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s::disclosure_class)
907
+ provenance_event_ids, participant_set, disclosure_class, salience
908
+ ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s::disclosure_class, %s)
892
909
  ON CONFLICT (id) DO UPDATE SET
893
910
  aliases = (
894
911
  SELECT ARRAY(SELECT DISTINCT UNNEST(entities.aliases || EXCLUDED.aliases))
@@ -896,11 +913,13 @@ def upsert_entities(
896
913
  provenance_event_ids = (
897
914
  SELECT ARRAY(SELECT DISTINCT UNNEST(entities.provenance_event_ids || EXCLUDED.provenance_event_ids))
898
915
  ),
916
+ -- re-corroboration can only RAISE salience, never lower it
917
+ salience = GREATEST(entities.salience, EXCLUDED.salience),
899
918
  last_seen = NOW()
900
919
  """,
901
920
  (
902
921
  eid, arena, etype, name, aliases,
903
- [event_id], participant_set, disclosure_class,
922
+ [event_id], participant_set, disclosure_class, _sal,
904
923
  ),
905
924
  )
906
925
  name_to_id[name] = eid
@@ -942,15 +961,24 @@ def upsert_facts(
942
961
  continue
943
962
  subj_name = f.get("subject")
944
963
  obj_name = f.get("object")
964
+ # Fusion Drive born-salience: a fact whose subject isn't among the
965
+ # event's declared entities (ungrounded subject) or that's barely
966
+ # a sentence is born low so decay can clear it. n_sources=1 here.
967
+ _fflags = []
968
+ if subj_name and not name_to_id.get(subj_name):
969
+ _fflags.append("subject_undeclared")
970
+ if len(stmt) < 60:
971
+ _fflags.append("low_signal")
972
+ _fsal = born_salience(1, _fflags)
945
973
  cur.execute(
946
974
  """
947
975
  INSERT INTO facts (
948
976
  id, arena, category, subject_entity_id, predicate,
949
977
  object_entity_id, statement, provenance_event_ids,
950
- stage, confidence, participant_set, disclosure_class
978
+ stage, confidence, participant_set, disclosure_class, salience
951
979
  ) VALUES (
952
980
  %s, %s, %s, %s, %s, %s, %s, %s,
953
- 'provisional'::extraction_stage, %s, %s, %s::disclosure_class
981
+ 'provisional'::extraction_stage, %s, %s, %s::disclosure_class, %s
954
982
  )
955
983
  ON CONFLICT (id) DO UPDATE SET
956
984
  provenance_event_ids = (
@@ -958,6 +986,7 @@ def upsert_facts(
958
986
  facts.provenance_event_ids || EXCLUDED.provenance_event_ids
959
987
  ))
960
988
  ),
989
+ salience = GREATEST(facts.salience, EXCLUDED.salience),
961
990
  -- Confidence bumps with each additional independent
962
991
  -- source. The cardinality of the merged provenance
963
992
  -- array IS the corroboration count, so the formula
@@ -990,6 +1019,7 @@ def upsert_facts(
990
1019
  float(f.get("confidence") or corroborated_confidence(1)),
991
1020
  participant_set,
992
1021
  disclosure_class,
1022
+ _fsal,
993
1023
  ),
994
1024
  )
995
1025
  inserted += 1