@pentatonic-ai/ai-agent-sdk 0.10.6 → 0.10.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/package.json +1 -1
- package/packages/memory-engine-v2/RFC-decay-and-fusion.md +185 -0
- package/packages/memory-engine-v2/RFC-fusion-drive.md +193 -0
- package/packages/memory-engine-v2/docker-compose.aws.yml +62 -1
- package/packages/memory-engine-v2/docker-compose.yml +8 -1
- package/packages/memory-engine-v2/extractor-async/confidence.py +37 -0
- package/packages/memory-engine-v2/extractor-async/test_born_salience_parity.py +35 -0
- package/packages/memory-engine-v2/extractor-async/test_guided_json_parser.py +44 -0
- package/packages/memory-engine-v2/extractor-async/worker.py +67 -7
- package/packages/memory-engine-v2/extractor-sync/server.py +6 -2
- package/packages/memory-engine-v2/extractor-sync/test_paired_extraction.py +82 -1
- package/packages/memory-engine-v2/fusion_drive/__init__.py +0 -0
- package/packages/memory-engine-v2/fusion_drive/canonical.py +94 -0
- package/packages/memory-engine-v2/fusion_drive/conftest.py +8 -0
- package/packages/memory-engine-v2/fusion_drive/merge.py +178 -0
- package/packages/memory-engine-v2/fusion_drive/salience.py +118 -0
- package/packages/memory-engine-v2/fusion_drive/test_canonical.py +76 -0
- package/packages/memory-engine-v2/fusion_drive/test_merge.py +112 -0
- package/packages/memory-engine-v2/fusion_drive/test_salience.py +93 -0
- package/packages/memory-engine-v2/org-model/migrations/004_source_kind_code_reference.sql +12 -0
- package/packages/memory-engine-v2/org-model/migrations/005_fk_indexes.sql +20 -0
- package/packages/memory-engine-v2/org-model/migrations/006_fusion_drive.sql +80 -0
- package/packages/memory-engine-v2/scripts/fusion_drive_born_salience_backfill.py +113 -0
- package/packages/memory-engine-v2/scripts/fusion_drive_decay.py +181 -0
- package/packages/memory-engine-v2/scripts/fusion_drive_fuse.py +264 -0
|
@@ -39,7 +39,7 @@ import httpx
|
|
|
39
39
|
import psycopg
|
|
40
40
|
import psycopg.rows
|
|
41
41
|
|
|
42
|
-
from confidence import corroborated_confidence
|
|
42
|
+
from confidence import born_salience, corroborated_confidence
|
|
43
43
|
from entity_id import entity_id, normalize_surface_form
|
|
44
44
|
from extraction_schema import (
|
|
45
45
|
ALLOWED_ENT_TYPES,
|
|
@@ -149,13 +149,41 @@ if DISTILL_GUIDED_PARAM_STYLE not in ("response_format", "guided_json"):
|
|
|
149
149
|
)
|
|
150
150
|
DISTILL_GUIDED_PARAM_STYLE = "response_format"
|
|
151
151
|
|
|
152
|
+
# Optional chat-template kwargs forwarded verbatim on every chat
|
|
153
|
+
# completion (vLLM extension: top-level `chat_template_kwargs`).
|
|
154
|
+
# Needed for thinking-capable teachers: Qwen3.x chat templates default
|
|
155
|
+
# enable_thinking=true, which burns the max_tokens budget on reasoning
|
|
156
|
+
# the distiller never reads. The 2026-06-11 teacher bake-off ran the
|
|
157
|
+
# Qwen3.6 lanes with {"enable_thinking": false}, so the prod swap must
|
|
158
|
+
# send the same switch for its traces to match the benchmarked
|
|
159
|
+
# distribution. Unset (default) sends nothing — the request body stays
|
|
160
|
+
# byte-identical for teachers without template switches (Qwen2.5).
|
|
161
|
+
DISTILL_CHAT_TEMPLATE_KWARGS: dict[str, Any] | None = None
|
|
162
|
+
_raw_ctk = os.environ.get("DISTILL_CHAT_TEMPLATE_KWARGS", "").strip()
|
|
163
|
+
if _raw_ctk:
|
|
164
|
+
try:
|
|
165
|
+
_parsed_ctk = json.loads(_raw_ctk)
|
|
166
|
+
if not isinstance(_parsed_ctk, dict):
|
|
167
|
+
raise ValueError("must be a JSON object")
|
|
168
|
+
DISTILL_CHAT_TEMPLATE_KWARGS = _parsed_ctk
|
|
169
|
+
except ValueError as e:
|
|
170
|
+
log.warning(f"DISTILL_CHAT_TEMPLATE_KWARGS invalid ({e}) — ignoring")
|
|
171
|
+
|
|
152
172
|
# JSON output carries structural overhead (braces, quotes, key names)
|
|
153
173
|
# the KV format doesn't, so guided mode gets its own per-event token
|
|
154
174
|
# budget. Truncation is guided mode's ONLY parse-failure mode (the
|
|
155
175
|
# schema enforcer guarantees validity up to the cut), so this errs
|
|
156
176
|
# higher than the KV 300.
|
|
177
|
+
#
|
|
178
|
+
# NOTE the budget is SHARED across the chunk (max_tokens = this × N
|
|
179
|
+
# events per request). A fully-maxed event (8 ent / 6 fct with 140-char
|
|
180
|
+
# statements / 6 rel + JSON overhead) is ~1.1k output tokens, so chunk
|
|
181
|
+
# size and this value must be chosen together against the server's
|
|
182
|
+
# max_model_len. Raised 400→900 after prod showed 15% of 5-event chunks
|
|
183
|
+
# truncating on `length` (2026-06-12); prod now runs EVENTS_PER_LLM_CALL=3
|
|
184
|
+
# so 3×900 output + ~2.1k prompt stays well inside the L40S 8192 ctx.
|
|
157
185
|
LLM_MAX_TOKENS_PER_EVENT_JSON = int(
|
|
158
|
-
os.environ.get("LLM_MAX_TOKENS_PER_EVENT_JSON", "400")
|
|
186
|
+
os.environ.get("LLM_MAX_TOKENS_PER_EVENT_JSON", "900")
|
|
159
187
|
)
|
|
160
188
|
|
|
161
189
|
|
|
@@ -667,6 +695,8 @@ def _build_request_body(user_prompt: str, n: int) -> dict[str, Any]:
|
|
|
667
695
|
else LLM_MAX_TOKENS_PER_EVENT
|
|
668
696
|
) * n,
|
|
669
697
|
}
|
|
698
|
+
if DISTILL_CHAT_TEMPLATE_KWARGS:
|
|
699
|
+
body["chat_template_kwargs"] = DISTILL_CHAT_TEMPLATE_KWARGS
|
|
670
700
|
if DISTILL_OUTPUT_MODE == "guided_json":
|
|
671
701
|
if DISTILL_GUIDED_PARAM_STYLE == "guided_json":
|
|
672
702
|
body["guided_json"] = EXTRACTION_SCHEMA
|
|
@@ -752,6 +782,15 @@ def _content_id(*parts: str) -> str:
|
|
|
752
782
|
return hashlib.sha256("\x1f".join(parts).encode()).hexdigest()[:32]
|
|
753
783
|
|
|
754
784
|
|
|
785
|
+
def _digit_ratio(s: str) -> float:
    """Return the share of digit characters among the non-whitespace
    characters of ``s``.

    An empty or whitespace-only string scores 0.0. Used to flag
    numeric-ID-as-person junk for Fusion Drive born-salience.
    """
    total_chars = 0
    digit_chars = 0
    # str.split() with no argument drops every whitespace run, so walking
    # the tokens visits exactly the non-whitespace characters.
    for token in s.split():
        total_chars += len(token)
        digit_chars += sum(1 for ch in token if ch.isdigit())
    return digit_chars / total_chars if total_chars else 0.0
|
|
792
|
+
|
|
793
|
+
|
|
755
794
|
def upsert_entities(
|
|
756
795
|
conn: psycopg.Connection,
|
|
757
796
|
arena: str,
|
|
@@ -853,12 +892,20 @@ def upsert_entities(
|
|
|
853
892
|
else:
|
|
854
893
|
# 3b. No match — insert new.
|
|
855
894
|
eid = entity_id(arena, etype, name)
|
|
895
|
+
# Fusion Drive born-salience: a numeric-ID-as-person (classic
|
|
896
|
+
# 7B junk that slips past noise_filter, e.g. "1716801984") is
|
|
897
|
+
# born near the floor so the decay pass can evict it on a short
|
|
898
|
+
# clock instead of the multi-year entity default.
|
|
899
|
+
_qflags = []
|
|
900
|
+
if etype == "person" and _digit_ratio(name) > 0.5:
|
|
901
|
+
_qflags.append("numeric_id_person")
|
|
902
|
+
_sal = born_salience(1, _qflags)
|
|
856
903
|
cur.execute(
|
|
857
904
|
"""
|
|
858
905
|
INSERT INTO entities (
|
|
859
906
|
id, arena, entity_type, canonical_name, aliases,
|
|
860
|
-
provenance_event_ids, participant_set, disclosure_class
|
|
861
|
-
) VALUES (%s, %s, %s, %s, %s, %s, %s, %s::disclosure_class)
|
|
907
|
+
provenance_event_ids, participant_set, disclosure_class, salience
|
|
908
|
+
) VALUES (%s, %s, %s, %s, %s, %s, %s, %s::disclosure_class, %s)
|
|
862
909
|
ON CONFLICT (id) DO UPDATE SET
|
|
863
910
|
aliases = (
|
|
864
911
|
SELECT ARRAY(SELECT DISTINCT UNNEST(entities.aliases || EXCLUDED.aliases))
|
|
@@ -866,11 +913,13 @@ def upsert_entities(
|
|
|
866
913
|
provenance_event_ids = (
|
|
867
914
|
SELECT ARRAY(SELECT DISTINCT UNNEST(entities.provenance_event_ids || EXCLUDED.provenance_event_ids))
|
|
868
915
|
),
|
|
916
|
+
-- re-corroboration can only RAISE salience, never lower it
|
|
917
|
+
salience = GREATEST(entities.salience, EXCLUDED.salience),
|
|
869
918
|
last_seen = NOW()
|
|
870
919
|
""",
|
|
871
920
|
(
|
|
872
921
|
eid, arena, etype, name, aliases,
|
|
873
|
-
[event_id], participant_set, disclosure_class,
|
|
922
|
+
[event_id], participant_set, disclosure_class, _sal,
|
|
874
923
|
),
|
|
875
924
|
)
|
|
876
925
|
name_to_id[name] = eid
|
|
@@ -912,15 +961,24 @@ def upsert_facts(
|
|
|
912
961
|
continue
|
|
913
962
|
subj_name = f.get("subject")
|
|
914
963
|
obj_name = f.get("object")
|
|
964
|
+
# Fusion Drive born-salience: a fact whose subject isn't among the
|
|
965
|
+
# event's declared entities (ungrounded subject) or that's barely
|
|
966
|
+
# a sentence is born low so decay can clear it. n_sources=1 here.
|
|
967
|
+
_fflags = []
|
|
968
|
+
if subj_name and not name_to_id.get(subj_name):
|
|
969
|
+
_fflags.append("subject_undeclared")
|
|
970
|
+
if len(stmt) < 60:
|
|
971
|
+
_fflags.append("low_signal")
|
|
972
|
+
_fsal = born_salience(1, _fflags)
|
|
915
973
|
cur.execute(
|
|
916
974
|
"""
|
|
917
975
|
INSERT INTO facts (
|
|
918
976
|
id, arena, category, subject_entity_id, predicate,
|
|
919
977
|
object_entity_id, statement, provenance_event_ids,
|
|
920
|
-
stage, confidence, participant_set, disclosure_class
|
|
978
|
+
stage, confidence, participant_set, disclosure_class, salience
|
|
921
979
|
) VALUES (
|
|
922
980
|
%s, %s, %s, %s, %s, %s, %s, %s,
|
|
923
|
-
'provisional'::extraction_stage, %s, %s, %s::disclosure_class
|
|
981
|
+
'provisional'::extraction_stage, %s, %s, %s::disclosure_class, %s
|
|
924
982
|
)
|
|
925
983
|
ON CONFLICT (id) DO UPDATE SET
|
|
926
984
|
provenance_event_ids = (
|
|
@@ -928,6 +986,7 @@ def upsert_facts(
|
|
|
928
986
|
facts.provenance_event_ids || EXCLUDED.provenance_event_ids
|
|
929
987
|
))
|
|
930
988
|
),
|
|
989
|
+
salience = GREATEST(facts.salience, EXCLUDED.salience),
|
|
931
990
|
-- Confidence bumps with each additional independent
|
|
932
991
|
-- source. The cardinality of the merged provenance
|
|
933
992
|
-- array IS the corroboration count, so the formula
|
|
@@ -960,6 +1019,7 @@ def upsert_facts(
|
|
|
960
1019
|
float(f.get("confidence") or corroborated_confidence(1)),
|
|
961
1020
|
participant_set,
|
|
962
1021
|
disclosure_class,
|
|
1022
|
+
_fsal,
|
|
963
1023
|
),
|
|
964
1024
|
)
|
|
965
1025
|
inserted += 1
|
|
@@ -56,11 +56,15 @@ _pool: AsyncConnectionPool | None = None
|
|
|
56
56
|
@asynccontextmanager
|
|
57
57
|
async def lifespan(app: FastAPI):
|
|
58
58
|
global _pool
|
|
59
|
+
# Default (tuple) row factory — _upsert_entities and friends index
|
|
60
|
+
# fetchone() rows positionally, matching extractor-async's worker.
|
|
61
|
+
# A dict_row factory here turns row[0] into KeyError: 0 on the
|
|
62
|
+
# entity-merge path (2026-06-11 prod incident: every extract that
|
|
63
|
+
# re-saw a known entity 500'd; only never-seen-entity events stored).
|
|
59
64
|
_pool = AsyncConnectionPool(
|
|
60
65
|
conninfo=PG_DSN,
|
|
61
66
|
min_size=8,
|
|
62
67
|
max_size=50,
|
|
63
|
-
kwargs={"row_factory": psycopg.rows.dict_row},
|
|
64
68
|
open=False,
|
|
65
69
|
)
|
|
66
70
|
await _pool.open()
|
|
@@ -89,7 +93,7 @@ class ExtractRequest(BaseModel):
|
|
|
89
93
|
clientId: str
|
|
90
94
|
userId: str | None = None
|
|
91
95
|
event_type: str = "STORE_MEMORY"
|
|
92
|
-
source_kind: str # 'chat' | 'note' | 'doc' | 'event' | 'ticket' | 'commit' | 'system' | 'agent'
|
|
96
|
+
source_kind: str # 'chat' | 'note' | 'doc' | 'event' | 'ticket' | 'commit' | 'system' | 'agent' | 'code_reference'
|
|
93
97
|
source_id: str | None = None
|
|
94
98
|
content: str
|
|
95
99
|
attributes: dict[str, Any] = {}
|
|
@@ -22,8 +22,14 @@ import pytest
|
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
# Load extractor-sync's server.py as a module so we can call its
|
|
25
|
-
# private helpers directly.
|
|
25
|
+
# private helpers directly. server.py flat-imports its siblings
|
|
26
|
+
# (entity_id) the way the container's WORKDIR layout resolves them, so
|
|
27
|
+
# this directory must be on sys.path — otherwise exec_module raises
|
|
28
|
+
# ImportError and the module-level skip below silently swallows the
|
|
29
|
+
# whole suite whenever pytest runs from the repo root.
|
|
26
30
|
_THIS = Path(__file__).resolve().parent
|
|
31
|
+
if str(_THIS) not in sys.path:
|
|
32
|
+
sys.path.insert(0, str(_THIS))
|
|
27
33
|
_SPEC = importlib.util.spec_from_file_location("extractor_sync_server",
|
|
28
34
|
_THIS / "server.py")
|
|
29
35
|
assert _SPEC and _SPEC.loader
|
|
@@ -206,3 +212,78 @@ def test_extract_event_organizer_object_form() -> None:
|
|
|
206
212
|
assert len(entities) == 1
|
|
207
213
|
assert entities[0]["canonical_name"] == "X Person"
|
|
208
214
|
assert "x@example.com" in entities[0]["aliases"]
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
# ----------------------------------------------------------------------
|
|
218
|
+
# _upsert_entities — merge path indexes rows positionally
|
|
219
|
+
# ----------------------------------------------------------------------
|
|
220
|
+
#
|
|
221
|
+
# Regression for the 2026-06-11 prod incident: the pool was configured
|
|
222
|
+
# with row_factory=dict_row while _upsert_entities did `row[0]`, so the
|
|
223
|
+
# merge branch (entity already known) raised KeyError: 0 and every
|
|
224
|
+
# extract that re-saw a known entity 500'd. Only never-seen-entity
|
|
225
|
+
# events could store. Two guards:
|
|
226
|
+
# 1. the pool must keep psycopg's default tuple row factory
|
|
227
|
+
# (matching extractor-async's worker, which also indexes
|
|
228
|
+
# positionally), and
|
|
229
|
+
# 2. the merge branch must work against tuple rows end-to-end.
|
|
230
|
+
|
|
231
|
+
import asyncio
|
|
232
|
+
import inspect
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
class _FakeCursor:
    """Minimal async stand-in for a psycopg cursor that hands back TUPLE
    rows, the shape the pool's default row factory produces.

    Every executed statement is recorded (whitespace-normalised) in
    ``executed`` so tests can inspect which SQL ran and with what
    parameters. If the pool ever grows a custom row_factory again, update
    this fake to match it or test_pool_keeps_default_tuple_row_factory
    will flag the drift.
    """

    def __init__(self, existing_id: str | None) -> None:
        # A truthy id makes fetchone() report an already-known entity.
        self._existing_id = existing_id
        self.executed: list[tuple[str, object]] = []

    async def execute(self, sql: str, params: object = None) -> None:
        # Collapse all whitespace runs so multi-line SQL can be matched
        # with a simple startswith() on the recorded text.
        normalized_sql = " ".join(sql.split())
        self.executed.append((normalized_sql, params))

    async def fetchone(self):
        if not self._existing_id:
            return None
        return (self._existing_id,)
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def _entity_stub() -> dict:
    """Build a fresh entity dict (a person named "Alice One" in arena
    "arena1") to feed into ``_upsert_entities`` in the tests below."""
    stub = dict(
        id="e_new",
        arena="arena1",
        entity_type="person",
        canonical_name="Alice One",
        aliases=["Alice One", "alice@example.com"],
        provenance_event_ids=["evt1"],
        participant_set=["arena1"],
        disclosure_class="private",
    )
    return stub
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def test_pool_keeps_default_tuple_row_factory() -> None:
    """Guard 1: the source of ``lifespan`` must not mention ``row_factory``
    at all, i.e. the pool is built with psycopg's default tuple rows."""
    lifespan_source = inspect.getsource(sync_server.lifespan)
    mentions_row_factory = "row_factory" in lifespan_source
    assert not mentions_row_factory, (
        "extractor-sync's pool must use psycopg's default tuple rows: "
        "_upsert_entities indexes fetchone() results positionally."
    )
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
def test_upsert_entities_merge_branch_with_tuple_rows() -> None:
    """Guard 2: when the lookup returns a tuple row for a known entity,
    exactly one UPDATE runs, its last parameter is the id read from
    row[0], and no INSERT is issued."""
    fake = _FakeCursor(existing_id="e_existing")
    asyncio.run(sync_server._upsert_entities(fake, [_entity_stub()]))

    update_params = [
        params for sql, params in fake.executed if sql.startswith("UPDATE entities")
    ]
    insert_count = sum(
        1 for sql, _ in fake.executed if sql.startswith("INSERT INTO entities")
    )

    assert len(update_params) == 1
    assert update_params[0][-1] == "e_existing"  # WHERE id = %s  ←  row[0]
    assert insert_count == 0
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def test_upsert_entities_insert_branch_when_no_match() -> None:
    """When the lookup finds nothing (fetchone() is None), exactly one
    INSERT runs and no UPDATE is issued."""
    fake = _FakeCursor(existing_id=None)
    asyncio.run(sync_server._upsert_entities(fake, [_entity_stub()]))

    statements = [sql for sql, _ in fake.executed]
    insert_count = sum(1 for sql in statements if sql.startswith("INSERT INTO entities"))
    update_count = sum(1 for sql in statements if sql.startswith("UPDATE entities"))

    assert insert_count == 1
    assert update_count == 0
|
|
File without changes
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Fusion Drive — scored canonical-node selection (pure functions).
|
|
2
|
+
|
|
3
|
+
When fusion decides a set of entities are the same real thing, ONE becomes
|
|
4
|
+
the master (canonical) and the rest become its aliases. entity_resolution_v2
|
|
5
|
+
(#82) currently picks "richest-row-wins", which crowns the typo "Phil
|
|
6
|
+
Mossop" over "Philip Mossop" if the typo's row happens to be richer. This
|
|
7
|
+
replaces that with a scored pick whose dominant signal is an authoritative
|
|
8
|
+
directory match — so when an org directory / CRM knows the real name, it
|
|
9
|
+
wins regardless of row richness. See RFC-fusion-drive.md A3.
|
|
10
|
+
|
|
11
|
+
Pure: all external signals (directory membership, grounding, teacher
|
|
12
|
+
recency) are passed in, so this is fully unit-testable without DB/LLM.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import re
|
|
18
|
+
from dataclasses import dataclass, field
|
|
19
|
+
|
|
20
|
+
# Scoring weights. Directory anchoring dominates everything (a known-good
|
|
21
|
+
# authoritative name beats any heuristic); penalties for ID-like / bare /
|
|
22
|
+
# hallucinated names are large enough to sink an otherwise-rich row.
|
|
23
|
+
W_DIRECTORY = 100.0 # name matches an org-directory / CRM contact
|
|
24
|
+
W_GROUNDED = 10.0 # name appears verbatim in a provenance event
|
|
25
|
+
W_TEACHER_RECENCY = 8.0 # extracted by the current (not superseded) teacher
|
|
26
|
+
W_PER_CORROBORATION = 1.0
|
|
27
|
+
CORROBORATION_CAP = 10.0
|
|
28
|
+
P_LOOKS_LIKE_ID = 60.0 # name is mostly digits (numeric-ID-as-person)
|
|
29
|
+
P_HALLUCINATED_EMAIL = 25.0
|
|
30
|
+
P_BARE_SINGLE_TOKEN = 5.0
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
class CanonicalCandidate:
    """A single member of a fuse-set together with the externally resolved
    signals that ``canonical_score`` reads.

    Only ``entity_id`` and ``canonical_name`` are required; every signal
    defaults to its neutral value (one provenance event, all flags off).
    """

    entity_id: str
    canonical_name: str
    # Corroboration count; feeds the capped per-corroboration bonus.
    n_provenance: int = 1
    # Authoritative match (HubSpot contact, etc.).
    in_directory: bool = False
    # Name appears verbatim in a provenance event.
    grounded: bool = False
    # Extracted by the current (not superseded) teacher.
    from_current_teacher: bool = False
    # Penalised when set.
    hallucinated_email: bool = False
    # Per-instance list; default_factory avoids sharing one list object.
    aliases: list[str] = field(default_factory=list)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _digit_ratio(s: str) -> float:
    """Return the fraction of digits among the non-whitespace characters
    of ``s``.

    A string with nothing left after whitespace removal returns 1.0, which
    places it above the 0.5 cut-off used by ``looks_like_id``.
    """
    compact = re.sub(r"\s+", "", s)
    if compact:
        digit_count = len([ch for ch in compact if ch.isdigit()])
        return digit_count / len(compact)
    return 1.0
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def looks_like_id(name: str) -> bool:
    """Report whether ``name`` is mostly digits.

    Mostly-digit names are extractor noise (numeric IDs mistyped as
    people), not real canonical names. "Mostly" means strictly more than
    half of the non-whitespace characters.
    """
    majority_cutoff = 0.5
    ratio = _digit_ratio(name)
    return ratio > majority_cutoff
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def is_bare_single_token(name: str) -> bool:
    """Report whether ``name`` consists of exactly one whitespace-delimited
    token; an empty or whitespace-only name has zero tokens and is False."""
    tokens = name.split()
    return len(tokens) == 1
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def canonical_score(c: CanonicalCandidate) -> float:
    """Score how suitable candidate ``c`` is as the canonical (master) node.

    Bonuses are added for a directory match, a grounded name and current
    teacher provenance, plus a capped per-corroboration bonus (negative
    provenance counts contribute nothing). Penalties are subtracted for an
    ID-like name, a hallucinated email and a bare single-token name. The
    result is rounded to 4 decimals.
    """
    bonuses = (
        (c.in_directory, W_DIRECTORY),
        (c.grounded, W_GROUNDED),
        (c.from_current_teacher, W_TEACHER_RECENCY),
    )
    total = 0.0
    for applies, weight in bonuses:
        if applies:
            total += weight

    corroborations = max(0, c.n_provenance)
    total += min(CORROBORATION_CAP, W_PER_CORROBORATION * corroborations)

    penalties = (
        (looks_like_id(c.canonical_name), P_LOOKS_LIKE_ID),
        (c.hallucinated_email, P_HALLUCINATED_EMAIL),
        (is_bare_single_token(c.canonical_name), P_BARE_SINGLE_TOKEN),
    )
    for applies, penalty in penalties:
        if applies:
            total -= penalty
    return round(total, 4)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def pick_master(candidates: list[CanonicalCandidate]) -> tuple[CanonicalCandidate, list[CanonicalCandidate]]:
    """Split ``candidates`` into ``(master, losers)``.

    The master is the candidate with the highest ``canonical_score``. Ties
    fall to the larger provenance count, then the longer canonical name,
    then the greater ``entity_id``, so the outcome never depends on input
    order. ``losers`` holds the remaining candidates in descending rank
    order; their surface forms become aliases on the master downstream.

    Raises:
        ValueError: if ``candidates`` is empty.
    """
    if not candidates:
        raise ValueError("pick_master requires at least one candidate")

    def rank(cand: CanonicalCandidate) -> tuple:
        # entity_id is the last component so a total tie still has one
        # winner; a stable sort alone would leak the input order.
        return (
            canonical_score(cand),
            cand.n_provenance,
            len(cand.canonical_name),
            cand.entity_id,
        )

    ordered = sorted(candidates, key=rank, reverse=True)
    master, *losers = ordered
    return master, losers
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""Put this package dir on sys.path so the flat sibling imports in the
|
|
2
|
+
test modules (`import salience`, `from canonical import ...`) resolve no
|
|
3
|
+
matter which directory pytest is invoked from."""
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
# Prepend (index 0) so this directory is searched before any other
# sys.path entry when the flat sibling imports are resolved.
_PACKAGE_DIR = os.path.dirname(__file__)
sys.path.insert(0, _PACKAGE_DIR)
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
"""Fusion Drive — merge & eviction PLAN builders (pure functions).
|
|
2
|
+
|
|
3
|
+
The risky part of fusion/eviction is mutating prod rows: repointing facts
|
|
4
|
+
and relationships off a deprecated entity, summing relationship weights on
|
|
5
|
+
collision, unioning aliases/provenance, and deleting the right rows with a
|
|
6
|
+
recoverable receipt. We isolate ALL of that decision-making into pure plan
|
|
7
|
+
builders here (no DB), so it's exhaustively unit-testable; the scripts then
|
|
8
|
+
execute the returned plan inside a single transaction.
|
|
9
|
+
|
|
10
|
+
A plan is a dict of explicit operations. The executor performs them in order
|
|
11
|
+
and is otherwise dumb. Nothing here touches a database or a clock.
|
|
12
|
+
See RFC-fusion-drive.md Parts A4/A5 + B3.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from dataclasses import dataclass, field
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# ── entity fusion plan ───────────────────────────────────────────────
|
|
22
|
+
@dataclass
class EntityMergePlan:
    """Explicit list of mutations that folds a set of loser entities into
    one master entity. Produced by ``build_entity_merge_plan``; contains
    only ids and plain values."""

    arena: str
    master_id: str
    # master row mutations
    master_aliases: list[str]      # master + loser surface forms, minus the master's canonical_name
    master_provenance: list[str]   # ordered union of master + loser provenance_event_ids
    # repoints: (table, column, from_id) -> master_id
    fact_subject_repoints: list[str] = field(default_factory=list)  # fact ids
    fact_object_repoints: list[str] = field(default_factory=list)
    rel_endpoint_repoints: list[str] = field(default_factory=list)  # rel ids simply repointed
    rel_collisions: list[dict] = field(default_factory=list)        # {keep, drop, summed_weight, provenance}
    deprecated_entity_ids: list[str] = field(default_factory=list)
    audit_rows: list[dict] = field(default_factory=list)            # entity_merges rows


def _union(*lists: list[str]) -> list[str]:
    """Concatenate ``lists`` keeping only the first occurrence of each
    item, in encounter order. A ``None`` (or otherwise falsy) list is
    treated as empty."""
    return list(dict.fromkeys(x for lst in lists for x in (lst or [])))


def build_entity_merge_plan(
    *,
    arena: str,
    master: dict,
    losers: list[dict],
    facts: list[dict],
    relationships: list[dict],
    merge_signal: str = "online_resolver",
) -> EntityMergePlan:
    """Compute every mutation to fold `losers` into `master`.

    master/losers: {id, canonical_name, aliases, provenance_event_ids}
    facts:         {id, subject_entity_id, object_entity_id} touching any loser
    relationships: {id, from_entity_id, to_entity_id, relationship_type,
                    weight, provenance_event_ids} touching any loser

    Relationship collisions are detected only among the rows passed in
    ``relationships``: two rows collide when they share the same
    (from, to, type) key AFTER loser endpoints are replaced by the master.
    The first row seen for a key is kept; each later one is dropped and
    produces a ``rel_collisions`` entry. Entries are CUMULATIVE per key:
    ``summed_weight`` and ``provenance`` cover the kept row plus every row
    dropped so far, so when three or more rows share a key the last entry
    carries the full total. A missing or ``None`` weight counts as 1.0.

    NOTE(review): assumes the executor applies the entries in order and
    overwrites the kept row's weight/provenance with each entry — confirm
    against the executing script.
    NOTE(review): a relationship between a loser and the master becomes a
    master→master self-loop after repointing; this builder does not
    special-case it.

    Raises:
        ValueError: if the master's id is also present among ``losers``.
    """
    master_id = master["id"]
    loser_ids = {l["id"] for l in losers}
    if master_id in loser_ids:
        raise ValueError("master cannot also be a loser")

    # master accretes every loser's surface form + provenance
    aliases = _union(
        master.get("aliases", []),
        [l["canonical_name"] for l in losers],
        *[l.get("aliases", []) for l in losers],
    )
    # don't list the master's own canonical_name as an alias of itself
    aliases = [a for a in aliases if a != master["canonical_name"]]
    provenance = _union(
        master.get("provenance_event_ids", []),
        *[l.get("provenance_event_ids", []) for l in losers],
    )

    plan = EntityMergePlan(
        arena=arena,
        master_id=master_id,
        master_aliases=aliases,
        master_provenance=provenance,
        deprecated_entity_ids=sorted(loser_ids),
    )

    # facts: repoint subject/object off losers onto master
    for f in facts:
        if f.get("subject_entity_id") in loser_ids:
            plan.fact_subject_repoints.append(f["id"])
        if f.get("object_entity_id") in loser_ids:
            plan.fact_object_repoints.append(f["id"])

    def repointed_key(r: dict) -> tuple:
        # The (from, to, type) identity the row will have once every
        # loser endpoint has been replaced by the master.
        frm = master_id if r["from_entity_id"] in loser_ids else r["from_entity_id"]
        to = master_id if r["to_entity_id"] in loser_ids else r["to_entity_id"]
        return (frm, to, r["relationship_type"])

    def weight_of(r: dict) -> float:
        # Missing key and explicit None both fall back to 1.0.
        w = r.get("weight")
        return 1.0 if w is None else w

    kept_by_key: dict[tuple, dict] = {}
    # Running totals per key, kept outside the input rows so the caller's
    # dicts are never mutated.
    weight_by_key: dict[tuple, float] = {}
    provenance_by_key: dict[tuple, list[str]] = {}
    for r in relationships:
        touches = r["from_entity_id"] in loser_ids or r["to_entity_id"] in loser_ids
        key = repointed_key(r)
        if key in kept_by_key:
            # Accumulate onto the running total rather than the kept row's
            # original values, so a 3-way (or wider) collision sums fully.
            weight_by_key[key] = weight_by_key[key] + weight_of(r)
            provenance_by_key[key] = _union(
                provenance_by_key[key], r.get("provenance_event_ids", [])
            )
            plan.rel_collisions.append({
                "keep": kept_by_key[key]["id"],
                "drop": r["id"],
                "summed_weight": round(weight_by_key[key], 4),
                "provenance": provenance_by_key[key],
            })
        else:
            kept_by_key[key] = r
            weight_by_key[key] = weight_of(r)
            provenance_by_key[key] = _union(r.get("provenance_event_ids", []))
            if touches:
                plan.rel_endpoint_repoints.append(r["id"])

    # audit + rollback receipt, one per deprecated entity
    for l in losers:
        plan.audit_rows.append({
            "arena": arena,
            "canonical_id": master_id,
            "deprecated_id": l["id"],
            "deprecated_canonical_name": l["canonical_name"],
            "deprecated_aliases": l.get("aliases", []),
            "merge_signal": merge_signal,
            "rollback_payload": l,  # full row, sufficient to recreate
        })
    return plan
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
# ── fact fusion plan (exact-triple dupes) ────────────────────────────
|
|
137
|
+
def build_fact_merge_plan(*, arena: str, dup_facts: list[dict]) -> dict | None:
    """Plan the merge of exact-triple duplicate facts.

    `dup_facts` all share (arena, subject, predicate, object). Master =
    highest confidence, then longest statement (most informative), then id.
    Others' provenance is unioned into the master; they are deleted.

    A ``confidence``, ``statement`` or ``provenance_event_ids`` value that
    is present but ``None`` is ranked/counted like a missing one (0.0, "",
    and [] respectively) instead of raising ``TypeError``.

    Returns:
        ``None`` when fewer than two facts are given (nothing to merge),
        otherwise a dict with ``arena``, ``master_id``,
        ``master_provenance``, ``deprecated_ids`` and one ``audit_rows``
        entry per deprecated fact.
    """
    if len(dup_facts) < 2:
        return None

    def rank(f: dict) -> tuple:
        # `or` folds an explicit None into the same default a missing key
        # gets, so the tuple components stay mutually comparable.
        return (f.get("confidence") or 0.0, len(f.get("statement") or ""), f["id"])

    ranked = sorted(dup_facts, key=rank, reverse=True)
    master, losers = ranked[0], ranked[1:]
    provenance = _union(master.get("provenance_event_ids", []),
                        *[l.get("provenance_event_ids", []) for l in losers])
    return {
        "arena": arena,
        "master_id": master["id"],
        "master_provenance": provenance,
        "deprecated_ids": [l["id"] for l in losers],
        "audit_rows": [{
            "arena": arena,
            "canonical_id": master["id"],
            "deprecated_id": l["id"],
            "deprecated_statement": l.get("statement", ""),
            "merge_signal": "exact_triple",
            # Size of this loser's own provenance list (None-safe, matching
            # how _union treats a None list as empty).
            "provenance_unioned": len(l.get("provenance_event_ids") or []),
            "rollback_payload": l,
        } for l in losers],
    }
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
# ── eviction plan ────────────────────────────────────────────────────
|
|
169
|
+
def build_eviction_receipt(node_kind: str, row: dict) -> dict:
    """Package a node_evictions audit row for ``row``.

    The receipt carries the node's kind, id and arena plus the row itself
    as ``rollback_payload`` (the same dict object, not a copy), which is
    enough to recreate the deleted node. No eviction decision is made
    here; the executor only deletes nodes the salience pass already
    classified evictable.

    Raises:
        KeyError: if ``row`` lacks ``"id"`` or ``"arena"``.
    """
    receipt: dict = {"node_kind": node_kind}  # entity | fact | relationship
    receipt["node_id"] = row["id"]
    receipt["arena"] = row["arena"]
    receipt["rollback_payload"] = row
    return receipt
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""Fusion Drive — salience scoring + time decay (pure functions).
|
|
2
|
+
|
|
3
|
+
Salience is a node's RETENTION PRIORITY (0..1), distinct from a fact's
|
|
4
|
+
`confidence` (which means corroboration/truth and only moves up). Salience
|
|
5
|
+
is seeded at birth from extraction-quality signals, decays with time since
|
|
6
|
+
last activity on a per-category half-life, and is reset/raised by
|
|
7
|
+
re-corroboration or retrieval. Eviction (a later phase) keys on salience.
|
|
8
|
+
|
|
9
|
+
Everything here is a pure function — no DB, no clock — so the decay pass
|
|
10
|
+
just supplies `now` and the stored fields. See RFC-fusion-drive.md Part B.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
# ── born salience ────────────────────────────────────────────────────
|
|
16
|
+
# A node is born at BASE, nudged up by corroboration and down by each
|
|
17
|
+
# extraction-quality red flag. Junk (noise name, numeric-ID person,
|
|
18
|
+
# hallucinated email, ungrounded) is born near the floor so it decays
|
|
19
|
+
# below the eviction threshold fast even with no fusion match — this is
|
|
20
|
+
# the "born-low" mechanism that lets decay target pollution rather than
|
|
21
|
+
# just age everything on the same clock.
|
|
22
|
+
BASE_SALIENCE = 0.50
|
|
23
|
+
CORROB_PER_SOURCE = 0.10 # each extra corroborating event
|
|
24
|
+
CORROB_CAP = 0.30 # max uplift from corroboration
|
|
25
|
+
SALIENCE_FLOOR = 0.01
|
|
26
|
+
SALIENCE_CEIL = 1.00
|
|
27
|
+
|
|
28
|
+
# Quality penalties (subtracted from born salience). Tuned so any single
|
|
29
|
+
# hard-junk signal alone lands a node below the nominal 0.3 decay-sweep
|
|
30
|
+
# threshold; combined signals drive it to the floor.
|
|
31
|
+
QUALITY_PENALTIES = {
|
|
32
|
+
"noise_name": 0.45, # noise_filter.is_noise_entity_name hit
|
|
33
|
+
"numeric_id_person": 0.45, # person whose name is mostly digits (ID-as-person)
|
|
34
|
+
"hallucinated_email": 0.40, # email not present in any provenance event
|
|
35
|
+
"ungrounded": 0.35, # name/statement not substring of any source
|
|
36
|
+
"subject_undeclared": 0.25, # fact subject not among the event's entities
|
|
37
|
+
"low_signal": 0.15, # extracted from <60 chars of content
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _clamp(x: float) -> float:
    """Limit ``x`` to the closed range [SALIENCE_FLOOR, SALIENCE_CEIL]."""
    capped = min(SALIENCE_CEIL, x)
    return max(SALIENCE_FLOOR, capped)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def born_salience(*, n_sources: int = 1, quality_flags: list[str] | None = None) -> float:
    """Compute the salience to stamp on a freshly extracted node.

    Starts from BASE_SALIENCE, adds a capped uplift for every source
    beyond the first, subtracts the penalty of each quality flag, then
    clamps to the salience range and rounds to 4 decimals.

    n_sources:     corroborating events (cardinality of provenance).
    quality_flags: subset of QUALITY_PENALTIES keys that fired for this
                   node; unrecognised flags subtract nothing, and a flag
                   listed twice is subtracted twice.
    """
    score = BASE_SALIENCE
    extra_sources = n_sources - 1
    if extra_sources > 0:
        score = score + min(CORROB_CAP, extra_sources * CORROB_PER_SOURCE)
    for flag in quality_flags or ():
        score = score - QUALITY_PENALTIES.get(flag, 0.0)
    return round(_clamp(score), 4)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# ── time decay ───────────────────────────────────────────────────────
|
|
60
|
+
# Half-lives in DAYS, by fact category (entities/relationships use the
|
|
61
|
+
# kind-level defaults). Durable categories (decisions, commitments) are
|
|
62
|
+
# effectively non-decaying; ephemeral ones (mentions, observations) fade
|
|
63
|
+
# in weeks. These are starting constants — Part B open question flags a
|
|
64
|
+
# calibration pass against real arenas.
|
|
65
|
+
FACT_HALF_LIFE_DAYS = {
|
|
66
|
+
"decision": 3650,
|
|
67
|
+
"commitment": 3650,
|
|
68
|
+
"state": 180,
|
|
69
|
+
"preference": 180,
|
|
70
|
+
"mention": 30,
|
|
71
|
+
"observation": 30,
|
|
72
|
+
}
|
|
73
|
+
FACT_HALF_LIFE_DEFAULT = 90
|
|
74
|
+
ENTITY_HALF_LIFE_DAYS = 365
|
|
75
|
+
RELATIONSHIP_HALF_LIFE_DAYS = 180
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def half_life_days(kind: str, category: str | None = None) -> float:
    """Look up the decay half-life, in days, for a node.

    kind ∈ {'fact','entity','relationship'}. For facts the (case-folded)
    ``category`` selects the half-life, with FACT_HALF_LIFE_DEFAULT for a
    missing or unknown category. ``category`` is ignored for the other
    kinds. An unrecognised ``kind`` also gets FACT_HALF_LIFE_DEFAULT.
    """
    if kind == "fact":
        category_key = (category or "").lower()
        return FACT_HALF_LIFE_DAYS.get(category_key, FACT_HALF_LIFE_DEFAULT)

    kind_level_defaults = {
        "entity": ENTITY_HALF_LIFE_DAYS,
        "relationship": RELATIONSHIP_HALF_LIFE_DAYS,
    }
    return kind_level_defaults.get(kind, FACT_HALF_LIFE_DEFAULT)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def decayed_salience(salience0: float, age_days: float, hl_days: float) -> float:
    """Apply exponential half-life decay to ``salience0``.

    ``age_days`` is the time since the most recent of (last_accessed,
    last_seen/asserted) — the clock resets on access or re-corroboration,
    so used/reconfirmed memories don't fade. The value halves once per
    ``hl_days`` elapsed. A non-positive age or half-life means no decay is
    applied. The result is always clamped to the salience range and
    rounded to 4 decimals.
    """
    value = salience0
    if not (age_days <= 0 or hl_days <= 0):
        half_lives_elapsed = age_days / hl_days
        value = salience0 * (0.5 ** half_lives_elapsed)
    return round(_clamp(value), 4)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
# ── eviction predicate (computed in Phase 1, ACTED ON in a later phase) ─
|
|
99
|
+
EVICT_THRESHOLD = 0.05 # salience below this → eviction candidate
|
|
100
|
+
EVICT_MIN_AGE_DAYS = 30 # ...and untouched at least this long
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def is_evictable(
    *,
    current_salience: float,
    age_days: float,
    referenced_by_live_node: bool,
    disclosure_class: str = "private",
) -> bool:
    """Decide whether a node qualifies as an eviction candidate.

    Two vetoes are checked first: restricted disclosure is never
    auto-evicted (needs sign-off), and a node still referenced by a live
    node is kept (evicting an entity that is the subject/object of a
    surviving fact would orphan the fact). Otherwise the node is evictable
    when its salience is strictly below EVICT_THRESHOLD and it has been
    untouched for at least EVICT_MIN_AGE_DAYS. Phase 1 only REPORTS this;
    the decay pass does not delete until a later flagged phase.
    """
    vetoed = disclosure_class == "restricted" or referenced_by_live_node
    if vetoed:
        return False
    if not (current_salience < EVICT_THRESHOLD):
        return False
    return age_days >= EVICT_MIN_AGE_DAYS
|